Unverified Commit 5adc2032 authored by openeuler-ci-bot, committed by Gitee
Browse files

!3638 memcg: support OOM priority for memcg

Merge Pull Request from: @ci-robot 
 
PR sync from: Jinjiang Tu <tujinjiang@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/YMPERNQUGIRDCEWEYZRCHXDGAQHSCESX/ 
Support memcg oom priority.

Changelog:
* rename CONFIG_MEMCG_QOS to CONFIG_MEMCG_OOM_PRIORITY
* move CONFIG_MEMCG_OOM_PRIORITY to init/Kconfig
* change the CONFIG_MEMCG_OOM_PRIORITY default from y to n, and enable it in openeuler_defconfig
* add rcu_read_lock() protection for mem_cgroup_from_task()
* use a variable instead of a static key to check whether the feature is enabled
* use READ_ONCE/WRITE_ONCE protection when reading/writing oom_prio
* cleanup

Jing Xiangfeng (3):
  memcg: support priority for oom
  memcg: Add sysctl memcg_qos_enable
  memcg: enable CONFIG_MEMCG_OOM_PRIORITY by default


-- 
2.25.1
 
https://gitee.com/openeuler/kernel/issues/I8PXX8 
 
Link:https://gitee.com/openeuler/kernel/pulls/3638

 

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Lu Jialin <lujialin4@huawei.com>
Reviewed-by: Liu Chao <liuchao173@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents b608db20 9354d855
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -159,6 +159,7 @@ CONFIG_PAGE_COUNTER=y
CONFIG_MEMCG=y
CONFIG_MEMCG_V1_RECLAIM=y
CONFIG_MEMCG_MEMFS_INFO=y
CONFIG_MEMCG_OOM_PRIORITY=y
CONFIG_MEMCG_KMEM=y
CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_WRITEBACK=y
+1 −0
Original line number Diff line number Diff line
@@ -181,6 +181,7 @@ CONFIG_PAGE_COUNTER=y
CONFIG_MEMCG=y
CONFIG_MEMCG_V1_RECLAIM=y
CONFIG_MEMCG_MEMFS_INFO=y
CONFIG_MEMCG_OOM_PRIORITY=y
CONFIG_MEMCG_KMEM=y
CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_WRITEBACK=y
+25 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
struct oom_control;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -297,6 +298,12 @@ struct mem_cgroup {
	bool			tcpmem_active;
	int			tcpmem_pressure;

#ifdef CONFIG_MEMCG_OOM_PRIORITY
	/*
	 * OOM priority of this memcg. Only 0 and -1 are supported for now;
	 * other values may be added in the future.
	 */
	int	oom_prio;
#endif
#ifdef CONFIG_MEMCG_KMEM
	int kmemcg_id;
	struct obj_cgroup __rcu *objcg;
@@ -346,6 +353,20 @@ struct mem_cgroup {
	struct mem_cgroup_per_node *nodeinfo[];
};

#ifdef CONFIG_MEMCG_OOM_PRIORITY
/*
 * Values accepted for mem_cgroup::oom_prio. The negative constant is
 * parenthesised so that it always expands as a single operand.
 */
#define MEMCG_LOW_OOM_PRIORITY	(-1)
#define MEMCG_HIGH_OOM_PRIORITY	0

/*
 * Run @fn(task, @arg) over the tasks of a low-priority memcg; @arg is
 * interpreted as a struct oom_control by the implementation.
 */
bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *),
				   void *arg);
/* Log oc->chosen when it belongs to a low-priority memcg. */
void memcg_print_bad_task(struct oom_control *oc);
/* True while the OOM priority feature is switched off at runtime. */
bool memcg_oom_prio_disabled(void);
#else
/* Feature compiled out: nothing to report. */
static inline void memcg_print_bad_task(struct oom_control *oc)
{
}
#endif

/*
 * size of first charge trial.
 * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
@@ -1602,6 +1623,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
	return 0;
}

/*
 * No-op stub of memcg_print_bad_task() for the configuration branch that
 * is closed by the "#endif /* CONFIG_MEMCG *" "/" line right below, so
 * callers need no #ifdef of their own.
 */
static inline void memcg_print_bad_task(struct oom_control *oc)
{
}
#endif /* CONFIG_MEMCG */

static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
+14 −0
Original line number Diff line number Diff line
@@ -959,6 +959,20 @@ config MEMCG_MEMFS_INFO
	  through interface "memory.memfs_files_info" or printed when OOM is
	  triggered.

config MEMCG_OOM_PRIORITY
	bool "Enable Memory Cgroup OOM Priority"
	depends on MEMCG
	depends on X86 || ARM64
	default n
	help
	  Prefer to kill a process from a low-priority memcg when OOM occurs.

	  When OOM occurs, this feature first selects the low-priority memcg
	  that uses the most memory, and then kills the process that uses the
	  most memory in that memcg. If no such process is found, it falls
	  back to normal OOM processing.

	  If unsure, say "n".

config MEMCG_KMEM
	bool
	depends on MEMCG
+222 −0
Original line number Diff line number Diff line
@@ -4055,6 +4055,212 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif

#ifdef CONFIG_MEMCG_OOM_PRIORITY
/*
 * Values of sysctl_memcg_oom_prio: feature switched on / switched off.
 * NOTE(review): "PROIRITY" is a misspelling of "PRIORITY"; renaming has to
 * touch every user of these macros at once.
 */
#define ENABLE_MEMCG_OOM_PROIRITY	1
#define DISABLE_MEMCG_OOM_PROIRITY	0
/* Runtime switch for the memcg OOM priority feature; starts out disabled. */
int sysctl_memcg_oom_prio = DISABLE_MEMCG_OOM_PROIRITY;

/*
 * memcg_oom_prio_disabled - report whether memcg OOM priority is off.
 *
 * Takes a single READ_ONCE() snapshot of sysctl_memcg_oom_prio and returns
 * true when it equals DISABLE_MEMCG_OOM_PROIRITY.
 */
bool memcg_oom_prio_disabled(void)
{
	int state = READ_ONCE(sysctl_memcg_oom_prio);

	return state == DISABLE_MEMCG_OOM_PROIRITY;
}

/*
 * memcg_oom_prio_init - give a memcg the OOM priority of its parent.
 * @memcg: memcg being initialised.
 *
 * A memcg without a parent (parent_mem_cgroup() returns NULL) is left
 * untouched.
 */
static void memcg_oom_prio_init(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);

	if (parent)
		WRITE_ONCE(memcg->oom_prio, READ_ONCE(parent->oom_prio));
}

static s64 memcg_oom_prio_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (memcg_oom_prio_disabled())
		return 0;

	return READ_ONCE(memcg->oom_prio);
}

/*
 * memcg_oom_prio_write - write_s64 handler for the memcg OOM priority file.
 * @css: css of the memcg being configured.
 * @cft: unused.
 * @val: new priority; MEMCG_LOW_OOM_PRIORITY or MEMCG_HIGH_OOM_PRIORITY.
 *
 * The value is stored in @css's memcg and in every descendant visited by
 * css_for_each_descendant_pre().
 *
 * Returns 0 on success, -EACCES while the feature is disabled, and -EINVAL
 * for the root memcg or an unsupported @val.
 */
static int memcg_oom_prio_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, s64 val)
{
	struct mem_cgroup *target = mem_cgroup_from_css(css);
	struct cgroup_subsys_state *pos;

	if (memcg_oom_prio_disabled())
		return -EACCES;

	if (mem_cgroup_is_root(target))
		return -EINVAL;

	switch (val) {
	case MEMCG_LOW_OOM_PRIORITY:
	case MEMCG_HIGH_OOM_PRIORITY:
		break;
	default:
		return -EINVAL;
	}

	/* The descendant walk is done under the RCU read lock. */
	rcu_read_lock();
	css_for_each_descendant_pre(pos, &target->css)
		WRITE_ONCE(mem_cgroup_from_css(pos)->oom_prio, val);
	rcu_read_unlock();

	return 0;
}

/*
 * memcg_find_max_usage - pick the low-priority memcg with the largest usage.
 * @last: memcg to skip (already tried by the caller), or NULL.
 *
 * Walks every descendant of root_mem_cgroup under the RCU read lock and
 * ignores the root itself, @last, and any memcg whose oom_prio is
 * MEMCG_HIGH_OOM_PRIORITY. Candidates whose css reference can no longer be
 * taken are skipped as well.
 *
 * Returns the selected memcg with a css reference held, or NULL when no
 * candidate has non-zero usage. The caller must drop the reference with
 * css_put().
 */
static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last)
{
	struct mem_cgroup *iter, *max_memcg = NULL;
	struct cgroup_subsys_state *css;
	unsigned long usage, max_usage = 0;

	rcu_read_lock();
	css_for_each_descendant_pre(css, &root_mem_cgroup->css) {
		iter = mem_cgroup_from_css(css);

		if (iter == root_mem_cgroup || iter == last)
			continue;
		if (READ_ONCE(iter->oom_prio) == MEMCG_HIGH_OOM_PRIORITY)
			continue;

		usage = mem_cgroup_usage(iter, false);
		if (usage <= max_usage)
			continue;

		/*
		 * Pin the candidate before it becomes the result: once
		 * rcu_read_unlock() runs nothing else keeps it alive.
		 */
		if (!css_tryget(css))
			continue;
		if (max_memcg)
			css_put(&max_memcg->css);

		max_usage = usage;
		max_memcg = iter;
	}
	rcu_read_unlock();

	return max_memcg;
}

/*
 * memcg_oom_prio_scan_tasks - look for an OOM victim in low-priority memcgs.
 * @fn:  callback run on each candidate task; a non-zero return stops the scan.
 * @arg: opaque argument for @fn; it is also read here as a struct oom_control.
 *
 * Tries the low-priority memcg with the largest usage first and, if that
 * leaves oc->chosen unset, the next largest one. Tasks with TIF_MEMDIE set
 * are not passed to @fn.
 *
 * Returns true when oc->chosen is set after the scan. Returns false when
 * the feature is disabled, no candidate memcg exists, @fn stopped the scan
 * with a non-zero value, or no task was chosen.
 */
bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *),
				   void *arg)
{
	struct mem_cgroup *max, *last = NULL;
	struct oom_control *oc = arg;
	struct css_task_iter it;
	struct task_struct *task;
	bool found = false;
	int ret = 0;
	int attempt;

	if (memcg_oom_prio_disabled())
		return false;

	/* At most two memcgs are tried: the largest and the runner-up. */
	for (attempt = 0; attempt < 2; attempt++) {
		max = memcg_find_max_usage(last);
		if (!max)
			goto out;

		css_task_iter_start(&max->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it))) {
			if (test_tsk_thread_flag(task, TIF_MEMDIE))
				continue;

			ret = fn(task, arg);
		}
		css_task_iter_end(&it);

		/* Keep only the most recently scanned memcg pinned. */
		if (last)
			css_put(&last->css);
		last = max;

		if (ret)
			goto out;
		if (oc->chosen)
			break;
	}

	if (oc->chosen) {
		pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
				oc->chosen->pid, oc->chosen->comm);
		found = true;
	}
out:
	if (last)
		css_put(&last->css);

	return found;
}

/*
 * memcg_print_bad_task - log the OOM victim if its memcg is low priority.
 * @oc: OOM control block; oc->chosen is the selected task, if any.
 *
 * Does nothing while the feature is disabled or when oc->chosen is NULL.
 * The task's memcg is looked up and inspected under the RCU read lock.
 *
 * NOTE(review): oc->chosen is dereferenced as a task pointer here; confirm
 * that callers never pass a non-NULL sentinel value in it.
 */
void memcg_print_bad_task(struct oom_control *oc)
{
	struct mem_cgroup *memcg;

	if (memcg_oom_prio_disabled() || !oc->chosen)
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(oc->chosen);
	if (READ_ONCE(memcg->oom_prio) == MEMCG_LOW_OOM_PRIORITY)
		pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
			oc->chosen->pid, oc->chosen->comm);
	rcu_read_unlock();
}

/*
 * memcg_oom_prio_reset - put every memcg back to MEMCG_HIGH_OOM_PRIORITY.
 *
 * Visits root_mem_cgroup and all of its descendants under the RCU read
 * lock and overwrites each oom_prio. Called by the sysctl handler when the
 * feature is switched off.
 */
static void memcg_oom_prio_reset(void)
{
	struct mem_cgroup *iter;
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css_for_each_descendant_pre(css, &root_mem_cgroup->css) {
		iter = mem_cgroup_from_css(css);
		WRITE_ONCE(iter->oom_prio, MEMCG_HIGH_OOM_PRIORITY);
	}
	rcu_read_unlock();
}

static int sysctl_memcg_oom_prio_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *length, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret)
		return ret;

	if (write) {
		if (READ_ONCE(sysctl_memcg_oom_prio) == DISABLE_MEMCG_OOM_PROIRITY)
			memcg_oom_prio_reset();
	}

	return ret;
}

/*
 * Sysctl table with the single on/off switch for memcg OOM priority.
 * The entry is backed by sysctl_memcg_oom_prio and limited to the range
 * SYSCTL_ZERO..SYSCTL_ONE through extra1/extra2.
 */
static struct ctl_table memcg_oom_prio_sysctls[] = {
	{
		/*
		 * This sysctl is used to control memcg oom priority
		 * feature, the sysctl name is for compatibility.
		 */
		.procname	= "memcg_qos_enable",
		.data		= &sysctl_memcg_oom_prio,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= sysctl_memcg_oom_prio_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

/*
 * Register memcg_oom_prio_sysctls under the "vm" sysctl directory.
 * Always returns 0.
 */
static __init int memcg_oom_prio_sysctls_init(void)
{
	register_sysctl_init("vm", memcg_oom_prio_sysctls);
	return 0;
}
#else
/*
 * Stub used when CONFIG_MEMCG_OOM_PRIORITY is not set: there is no sysctl
 * to register, so just report success.
 */
static inline int memcg_oom_prio_sysctls_init(void)
{
	return 0;
}

#endif

#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
@@ -5417,6 +5623,17 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.seq_show = mem_cgroup_memfs_files_show,
	},
#endif
#ifdef CONFIG_MEMCG_OOM_PRIORITY
	{
		/*
		 * This interface is used to control the oom priority
		 * of the memcg. The interface name is for compatibility.
		 */
		.name = "qos_level",
		.read_s64 = memcg_oom_prio_read,
		.write_s64 = memcg_oom_prio_write,
	},
#endif
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
@@ -5816,6 +6033,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
				   FLUSH_TIME);
	lru_gen_online_memcg(memcg);

#ifdef CONFIG_MEMCG_OOM_PRIORITY
	memcg_oom_prio_init(memcg);
#endif

	/* Online state pins memcg ID, memcg ID pins CSS */
	refcount_set(&memcg->id.ref, 1);
	css_get(css);
@@ -7808,6 +8029,7 @@ static int __init mem_cgroup_init(void)
	}

	mem_cgroup_memfs_info_init();
	memcg_oom_prio_sysctls_init();

	return 0;
}
Loading