Commit be8d9553 authored by Jing Xiangfeng's avatar Jing Xiangfeng Committed by Jinjiang Tu
Browse files

memcg: support priority for oom

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PXX8


CVE: NA

--------------------------------------

When an OOM occurs, first try to kill a process from a low-priority memcg.
If no such process is found, fall back to the normal OOM handling.

Signed-off-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
parent a60f40a8
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
struct oom_control;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -297,6 +298,12 @@ struct mem_cgroup {
	bool			tcpmem_active;
	int			tcpmem_pressure;

#ifdef CONFIG_MEMCG_OOM_PRIORITY
	/*
	 * Only 0 and -1 are supported at present; other values may be
	 * added in the future.
	 */
	int	oom_prio;
#endif
#ifdef CONFIG_MEMCG_KMEM
	int kmemcg_id;
	struct obj_cgroup __rcu *objcg;
@@ -346,6 +353,19 @@ struct mem_cgroup {
	struct mem_cgroup_per_node *nodeinfo[];
};

#ifdef CONFIG_MEMCG_OOM_PRIORITY
/*
 * Values accepted for mem_cgroup::oom_prio. The negative constant is
 * parenthesized so it stays a single operand wherever the macro expands.
 */
#define MEMCG_LOW_OOM_PRIORITY	(-1)
#define MEMCG_HIGH_OOM_PRIORITY	0

/*
 * Evaluate the tasks of low-priority memcgs with @fn (@arg is passed
 * through). A true return tells the caller that no further, global task
 * scan is needed.
 */
bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *),
				   void *arg);

/* Log the chosen OOM victim when it belongs to a low-priority memcg. */
void memcg_print_bad_task(struct oom_control *oc);
#else
/* Nothing to report when memcg OOM priority support is compiled out. */
static inline void memcg_print_bad_task(struct oom_control *oc)
{
}
#endif

/*
 * size of first charge trial.
 * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
@@ -1602,6 +1622,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
	return 0;
}

/*
 * Empty stub: there is nothing to report in this configuration.
 * NOTE(review): judging by the closing #endif this is the !CONFIG_MEMCG
 * side of the header - confirm.
 */
static inline void memcg_print_bad_task(struct oom_control *oc)
{
}
#endif /* CONFIG_MEMCG */

static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
+14 −0
Original line number Diff line number Diff line
@@ -959,6 +959,20 @@ config MEMCG_MEMFS_INFO
	  through interface "memory.memfs_files_info" or printed when OOM is
	  triggered.

config MEMCG_OOM_PRIORITY
	bool "Enable Memory Cgroup OOM Priority"
	depends on MEMCG
	depends on X86 || ARM64
	default n
	help
	  Prefer to kill a process from a low-priority memcg when OOM occurs.

	  When OOM occurs, this feature first selects the low-priority memcg that
	  uses the most memory, and then kills the process that uses the most
	  memory in that memcg. If no such process is found, it falls back to
	  normal OOM processing.

	  If unsure, say "n".

config MEMCG_KMEM
	bool
	depends on MEMCG
+143 −0
Original line number Diff line number Diff line
@@ -4055,6 +4055,134 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif

#ifdef CONFIG_MEMCG_OOM_PRIORITY
/*
 * Seed @memcg's OOM priority with the value currently set on its parent.
 * A memcg without a parent is left untouched.
 */
static void memcg_oom_prio_init(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent_memcg = parent_mem_cgroup(memcg);

	if (parent_memcg)
		WRITE_ONCE(memcg->oom_prio, READ_ONCE(parent_memcg->oom_prio));
}

static s64 memcg_oom_prio_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return READ_ONCE(memcg->oom_prio);
}

/*
 * ->write_s64 handler: store @val as the OOM priority of every memcg
 * visited by a pre-order walk starting at the memcg behind @css.
 *
 * Returns 0 on success, or -EINVAL when the target is the root memcg or
 * @val is neither MEMCG_LOW_OOM_PRIORITY nor MEMCG_HIGH_OOM_PRIORITY.
 */
static int memcg_oom_prio_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, s64 val)
{
	struct mem_cgroup *target = mem_cgroup_from_css(css);
	struct cgroup_subsys_state *pos;

	if (mem_cgroup_is_root(target))
		return -EINVAL;

	switch (val) {
	case MEMCG_LOW_OOM_PRIORITY:
	case MEMCG_HIGH_OOM_PRIORITY:
		break;
	default:
		return -EINVAL;
	}

	/* The walk is done under the RCU read lock. */
	rcu_read_lock();
	css_for_each_descendant_pre(pos, &target->css)
		WRITE_ONCE(mem_cgroup_from_css(pos)->oom_prio, val);
	rcu_read_unlock();

	return 0;
}

/*
 * Find the low-priority memcg with the largest memory usage.
 *
 * The root memcg, memcgs whose priority is MEMCG_HIGH_OOM_PRIORITY and
 * @last (the memcg the caller already tried, may be NULL) are skipped.
 *
 * Returns the selected memcg with a css reference held, or NULL when no
 * candidate with non-zero usage could be pinned. The caller must release
 * the reference with css_put(). The reference is taken inside the RCU
 * read-side section so that the memcg cannot be freed once
 * rcu_read_unlock() has been called.
 */
static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last)
{
	struct mem_cgroup *iter, *max_memcg = NULL;
	struct cgroup_subsys_state *css;
	unsigned long usage, max_usage = 0;

	rcu_read_lock();
	css_for_each_descendant_pre(css, &root_mem_cgroup->css) {
		iter = mem_cgroup_from_css(css);

		if (iter == root_mem_cgroup || iter == last)
			continue;
		if (READ_ONCE(iter->oom_prio) == MEMCG_HIGH_OOM_PRIORITY)
			continue;

		usage = mem_cgroup_usage(iter, false);
		if (usage <= max_usage)
			continue;

		/* A css whose refcount already hit zero cannot be used. */
		if (!css_tryget(&iter->css))
			continue;

		/* Swap the pinned candidate for the new maximum. */
		if (max_memcg)
			css_put(&max_memcg->css);
		max_usage = usage;
		max_memcg = iter;
	}
	rcu_read_unlock();

	return max_memcg;
}

/*
 * Evaluate the tasks of the most memory-hungry low-priority memcg with @fn.
 * If that leaves oc->chosen empty, try once more with the next largest
 * low-priority memcg.
 *
 * @fn:  per-task callback; a non-zero return stops the scan.
 * @arg: passed to @fn; it is also read here as a struct oom_control.
 *
 * Returns true when the caller must not fall back to the global task scan:
 * either a victim was chosen from a low-priority memcg, or @fn stopped the
 * scan. Returns false when no low-priority memcg produced a victim.
 */
bool memcg_oom_prio_scan_tasks(int (*fn)(struct task_struct *, void *),
				   void *arg)
{
	struct mem_cgroup *max, *last = NULL;
	struct oom_control *oc = arg;
	struct css_task_iter it;
	struct task_struct *task;
	int ret = 0;
	bool retry = true;
	bool done = false;

retry:
	max = memcg_find_max_usage(last);
	if (!max)
		goto out;

	css_task_iter_start(&max->css, 0, &it);
	while (!ret && (task = css_task_iter_next(&it))) {
		if (test_tsk_thread_flag(task, TIF_MEMDIE))
			continue;

		ret = fn(task, arg);
	}
	css_task_iter_end(&it);

	if (ret) {
		/*
		 * NOTE(review): assumes @fn follows the oom_evaluate_task()
		 * convention where a non-zero return aborts victim selection
		 * and leaves a sentinel, not a task pointer, in oc->chosen -
		 * confirm against mm/oom_kill.c. Restarting a global scan in
		 * that state would dereference the sentinel, so report the
		 * scan as finished and do not touch oc->chosen here.
		 */
		done = true;
		goto out_put;
	}

	if (!oc->chosen && retry) {
		/*
		 * Keep the first memcg pinned while it is used as the
		 * "already tried" marker, so its address cannot be reused.
		 */
		last = max;
		retry = false;
		goto retry;
	}

	if (oc->chosen) {
		pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
				oc->chosen->pid, oc->chosen->comm);
		done = true;
	}

out_put:
	css_put(&max->css);
out:
	if (last)
		css_put(&last->css);

	return done;
}

void memcg_print_bad_task(struct oom_control *oc)
{
	if (oc->chosen) {
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_from_task(oc->chosen);
		if (READ_ONCE(memcg->oom_prio) == MEMCG_LOW_OOM_PRIORITY)
			pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
				oc->chosen->pid, oc->chosen->comm);
		rcu_read_unlock();
	}
}
#endif

#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
@@ -5417,6 +5545,17 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.seq_show = mem_cgroup_memfs_files_show,
	},
#endif
#ifdef CONFIG_MEMCG_OOM_PRIORITY
	{
		/*
		 * This interface is used to control the oom priority
		 * of the memcg. The interface name is for compatibility.
		 */
		.name = "qos_level",
		.read_s64 = memcg_oom_prio_read,
		.write_s64 = memcg_oom_prio_write,
	},
#endif
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
@@ -5816,6 +5955,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
				   FLUSH_TIME);
	lru_gen_online_memcg(memcg);

#ifdef CONFIG_MEMCG_OOM_PRIORITY
	memcg_oom_prio_init(memcg);
#endif

	/* Online state pins memcg ID, memcg ID pins CSS */
	refcount_set(&memcg->id.ref, 1);
	css_get(css);
+53 −5
Original line number Diff line number Diff line
@@ -306,6 +306,48 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
	return CONSTRAINT_NONE;
}

#ifdef CONFIG_MEMCG_OOM_PRIORITY
/**
 * We choose the task in low-priority memcg firstly. For the same state, we
 * choose the task with the highest number of 'points'.
 */
static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
			long points)
{
	struct mem_cgroup *cur_memcg;
	struct mem_cgroup *oc_memcg;
	int cur_memcg_prio, oc_memcg_prio;

	if (points == LONG_MIN)
		return true;

	if (!oc->chosen)
		return false;

	rcu_read_lock();
	oc_memcg = mem_cgroup_from_task(oc->chosen);
	cur_memcg = mem_cgroup_from_task(task);
	oc_memcg_prio = READ_ONCE(oc_memcg->oom_prio);
	cur_memcg_prio = READ_ONCE(cur_memcg->oom_prio);
	rcu_read_unlock();

	if (cur_memcg_prio == oc_memcg_prio)
		return points < oc->chosen_points;

	/* if oc is low-priority, so skip the task */
	if (oc_memcg_prio == MEMCG_LOW_OOM_PRIORITY)
		return true;

	return false;
}
#else
/*
 * Without memcg OOM priority support only the score matters: skip @task
 * when it scores LONG_MIN or fewer points than the current choice.
 */
static inline bool oom_next_task(struct task_struct *task,
				struct oom_control *oc, long points)
{
	if (points == LONG_MIN)
		return true;

	return points < oc->chosen_points;
}
#endif

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
@@ -340,7 +382,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
	}

	points = oom_badness(task, oc->totalpages);
	if (points == LONG_MIN || points < oc->chosen_points)
	if (oom_next_task(task, oc, points))
		goto next;

select:
@@ -366,11 +408,16 @@ static void select_bad_process(struct oom_control *oc)
{
	oc->chosen_points = LONG_MIN;

	if (is_memcg_oom(oc))
	if (is_memcg_oom(oc)) {
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		memcg_print_bad_task(oc);
	} else {
		struct task_struct *p;

#ifdef CONFIG_MEMCG_OOM_PRIORITY
		if (memcg_oom_prio_scan_tasks(oom_evaluate_task, oc))
			return;
#endif
		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
@@ -426,9 +473,10 @@ static void dump_tasks(struct oom_control *oc)
	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");

	if (is_memcg_oom(oc))
	if (is_memcg_oom(oc)) {
		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
	else {
		memcg_print_bad_task(oc);
	} else {
		struct task_struct *p;

		rcu_read_lock();