Commit 200321e8 authored by Ma Wupeng, committed by Wupeng Ma

mm: mem_reliable: Add limiting the usage of reliable memory

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8USBA


CVE: NA

--------------------------------

Add reliable limits for reliable user tasks, page cache and shmem.

1. For reliable user task:

There is an upper limit on all memory allocations that meet the following
conditions:
- gfp_zone(gfp & ~GFP_RELIABLE) == ZONE_MOVABLE
- gfp & GFP_RELIABLE is true

Init tasks will allocate memory from the non-mirrored region if their
allocations trigger the limit.

The limit can be set or accessed via /proc/sys/vm/task_reliable_limit.
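
A minimal sketch of the two conditions and the limit check described above
(the helper names below are made up for illustration; reliable_mem_limit_check()
is the check this patch actually adds):

  /* Illustrative only: an allocation counts against task_reliable_limit
   * when it targets ZONE_MOVABLE and is tagged GFP_RELIABLE. */
  static inline bool is_reliable_user_alloc(gfp_t gfp)
  {
          return gfp_zone(gfp & ~GFP_RELIABLE) == ZONE_MOVABLE &&
                 (gfp & GFP_RELIABLE);
  }

  /* Illustrative only: current reliable usage plus this request must
   * stay within the limit. */
  static inline bool within_task_reliable_limit(unsigned long nr_pages)
  {
          return task_reliable_used_pages() + nr_pages <=
                 (task_reliable_limit >> PAGE_SHIFT);
  }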

2. For page cache:

This limit's default value is ULONG_MAX. Users can set it to any value
between the reliable memory size currently in use and the total reliable
memory size.

Add the interface /proc/sys/vm/reliable_pagecache_max_bytes to set the
maximum size of reliable page cache; this size cannot exceed the total
reliable RAM.
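
A condensed sketch of how this limit is consulted at page cache allocation
time (it mirrors filemap_prepare_alloc() in the diff below; "gfp" here is a
stand-in for the allocation's gfp flags, not a drop-in change):

  if (percpu_counter_read_positive(&pagecache_reliable_pages) >
      (pagecache_reliable_limit >> PAGE_SHIFT))
          gfp &= ~GFP_RELIABLE;   /* limit hit: use non-mirrored memory */
  else
          gfp |= GFP_RELIABLE;    /* below the limit: prefer mirrored memory */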

The whole reliable memory feature depends on kernelcore=mirror, which in
turn depends on NUMA, so remove the redundant code for UMA.

3. For shmem:

This limit restricts the amount of mirrored memory used by shmem. Once it
is reached, a shmem allocation returns no memory if reliable fallback is
off, or falls back to the non-mirrored region if reliable fallback is on.

This limit can be set or accessed via
/proc/sys/vm/shmem_reliable_bytes_limit.
The default value of this limit is ULONG_MAX. It can be set to any value
from 0 to the total size of mirrored memory.
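
A condensed sketch of the shmem check (it mirrors mem_reliable_shmem_limit_check()
and shmem_prepare_alloc() in the diff below; the helper name here is illustrative):

  /* Illustrative only: returns false once reliable shmem usage has
   * reached the limit. */
  static inline bool shmem_reliable_alloc_allowed(gfp_t *gfp_mask)
  {
          if (percpu_counter_read_positive(&shmem_reliable_pages) <
              (shmem_reliable_limit >> PAGE_SHIFT)) {
                  *gfp_mask |= GFP_RELIABLE;
                  return true;
          }
          /* Limit hit: the caller fails with -ENOMEM, or retries from the
           * non-mirrored region when reliable fallback is enabled. */
          return false;
  }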

Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
parent e1767ef2
+68 −9
@@ -7,8 +7,10 @@
#include <linux/stddef.h>
#include <linux/gfp.h>
#include <linux/mmzone.h>
#include <linux/oom.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/percpu_counter.h>

DECLARE_STATIC_KEY_FALSE(mem_reliable);

@@ -19,6 +21,9 @@ extern bool pagecache_reliable;
extern struct percpu_counter pagecache_reliable_pages;
extern struct percpu_counter anon_reliable_pages;
extern struct percpu_counter shmem_reliable_pages;
extern unsigned long task_reliable_limit __read_mostly;
extern unsigned long shmem_reliable_limit __read_mostly;
extern unsigned long pagecache_reliable_limit __read_mostly;

void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz);
bool mem_reliable_status(void);
@@ -28,6 +33,8 @@ void reliable_lru_add(enum lru_list lru, struct folio *folio, int val);
void reliable_lru_add_batch(int zid, enum lru_list lru, int val);
bool mem_reliable_counter_initialized(void);
void reliable_report_meminfo(struct seq_file *m);
void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
				int preferred_nid, nodemask_t *nodemask);

static inline bool mem_reliable_is_enabled(void)
{
@@ -84,26 +91,53 @@ static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z)
	return false;
}

static inline void shmem_prepare_alloc(gfp_t *gfp_mask)
static inline bool mem_reliable_shmem_limit_check(void)
{
	return percpu_counter_read_positive(&shmem_reliable_pages) <
	       (shmem_reliable_limit >> PAGE_SHIFT);
}

/*
 * Check if this memory allocation for shmem is allowed.
 * Return false if the limit is triggered.
 */
static inline bool shmem_prepare_alloc(gfp_t *gfp_mask)
{
	if (!mem_reliable_is_enabled())
		return;
		return true;

	if (shmem_reliable_is_enabled())
		*gfp_mask |= GFP_RELIABLE;
	else
	if (!shmem_reliable_is_enabled()) {
		*gfp_mask &= ~GFP_RELIABLE;
		return true;
	}

	if (mem_reliable_shmem_limit_check()) {
		*gfp_mask |= GFP_RELIABLE;
		return true;
	}

	return false;
}

static inline void filemap_prepare_alloc(gfp_t *gfp_mask)
{
	s64 nr_reliable = 0;

	if (!mem_reliable_is_enabled())
		return;

	if (filemap_reliable_is_enabled())
		*gfp_mask |= GFP_RELIABLE;
	else
	if (!filemap_reliable_is_enabled()) {
		*gfp_mask &= ~GFP_RELIABLE;
		return;
	}

	nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages);
	if (nr_reliable > pagecache_reliable_limit >> PAGE_SHIFT) {
		*gfp_mask &= ~GFP_RELIABLE;
		return;
	}

	*gfp_mask |= GFP_RELIABLE;
}

static inline unsigned long task_reliable_used_pages(void)
@@ -122,6 +156,21 @@ static inline void shmem_reliable_folio_add(struct folio *folio, int nr_page)
		percpu_counter_add(&shmem_reliable_pages, nr_page);
}


static inline bool reliable_mem_limit_check(unsigned long nr_page)
{
	return (task_reliable_used_pages() + nr_page) <=
	       (task_reliable_limit >> PAGE_SHIFT);
}

static inline bool mem_reliable_should_reclaim(void)
{
	if (percpu_counter_sum_positive(&pagecache_reliable_pages) >=
	    MAX_ORDER_NR_PAGES)
		return true;

	return false;
}
#else
#define reliable_enabled 0

@@ -137,7 +186,7 @@ static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z)
}
static inline bool mem_reliable_status(void) { return false; }
static inline bool mem_reliable_hide_file(const char *name) { return false; }
static inline void shmem_prepare_alloc(gfp_t *gfp_mask) {}
static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) { return true; }
static inline void filemap_prepare_alloc(gfp_t *gfp_mask) {}
static inline void shmem_reliable_init(void) {}
static inline void reliable_lru_add(enum lru_list lru, struct folio *folio,
@@ -148,6 +197,16 @@ static inline bool mem_reliable_counter_initialized(void) { return false; }
static inline void shmem_reliable_folio_add(struct folio *folio,
					    int nr_page) {}
static inline void reliable_report_meminfo(struct seq_file *m) {}
static inline bool mem_reliable_shmem_limit_check(void) { return true; }
static inline bool reliable_mem_limit_check(unsigned long nr_page)
{
	return false;
}
static inline bool mem_reliable_should_reclaim(void) { return false; }
static inline void mem_reliable_out_of_memory(gfp_t gfp_mask,
					      unsigned int order,
					      int preferred_nid,
					      nodemask_t *nodemask) {}
#endif

#endif
+113 −0
@@ -20,6 +20,10 @@ bool pagecache_reliable __read_mostly = true;
struct percpu_counter pagecache_reliable_pages;
struct percpu_counter anon_reliable_pages;
struct percpu_counter shmem_reliable_pages;
unsigned long pagecache_reliable_limit = ULONG_MAX;
/* reliable user limit for user tasks with reliable flag */
unsigned long task_reliable_limit = ULONG_MAX;
unsigned long shmem_reliable_limit = ULONG_MAX;

bool mem_reliable_counter_initialized(void)
{
@@ -117,11 +121,103 @@ void reliable_lru_add(enum lru_list lru, struct folio *folio, int val)
	}
}

static int reliable_pagecache_max_bytes_write(struct ctl_table *table,
					      int write, void __user *buffer,
					      size_t *length, loff_t *ppos)
{
	unsigned long old_value = pagecache_reliable_limit;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (!ret && write) {
		if (pagecache_reliable_limit >
		    PAGES_TO_B(total_reliable_pages())) {
			pagecache_reliable_limit = old_value;
			return -EINVAL;
		}
	}

	return ret;
}

static int reliable_limit_handler(struct ctl_table *table, int write,
				  void __user *buffer, size_t *length,
				  loff_t *ppos)
{
	unsigned long old = task_reliable_limit;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (!ret && write) {
		if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) ||
		    task_reliable_limit <
			    (task_reliable_used_pages() << PAGE_SHIFT)) {
			task_reliable_limit = old;
			return -EINVAL;
		}
	}

	return ret;
}

#ifdef CONFIG_SHMEM
static int reliable_shmem_bytes_limit_handler(struct ctl_table *table,
					      int write, void __user *buffer,
					      size_t *length, loff_t *ppos)
{
	unsigned long *data_ptr = (unsigned long *)(table->data);
	unsigned long old = *data_ptr;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (!ret && write) {
		if (*data_ptr > PAGES_TO_B(total_reliable_pages())) {
			*data_ptr = old;
			return -EINVAL;
		}
	}

	return ret;
}
#endif

static struct ctl_table reliable_ctl_table[] = {
	{
		.procname = "reliable_pagecache_max_bytes",
		.data = &pagecache_reliable_limit,
		.maxlen = sizeof(pagecache_reliable_limit),
		.mode = 0644,
		.proc_handler = reliable_pagecache_max_bytes_write,
	},
	{
		.procname = "task_reliable_limit",
		.data = &task_reliable_limit,
		.maxlen = sizeof(task_reliable_limit),
		.mode = 0644,
		.proc_handler = reliable_limit_handler,
	},
#ifdef CONFIG_SHMEM
	{
		.procname = "shmem_reliable_bytes_limit",
		.data = &shmem_reliable_limit,
		.maxlen = sizeof(shmem_reliable_limit),
		.mode = 0644,
		.proc_handler = reliable_shmem_bytes_limit_handler,
	},
#endif
	{}
};

static int __init reliable_sysctl_init(void)
{
	if (!mem_reliable_is_enabled())
		return 0;

	if (!register_sysctl("vm", reliable_ctl_table)) {
		pr_err("register sysctl failed.");
		return -ENOMEM;
	}

	percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL);
	percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL);

@@ -167,6 +263,23 @@ void reliable_report_meminfo(struct seq_file *m)
	}
}

void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order,
				int preferred_nid, nodemask_t *nodemask)
{
	struct oom_control oc = {
		.zonelist = node_zonelist(preferred_nid, gfp),
		.nodemask = nodemask,
		.memcg = NULL,
		.gfp_mask = gfp,
		.order = order,
	};

	if (!mutex_trylock(&oom_lock))
		return;
	out_of_memory(&oc);
	mutex_unlock(&oom_lock);
}

static int __init setup_reliable_debug(char *str)
{
	if (*str++ != '=' || !*str)
+87 −0
@@ -4557,6 +4557,89 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask)
	*gfp_mask &= ~GFP_RELIABLE;
}

static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac)
{
	long nr_reclaimed = 0;

	while (nr_reclaimed < nr_pages) {
		/* try to free cache from reliable region */
		long progress = __perform_reclaim(GFP_KERNEL, 0, ac);

		nr_reclaimed += progress;
		if (progress < SWAP_CLUSTER_MAX)
			break;
	}

	return nr_reclaimed;
}

/*
 * Returning true means the memory allocation needs to be retried and the
 * GFP_RELIABLE flag must be cleared.
 */
static inline bool check_after_alloc(gfp_t *gfp, unsigned int order,
				     int preferred_nid,
				     struct alloc_context *ac,
				     struct page **_page)
{
	int retry_times = MAX_RECLAIM_RETRIES;
	int nr_pages;

	if (!mem_reliable_is_enabled())
		return false;

	if (!(*gfp & GFP_RELIABLE))
		return false;

	if (!*_page)
		goto out_retry;

	if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC)
		goto out;

	/* percpu counter is not initialized, ignore limit check */
	if (!mem_reliable_counter_initialized())
		goto out;

limit_check:
	/* user task is limited by task_reliable_limit */
	if (!reliable_mem_limit_check(1 << order))
		goto out_free_page;

	goto out;

out_free_page:
	if (mem_reliable_should_reclaim() && retry_times--) {
		nr_pages = mem_reliable_direct_reclaim(1 << order, ac);
		if (nr_pages)
			goto limit_check;
	}

	__free_pages(*_page, order);
	*_page = NULL;

out_retry:
	if (is_global_init(current)) {
		*gfp &= ~GFP_RELIABLE;
		return true;
	}

	if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE))
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;

	/* oom here */
	mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask);
out:
	return false;
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
@@ -4579,6 +4662,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,

	prepare_before_alloc(&gfp);

retry:
	/*
	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
	 * resp. GFP_NOIO which has to be inherited for all allocation requests
@@ -4621,6 +4705,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
		page = NULL;
	}

	if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page))
		goto retry;

	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);

+3 −1
@@ -1679,7 +1679,8 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
	if (err)
		goto failed;

	shmem_prepare_alloc(&gfp);
	if (!shmem_prepare_alloc(&gfp))
		goto no_mem;

	if (huge)
		folio = shmem_alloc_hugefolio(gfp, info, index);
@@ -1691,6 +1692,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
		return folio;
	}

no_mem:
	err = -ENOMEM;
	shmem_inode_unacct_blocks(inode, nr);
failed: