Commit 789303ae authored by Yosry Ahmed, committed by Liu Shixin

mm: vmpressure: don't count proactive reclaim in vmpressure

mainline inclusion
from mainline-v6.0-rc1
commit 73b73bac
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGGT
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73b73bac90d97400e29e585c678c4d0ebfd2680d

--------------------------------

memory.reclaim is a cgroup v2 interface that allows users to proactively
reclaim memory from a memcg, without real memory pressure.  Reclaim
operations invoke vmpressure, which is used: (a) To notify userspace of
reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being
under memory pressure for networking (see
mem_cgroup_under_socket_pressure()).
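
As context only (not part of this patch): proactive reclaim is requested by
writing a byte count to the cgroup's memory.reclaim file. A minimal userspace
sketch, where the cgroup path and the 64 MiB request size are purely
illustrative:

  /* Minimal sketch: ask the kernel to proactively reclaim ~64 MiB from one
   * cgroup.  The cgroup path is an example, not a fixed location, and error
   * handling is kept to the bare minimum.
   */
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
  	const char *path = "/sys/fs/cgroup/example/memory.reclaim";
  	const char req[] = "67108864";	/* bytes to reclaim (64 MiB) */
  	int fd = open(path, O_WRONLY);

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}
  	/* As the documentation hunk below notes, the write fails
  	 * (errno == EAGAIN) if fewer bytes than requested could be
  	 * reclaimed. */
  	if (write(fd, req, strlen(req)) < 0)
  		perror("write");
  	close(fd);
  	return 0;
  }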

For (a), vmpressure notifications in v1 are not affected by this change
since memory.reclaim is a v2 feature.

For (b), the effects of the vmpressure signal (according to Shakeel [1])
are as follows:
1. Reducing send and receive buffers of the current socket.
2. May drop packets on the rx path.
3. May throttle current thread on the tx path.

Since proactive reclaim is invoked directly by userspace, not by memory
pressure, it makes sense not to throttle networking.  Hence, this change
makes sure that proactive reclaim caused by memory.reclaim does not
trigger vmpressure.
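
For illustration, the socket-pressure mechanism that this change bypasses
works roughly as follows. This is a simplified paraphrase of the existing
code (the helper name below is made up for the sketch), not something
introduced by this patch:

  /* Simplified paraphrase, not verbatim kernel code: on sufficiently bad
   * reclaim efficiency, vmpressure() arms a short socket-pressure window on
   * the memcg (roughly memcg->socket_pressure = jiffies + HZ), and the
   * networking layer consults that window before letting socket buffers
   * grow.  After this patch, reclaim driven by memory.reclaim never arms
   * the window, so this check keeps returning false for it.
   */
  static inline bool memcg_socket_pressure_sketch(struct mem_cgroup *memcg)
  {
  	/* still inside the pressure window set by vmpressure()? */
  	return time_before(jiffies, READ_ONCE(memcg->socket_pressure));
  }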

[1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/

[yosryahmed@google.com: update documentation]
  Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com


Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent aacf4228
Documentation/admin-guide/cgroup-v2.rst  +7 −0
@@ -1210,6 +1210,13 @@ PAGE_SIZE multiple when read back.
 	the target cgroup. If less bytes are reclaimed than the
 	specified amount, -EAGAIN is returned.
 
+	Please note that the proactive reclaim (triggered by this
+	interface) is not meant to indicate memory pressure on the
+	memory cgroup. Therefore socket memory balancing triggered by
+	the memory reclaim normally is not exercised in this case.
+	This means that the networking layer will not adapt based on
+	reclaim induced by memory.reclaim.
+
   memory.oom.group
 	A read-write single value file which exists on non-root
 	cgroups.  The default value is "0".
include/linux/swap.h  +4 −1
@@ -376,10 +376,13 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
+
+#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
+#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  bool may_swap);
+						  unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
mm/memcontrol.c  +13 −10
@@ -2397,7 +2397,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
+							gfp_mask,
+							MEMCG_RECLAIM_MAY_SWAP);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2660,7 +2661,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	enum oom_status oom_status;
 	unsigned long nr_reclaimed;
 	bool passed_oom = false;
-	bool may_swap = true;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	unsigned long pflags;
 
@@ -2679,7 +2680,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		may_swap = false;
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
 	if (batch > nr_pages) {
@@ -2715,7 +2716,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, may_swap);
+						    gfp_mask, reclaim_options);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3365,8 +3366,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 			continue;
 		}
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1,
-					GFP_KERNEL, !memsw)) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3483,7 +3484,7 @@ int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		progress = try_to_free_mem_cgroup_pages(memcg, 1,
-							GFP_KERNEL, true);
+				      GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -5230,7 +5231,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-							 GFP_KERNEL, true);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -5271,6 +5272,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int reclaim_options;
 	int err;
 
 	buf = strstrip(buf);
@@ -5282,6 +5284,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			mem_cgroup_is_root(memcg))
 		return -EINVAL;
 
+	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
 		unsigned long reclaimed;
 
@@ -5297,7 +5300,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, reclaim_options);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
@@ -6984,7 +6987,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-							  GFP_KERNEL, true))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
 				nr_reclaims--;
 			continue;
 		}
mm/vmscan.c  +17 −10
@@ -103,6 +103,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	unsigned int proactive:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM. If any cgroup is reclaimed at
@@ -2880,6 +2883,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			    sc->priority);
 
 		/* Record the group's reclaim efficiency */
-		vmpressure(sc->gfp_mask, memcg, false,
-			   sc->nr_scanned - scanned,
-			   sc->nr_reclaimed - reclaimed);
+		if (!sc->proactive)
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
@@ -3005,6 +3009,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	}
 
 	/* Record the subtree's reclaim efficiency */
-	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-		   sc->nr_scanned - nr_scanned,
-		   sc->nr_reclaimed - nr_reclaimed);
+	if (!sc->proactive)
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
@@ -3252,6 +3257,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
 	do {
-		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
-				sc->priority);
+		if (!sc->proactive)
+			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+					sc->priority);
 		sc->nr_scanned = 0;
@@ -3562,7 +3568,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool may_swap)
+					   unsigned int reclaim_options)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -3575,7 +3581,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = may_swap,
+		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put