Commit cfa55e62 authored by Yu Zhao's avatar Yu Zhao Committed by Peng Zhang
Browse files

mm/mglru: try to stop at high watermarks

stable inclusion
from stable-v6.6.8
commit c5f67b7e847490d41219f1dfe1b1610e88a997e4
bugzilla: https://gitee.com/openeuler/kernel/issues/I99K53

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c5f67b7e847490d41219f1dfe1b1610e88a997e4

--------------------------------

commit 5095a2b23987d3c3c47dd16b3d4080e2733b8bb9 upstream.

The initial MGLRU patchset didn't include the memcg LRU support, and it
relied on should_abort_scan(), added by commit f76c8337 ("mm:
multi-gen LRU: optimize multiple memcgs"), to "backoff to avoid
overshooting their aggregate reclaim target by too much".

Later on when the memcg LRU was added, should_abort_scan() was deemed
unnecessary, and the test results [1] showed no side effects after it was
removed by commit a579086c ("mm: multi-gen LRU: remove eviction
fairness safeguard").

However, that test used memory.reclaim, which sets nr_to_reclaim to
SWAP_CLUSTER_MAX.  So it can overshoot only by SWAP_CLUSTER_MAX-1 pages,
i.e., from nr_reclaimed=nr_to_reclaim-1 to
nr_reclaimed=nr_to_reclaim+SWAP_CLUSTER_MAX-1.  Compared with the batch
size kswapd sets to nr_to_reclaim, SWAP_CLUSTER_MAX is tiny.  Therefore
that test isn't able to reproduce the worst case scenario, i.e., kswapd
overshooting GBs on large systems and "consuming 100% CPU" (see the Closes
tag).

Bring back a simplified version of should_abort_scan() on top of the memcg
LRU, so that kswapd stops when all eligible zones are above their
respective high watermarks plus a small delta to lower the chance of
KSWAPD_HIGH_WMARK_HIT_QUICKLY.  Note that this only applies to order-0
reclaim, meaning compaction-induced reclaim can still run wild (which is a
different problem).

On Android, launching 55 apps sequentially:
           Before     After      Change
  pgpgin   838377172  802955040  -4%
  pgpgout  38037080   34336300   -10%

[1] https://lore.kernel.org/20221222041905.2431096-1-yuzhao@google.com/

Link: https://lkml.kernel.org/r/20231208061407.2125867-2-yuzhao@google.com


Fixes: a579086c ("mm: multi-gen LRU: remove eviction fairness safeguard")
Signed-off-by: default avatarYu Zhao <yuzhao@google.com>
Reported-by: default avatarCharan Teja Kalla <quic_charante@quicinc.com>
Reported-by: default avatarJaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Closes: https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@mail.gmail.com/


Tested-by: default avatarJaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Tested-by: default avatarKalesh Singh <kaleshsingh@google.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: T.J. Mercier <tjmercier@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: default avatarZhangPeng <zhangpeng362@huawei.com>
parent 684a783e
Loading
Loading
Loading
Loading
+28 −8
Original line number Diff line number Diff line
@@ -5361,20 +5361,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
	return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
{
	int i;
	enum zone_watermarks mark;

	/* don't abort memcg reclaim to ensure fairness */
	if (!root_reclaim(sc))
		return -1;
		return false;

	if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
		return true;

	/* check the order to exclude compaction-induced reclaim */
	if (!current_is_kswapd() || sc->order)
		return false;

	return max(sc->nr_to_reclaim, compact_gap(sc->order));
	mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
	       WMARK_PROMO : WMARK_HIGH;

	for (i = 0; i <= sc->reclaim_idx; i++) {
		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
		unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;

		if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
			return false;
	}

	/* kswapd should abort if all eligible zones are safe */
	return true;
}

static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	long nr_to_scan;
	unsigned long scanned = 0;
	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
	int swappiness = get_swappiness(lruvec, sc);

	/* clean file folios are more likely to exist */
@@ -5396,7 +5417,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
		if (scanned >= nr_to_scan)
			break;

		if (sc->nr_reclaimed >= nr_to_reclaim)
		if (should_abort_scan(lruvec, sc))
			break;

		cond_resched();
@@ -5457,7 +5478,6 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
	struct lru_gen_folio *lrugen;
	struct mem_cgroup *memcg;
	const struct hlist_nulls_node *pos;
	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
@@ -5490,7 +5510,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)

		rcu_read_lock();

		if (sc->nr_reclaimed >= nr_to_reclaim)
		if (should_abort_scan(lruvec, sc))
			break;
	}

@@ -5501,7 +5521,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)

	mem_cgroup_put(memcg);

	if (sc->nr_reclaimed >= nr_to_reclaim)
	if (!is_a_nulls(pos))
		return;

	/* restart if raced with lru_gen_rotate_memcg() */