Commit 7f8674f7, authored by Barry Song, committed by Liu Shixin
Browse files

mm: madvise: pageout: ignore references rather than clearing young

mainline inclusion
from mainline-v6.9-rc1
commit 2864f3d0f5831a50253befc5d4583868268b7153
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2864f3d0f5831a50253befc5d4583868268b7153

--------------------------------

While doing MADV_PAGEOUT, the current code will clear PTE young so that
vmscan won't read young flags to allow the reclamation of madvised folios
to go ahead.  It seems we can do it by directly ignoring references, thus
we can remove tlb flush in madvise and rmap overhead in vmscan.

Regarding the side effect, in the original code, if a parallel thread runs
side by side to access the madvised memory with the thread doing madvise,
folios will get a chance to be re-activated by vmscan (though the time gap
is actually quite small since checking PTEs is done immediately after
clearing PTEs young).  But with this patch, they will still be reclaimed.
But issuing PAGEOUT on memory while accessing it at the same time is
rather silly, much like a DoS.  So we probably don't need to care.  Ignoring
the new access during that quite small time gap may even be better.

For DAMON's DAMOS_PAGEOUT based on physical address region, we still keep
its behaviour as is since a physical address might be mapped by multiple
processes.  MADV_PAGEOUT based on virtual address is actually much more
aggressive on reclamation.  To leave paddr's DAMOS_PAGEOUT untouched, we simply
pass ignore_references as false in reclaim_pages().

The microbenchmark below shows a 6% reduction in the latency of
MADV_PAGEOUT:

 /*
  * Microbenchmark: map SIZE bytes of private anonymous memory, write to it
  * at PGSIZE-byte intervals, then call madvise(MADV_PAGEOUT) on the whole
  * mapping.
  */
 #define PGSIZE 4096
 main()
 {
 	int i;
 #define SIZE 512*1024*1024
 	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 	/* Store one long every PGSIZE bytes across the mapping. */
 	for (i = 0; i < SIZE/sizeof(long); i += PGSIZE / sizeof(long))
 		p[i] =  0x11;

 	/* Request pageout of the entire range that was just written. */
 	madvise(p, SIZE, MADV_PAGEOUT);
 }

w/o patch                    w/ patch
root@10:~# time ./a.out      root@10:~# time ./a.out
real	0m49.634s            real   0m46.334s
user	0m0.637s             user   0m0.648s
sys	0m47.434s            sys    0m44.265s

Link: https://lkml.kernel.org/r/20240226005739.24350-1-21cnbao@gmail.com


Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	mm/vmscan.c
	mm/etmem.c
	include/linux/swap.h
	fs/proc/etmem_swap.c
[ Adapt reclaim_pages() and reclaim_folio_list() used in etmem. ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
parent 63861ca1
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -72,7 +72,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf,
	}

	if (!list_empty(&pagelist))
		reclaim_pages(&pagelist);
		reclaim_pages(&pagelist, false);

	ret = count;
	kfree(data_ptr_res);
+3 −2
Original line number Diff line number Diff line
@@ -420,8 +420,9 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
					gfp_t gfp_mask, nodemask_t *mask);
extern unsigned int reclaim_folio_list(struct list_head *folio_list,
						struct pglist_data *pgdat);
extern unsigned long reclaim_pages(struct list_head *folio_list);
						struct pglist_data *pgdat,
						bool ignore_references);
extern unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+1 −1
Original line number Diff line number Diff line
@@ -250,7 +250,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
put_folio:
		folio_put(folio);
	}
	applied = reclaim_pages(&folio_list);
	applied = reclaim_pages(&folio_list, false);
	cond_resched();
	return applied * PAGE_SIZE;
}
+1 −1
Original line number Diff line number Diff line
@@ -248,7 +248,7 @@ int do_swapcache_reclaim(unsigned long *swapcache_watermark,
	/* Reclaim all the swapcache we have scanned */
	for_each_node_state(nid, N_MEMORY) {
		cond_resched();
		reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid));
		reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid), false);
	}

	/* Put pack all the pages that are not reclaimed by shrink_folio_list */
+4 −4
Original line number Diff line number Diff line
@@ -429,7 +429,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
			return 0;
		}

		if (pmd_young(orig_pmd)) {
		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

@@ -453,7 +453,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
			reclaim_pages(&folio_list, true);
		return 0;
	}

@@ -522,7 +522,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		if (pte_young(ptent)) {
		if (!pageout && pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
@@ -556,7 +556,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
		reclaim_pages(&folio_list, true);
	cond_resched();

	return 0;
Loading