!11561 mm: support large folio swap-out and swap-in for shmem (c6c933e9) · Commits · EulixOS / Software / Kernel

include/linux/swap.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -507,9 +507,9 @@ extern swp_entry_t get_swap_page_of_type(int);
		extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order,
		int type);
		extern int add_swap_count_continuation(swp_entry_t, gfp_t);
		extern void swap_shmem_alloc(swp_entry_t);
		extern void swap_shmem_alloc(swp_entry_t, int);
		extern int swap_duplicate(swp_entry_t);
		extern int swapcache_prepare(swp_entry_t);
		extern int swapcache_prepare(swp_entry_t entry, int nr);
		extern void swap_free_nr(swp_entry_t entry, int nr_pages);
		extern void swapcache_free_entries(swp_entry_t *entries, int n);
		extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
		@@ -583,7 +583,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
		return 0;
		}

		static inline void swap_shmem_alloc(swp_entry_t swp)
		static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
		{
		}

		@@ -592,7 +592,7 @@ static inline int swap_duplicate(swp_entry_t swp)
		return 0;
		}

		static inline int swapcache_prepare(swp_entry_t swp)
		static inline int swapcache_prepare(swp_entry_t swp, int nr)
		{
		return 0;
		}

include/linux/writeback.h

+3 −1

Original line number	Diff line number	Diff line
		@@ -90,9 +90,11 @@ struct writeback_control {
		size_t wb_tcand_bytes; /* bytes written by this candidate */
		#endif

		KABI_RESERVE(1)
		/* Target list for splitting a large folio */
		KABI_USE(1, struct list_head *list)
		KABI_RESERVE(2)
		KABI_RESERVE(3)

		};

		static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)

mm/filemap.c

+27 −14

Original line number	Diff line number	Diff line
		@@ -2048,17 +2048,20 @@ unsigned find_get_entries(struct address_space mapping, pgoff_t start,
		if (!folio_batch_add(fbatch, folio))
		break;
		}
		rcu_read_unlock();

		if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		unsigned long nr;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio))
		nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
		else
		nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
		*start = round_down(indices[idx] + nr, nr);
		}
		rcu_read_unlock();

		return folio_batch_count(fbatch);
		}

		@@ -2090,10 +2093,17 @@ unsigned find_lock_entries(struct address_space mapping, pgoff_t start,

		rcu_read_lock();
		while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		unsigned long base;
		unsigned long nr;

		if (!xa_is_value(folio)) {
		if (folio->index < *start)
		nr = folio_nr_pages(folio);
		base = folio->index;
		/* Omit large folio which begins before the start */
		if (base < *start)
		goto put;
		if (folio_next_index(folio) - 1 > end)
		/* Omit large folio which extends beyond the end */
		if (base + nr - 1 > end)
		goto put;
		if (!folio_trylock(folio))
		goto put;
		@@ -2102,7 +2112,19 @@ unsigned find_lock_entries(struct address_space mapping, pgoff_t start,
		goto unlock;
		VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
		folio);
		} else {
		nr = 1 << xa_get_order(&mapping->i_pages, xas.xa_index);
		base = xas.xa_index & ~(nr - 1);
		/* Omit order>0 value which begins before the start */
		if (base < *start)
		continue;
		/* Omit order>0 value which extends beyond the end */
		if (base + nr - 1 > end)
		break;
		}

		/* Update start now so that last update is correct on return */
		*start = base + nr;
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
		break;
		@@ -2114,15 +2136,6 @@ unsigned find_lock_entries(struct address_space mapping, pgoff_t start,
		}
		rcu_read_unlock();

		if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio))
		nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
		}
		return folio_batch_count(fbatch);
		}

mm/memory.c

+3 −3

Original line number	Diff line number	Diff line
		@@ -4062,7 +4062,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
		* reusing the same entry. It's undetectable as
		* pte_same() returns true due to entry reuse.
		*/
		if (swapcache_prepare(entry)) {
		if (swapcache_prepare(entry, 1)) {
		/* Relax a bit to prevent rapid repeated page faults */
		schedule_timeout_uninterruptible(1);
		goto out;
		@@ -4369,7 +4369,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
		out:
		/* Clear the swap cache pin for direct swapin after PTL unlock */
		if (need_clear_cache)
		swapcache_clear(si, entry);
		swapcache_clear(si, entry, 1);
		if (si)
		put_swap_device(si);
		return ret;
		@@ -4385,7 +4385,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
		folio_put(swapcache);
		}
		if (need_clear_cache)
		swapcache_clear(si, entry);
		swapcache_clear(si, entry, 1);
		if (si)
		put_swap_device(si);
		return ret;

mm/shmem.c

+205 −55

Original line number	Diff line number	Diff line
		@@ -155,7 +155,7 @@ static unsigned long shmem_default_max_inodes(void)

		static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
		struct mm_struct fault_mm, vm_fault_t fault_type);
		struct vm_area_struct vma, vm_fault_t fault_type);

		static inline struct shmem_sb_info SHMEM_SB(struct super_block sb)
		{
		@@ -791,7 +791,6 @@ static int shmem_add_to_page_cache(struct folio *folio,
		VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
		VM_BUG_ON(expected && folio_test_large(folio));

		folio_ref_add(folio, nr);
		folio->mapping = mapping;
		@@ -849,23 +848,27 @@ static void shmem_delete_from_page_cache(struct folio folio, void radswap)
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		shmem_reliable_folio_add(folio, -nr);
		xa_unlock_irq(&mapping->i_pages);
		folio_put(folio);
		folio_put_refs(folio, nr);
		BUG_ON(error);
		}

		/*
		* Remove swap entry from page cache, free the swap and its page cache.
		* Remove swap entry from page cache, free the swap and its page cache. Returns
		* the number of pages being freed. 0 means entry not found in XArray (0 pages
		* being freed).
		*/
		static int shmem_free_swap(struct address_space *mapping,
		static long shmem_free_swap(struct address_space *mapping,
		pgoff_t index, void *radswap)
		{
		int order = xa_get_order(&mapping->i_pages, index);
		void *old;

		old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
		if (old != radswap)
		return -ENOENT;
		free_swap_and_cache(radix_to_swp_entry(radswap));
		return 0;
		free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);

		return 1 << order;
		}

		/*
		@@ -888,7 +891,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
		if (xas_retry(&xas, page))
		continue;
		if (xa_is_value(page))
		swapped++;
		swapped += 1 << xa_get_order(xas.xa, xas.xa_index);
		if (xas.xa_index == max)
		break;
		if (need_resched()) {
		@@ -1017,7 +1020,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
		if (xa_is_value(folio)) {
		if (unfalloc)
		continue;
		nr_swaps_freed += !shmem_free_swap(mapping,
		nr_swaps_freed += shmem_free_swap(mapping,
		indices[i], folio);
		continue;
		}
		@@ -1084,14 +1087,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
		folio = fbatch.folios[i];

		if (xa_is_value(folio)) {
		long swaps_freed;

		if (unfalloc)
		continue;
		if (shmem_free_swap(mapping, indices[i], folio)) {
		swaps_freed = shmem_free_swap(mapping, indices[i], folio);
		if (!swaps_freed) {
		/* Swap was replaced by page: retry */
		index = indices[i];
		break;
		}
		nr_swaps_freed++;
		nr_swaps_freed += swaps_freed;
		continue;
		}

		@@ -1452,6 +1458,8 @@ static int shmem_writepage(struct page page, struct writeback_control wbc)
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		swp_entry_t swap;
		pgoff_t index;
		int nr_pages;
		bool split = false;

		/*
		* Our capabilities prevent regular writeback or sync from ever calling
		@@ -1470,20 +1478,33 @@ static int shmem_writepage(struct page page, struct writeback_control wbc)
		goto redirty;

		/*
		* If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
		* "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
		* and its shmem_writeback() needs them to be split when swapping.
		* If CONFIG_THP_SWAP is not enabled, the large folio should be
		* split when swapping.
		*
		* And shrinkage of pages beyond i_size does not split swap, so
		* swapout of a large folio crossing i_size needs to split too
		* (unless fallocate has been used to preallocate beyond EOF).
		*/
		if (folio_test_large(folio)) {
		index = shmem_fallocend(inode,
		DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
		if ((index > folio->index && index < folio_next_index(folio)) \|\|
		!IS_ENABLED(CONFIG_THP_SWAP))
		split = true;
		}

		if (split) {
		try_split:
		/* Ensure the subpages are still dirty */
		folio_test_set_dirty(folio);
		if (split_huge_page(page) < 0)
		if (split_huge_page_to_list_to_order(page, wbc->list, 0))
		goto redirty;
		folio = page_folio(page);
		folio_clear_dirty(folio);
		}

		index = folio->index;
		nr_pages = folio_nr_pages(folio);

		/*
		* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
		@@ -1518,8 +1539,12 @@ static int shmem_writepage(struct page page, struct writeback_control wbc)
		}

		swap = folio_alloc_swap(folio);
		if (!swap.val)
		if (!swap.val) {
		if (nr_pages > 1)
		goto try_split;

		goto redirty;
		}

		/*
		* Add inode to shmem_unuse()'s list of swapped-out inodes,
		@@ -1536,8 +1561,8 @@ static int shmem_writepage(struct page page, struct writeback_control wbc)
		if (add_to_swap_cache(folio, swap,
		__GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN,
		NULL) == 0) {
		shmem_recalc_inode(inode, 0, 1);
		swap_shmem_alloc(swap);
		shmem_recalc_inode(inode, 0, nr_pages);
		swap_shmem_alloc(swap, nr_pages);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		@@ -1903,30 +1928,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
		}

		static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
		struct shmem_inode_info *info, pgoff_t index)
		struct shmem_inode_info *info, pgoff_t index,
		struct vm_area_struct *vma)
		{
		struct folio old, new;
		struct address_space *swap_mapping;
		swp_entry_t entry;
		pgoff_t swap_index;
		int error;

		old = *foliop;
		entry = old->swap;
		swap_index = swp_offset(entry);
		swap_mapping = swap_address_space(entry);
		struct folio new, old = *foliop;
		swp_entry_t entry = old->swap;
		struct address_space *swap_mapping = swap_address_space(entry);
		pgoff_t swap_index = swp_offset(entry);
		XA_STATE(xas, &swap_mapping->i_pages, swap_index);
		int nr_pages = folio_nr_pages(old);
		int error = 0, i;

		/*
		* We have arrived here because our zones are constrained, so don't
		* limit chance of success by further cpuset and node constraints.
		*/
		gfp &= ~GFP_CONSTRAINT_MASK;
		VM_BUG_ON_FOLIO(folio_test_large(old), old);
		new = shmem_alloc_folio(gfp, 0, info, index);
		#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		if (nr_pages > 1) {
		gfp_t huge_gfp = vma_thp_gfp_mask(vma);

		gfp = limit_gfp_mask(huge_gfp, gfp);
		}
		#endif

		new = shmem_alloc_folio(gfp, folio_order(old), info, index);
		if (!new)
		return -ENOMEM;

		folio_get(new);
		folio_ref_add(new, nr_pages);
		folio_copy(new, old);
		flush_dcache_folio(new);

		@@ -1936,20 +1966,27 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
		new->swap = entry;
		folio_set_swapcache(new);

		/*
		* Our caller will very soon move newpage out of swapcache, but it's
		* a nice clean interface for us to replace oldpage by newpage there.
		*/
		/* Swap cache still stores N entries instead of a high-order entry */
		xa_lock_irq(&swap_mapping->i_pages);
		error = shmem_replace_entry(swap_mapping, swap_index, old, new);
		for (i = 0; i < nr_pages; i++) {
		void *item = xas_load(&xas);

		if (item != old) {
		error = -ENOENT;
		break;
		}

		xas_store(&xas, new);
		xas_next(&xas);
		}
		if (!error) {
		mem_cgroup_replace_folio(old, new);
		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
		shmem_reliable_folio_add(new, 1);
		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
		shmem_reliable_folio_add(old, -1);
		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
		__lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
		shmem_reliable_folio_add(new, nr_pages);
		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
		__lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
		shmem_reliable_folio_add(old, -nr_pages);
		}
		xa_unlock_irq(&swap_mapping->i_pages);

		@@ -1969,7 +2006,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
		old->private = NULL;

		folio_unlock(old);
		folio_put_refs(old, 2);
		/*
		* The old folio has been removed from the swap cache: drop the
		* 'nr_pages' references, as well as the one temporary reference
		* obtained from the swap cache.
		*/
		folio_put_refs(old, nr_pages + 1);
		return error;
		}

		@@ -1979,6 +2021,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
		struct address_space *mapping = inode->i_mapping;
		swp_entry_t swapin_error;
		void *old;
		int nr_pages;

		swapin_error = make_poisoned_swp_entry();
		old = xa_cmpxchg_irq(&mapping->i_pages, index,
		@@ -1987,6 +2030,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
		if (old != swp_to_radix_entry(swap))
		return;

		nr_pages = folio_nr_pages(folio);
		folio_wait_writeback(folio);
		delete_from_swap_cache(folio);
		/*
		@@ -1994,8 +2038,86 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
		* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
		* in shmem_evict_inode().
		*/
		shmem_recalc_inode(inode, -1, -1);
		swap_free(swap);
		shmem_recalc_inode(inode, -nr_pages, -nr_pages);
		swap_free_nr(swap, nr_pages);
		}

		/*
		* Split a large (order > 0) swap entry stored at @index in the inode's
		* page cache XArray into order-0 swap entries.
		*
		* @inode: inode whose i_mapping->i_pages holds the swap entry.
		* @index: page cache index that falls inside the (possibly large) entry.
		* @swap:  swap entry the caller expects to find in that slot.
		* @gfp:   allocation flags; masked with GFP_RECLAIM_MASK before being used
		*         for XArray node allocation.
		*
		* The split needs preallocated XArray nodes, and that allocation is done
		* with the xas lock dropped. The function therefore loops: look at the slot
		* under the lock, unlock, allocate for the observed order, then re-take the
		* lock and re-validate before actually splitting.
		*
		* Returns the order of the entry that was split (0 when no split was
		* performed because the entry found was already order 0), or a negative
		* errno taken from the xa_state: -EEXIST when the slot no longer holds
		* @swap, or whatever error the XArray helpers recorded.
		*/
		static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
		swp_entry_t swap, gfp_t gfp)
		{
		struct address_space *mapping = inode->i_mapping;
		XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
		/* Entry and order for which split nodes are currently preallocated */
		void *alloced_shadow = NULL;
		int alloced_order = 0, i;

		/* Convert user data gfp flags to xarray node gfp flags */
		gfp &= GFP_RECLAIM_MASK;

		for (;;) {
		/* split_order != 0 requests an allocation after the lock is dropped */
		int order = -1, split_order = 0;
		void *old = NULL;

		xas_lock_irq(&xas);
		old = xas_load(&xas);
		/*
		* The slot must still hold exactly the swap entry the caller passed
		* in; anything else is reported as -EEXIST.
		*/
		if (!xa_is_value(old) \|\| swp_to_radix_entry(swap) != old) {
		xas_set_err(&xas, -EEXIST);
		goto unlock;
		}

		order = xas_get_order(&xas);

		/* Swap entry may have changed before we re-acquire the lock */
		if (alloced_order &&
		(old != alloced_shadow \|\| order != alloced_order)) {
		/* Discard the stale preallocation; a fresh one is made if needed */
		xas_destroy(&xas);
		alloced_order = 0;
		}

		/* Try to split large swap entry in pagecache */
		if (order > 0) {
		/*
		* Nothing preallocated for this order yet: drop the lock first,
		* then allocate below and retry.
		*/
		if (!alloced_order) {
		split_order = order;
		goto unlock;
		}
		xas_split(&xas, old, order);

		/*
		* Re-set the swap entry after splitting, and the swap
		* offset of the original large entry must be continuous.
		*/
		for (i = 0; i < 1 << order; i++) {
		/* First index covered by the original large entry */
		pgoff_t aligned_index = round_down(index, 1 << order);
		swp_entry_t tmp;

		/* i-th slot gets the swap entry at offset swp_offset(swap) + i */
		tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
		__xa_store(&mapping->i_pages, aligned_index + i,
		swp_to_radix_entry(tmp), 0);
		}
		}

		unlock:
		xas_unlock_irq(&xas);

		/* split needed, alloc here and retry. */
		if (split_order) {
		xas_split_alloc(&xas, old, split_order, gfp);
		if (xas_error(&xas))
		goto error;
		/* Remember what was allocated for, so the retry can re-validate it */
		alloced_shadow = old;
		alloced_order = split_order;
		xas_reset(&xas);
		continue;
		}

		/*
		* Loop again only while xas_nomem() returns true.
		* NOTE(review): per the XArray API this is expected to mean it
		* allocated memory after an -ENOMEM recorded in xas - confirm.
		*/
		if (!xas_nomem(&xas, gfp))
		break;
		}

		error:
		/* An error recorded in xas takes precedence over the split order */
		if (xas_error(&xas))
		return xas_error(&xas);

		return alloced_order;
		}

		/*
		@@ -2006,15 +2128,16 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
		*/
		static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		struct folio **foliop, enum sgp_type sgp,
		gfp_t gfp, struct mm_struct *fault_mm,
		gfp_t gfp, struct vm_area_struct *vma,
		vm_fault_t *fault_type)
		{
		struct address_space *mapping = inode->i_mapping;
		struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
		struct shmem_inode_info *info = SHMEM_I(inode);
		struct swap_info_struct *si;
		struct folio *folio = NULL;
		swp_entry_t swap;
		int error;
		int error, nr_pages;

		VM_BUG_ON(!foliop \|\| !xa_is_value(foliop));
		swap = radix_to_swp_entry(*foliop);
		@@ -2034,12 +2157,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		/* Look it up and read it in.. */
		folio = swap_cache_get_folio(swap, NULL, 0);
		if (!folio) {
		int split_order;

		/* Or update major stats only when swapin succeeds?? */
		if (fault_type) {
		*fault_type \|= VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(fault_mm, PGMAJFAULT);
		}

		/*
		* Now swap device can only swap in order 0 folio, then we
		* should split the large swap entry stored in the pagecache
		* if necessary.
		*/
		split_order = shmem_split_large_entry(inode, index, swap, gfp);
		if (split_order < 0) {
		error = split_order;
		goto failed;
		}

		/*
		* If the large swap entry has already been split, it is
		* necessary to recalculate the new swap entry based on
		* the old order alignment.
		*/
		if (split_order > 0) {
		pgoff_t offset = index - round_down(index, 1 << split_order);

		swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
		}

		/* Here we actually start the io */
		folio = shmem_swapin(swap, gfp, info, index);
		if (!folio) {
		@@ -2061,6 +2209,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		goto failed;
		}
		folio_wait_writeback(folio);
		nr_pages = folio_nr_pages(folio);

		/*
		* Some architectures may have to restore extra metadata to the
		@@ -2069,24 +2218,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		arch_swap_restore(folio_swap(swap, folio), folio);

		if (shmem_should_replace_folio(folio, gfp)) {
		error = shmem_replace_folio(&folio, gfp, info, index);
		error = shmem_replace_folio(&folio, gfp, info, index, vma);
		if (error)
		goto failed;
		}

		error = shmem_add_to_page_cache(folio, mapping, index,
		error = shmem_add_to_page_cache(folio, mapping,
		round_down(index, nr_pages),
		swp_to_radix_entry(swap), gfp);
		if (error)
		goto failed;

		shmem_recalc_inode(inode, 0, -1);
		shmem_recalc_inode(inode, 0, -nr_pages);

		if (sgp == SGP_WRITE)
		folio_mark_accessed(folio);

		delete_from_swap_cache(folio);
		folio_mark_dirty(folio);
		swap_free(swap);
		swap_free_nr(swap, nr_pages);
		put_swap_device(si);

		*foliop = folio;
		@@ -2149,7 +2299,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,

		if (xa_is_value(folio)) {
		error = shmem_swapin_folio(inode, index, &folio,
		sgp, gfp, fault_mm, fault_type);
		sgp, gfp, vma, fault_type);
		if (error == -EEXIST)
		goto repeat;