Commit c97cdd7e authored by ZhangPeng, committed by Ma Wupeng
Browse files

userswap: introduce MREMAP_USWAP_SET_PTE to remap for swapping out

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM



--------------------------------

We introduce MREMAP_USWAP_SET_PTE to implement remapping in the swap-out
phase. Unmap the pages between 'old_addr ~ old_addr+old_len' and remap them
to 'new_addr ~ new_addr+new_len'. During unmapping, the PTEs of the old
range are set to SWP_USERSWAP_ENTRY.

Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
parent 444ec524
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -6,16 +6,28 @@
#ifndef _LINUX_USERSWAP_H
#define _LINUX_USERSWAP_H

#include <linux/mman.h>
#include <linux/userfaultfd.h>

#ifdef CONFIG_USERSWAP

extern int enable_userswap;

/*
 * In uswap situation, we use the bit 0 of the returned address to indicate
 * whether the pages are dirty.
 */
#define USWAP_PAGES_DIRTY	1

/*
 * Fill the PTE for 'dst_addr' in 'dst_vma' (page table entry located via
 * 'dst_pmd') on behalf of 'dst_mm', using 'src_addr' as the source.
 *
 * NOTE(review): judging by the name, the source page is mapped in place
 * rather than copied - confirm against the definition, which is not part
 * of this header.
 */
int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr);

/*
 * Swap-out remap used by mremap() when MREMAP_USWAP_SET_PTE is passed:
 * unmap the pages between 'old_addr ~ old_addr+old_len' and remap them to
 * 'new_addr ~ new_addr+new_len'. While unmapping, the PTEs of the old range
 * are set to SWP_USERSWAP_ENTRY.
 *
 * The return value is handed back to user space as the mremap() result.
 * NOTE(review): presumably bit 0 of a successful return carries
 * USWAP_PAGES_DIRTY - confirm against the definition.
 */
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
			   unsigned long new_addr, unsigned long new_len);

static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
{
	if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP))
@@ -23,6 +35,18 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
	return true;
}

/*
 * Validate the 'flags' argument of mremap() when CONFIG_USERSWAP is built in.
 *
 * MREMAP_USWAP_SET_PTE is accepted only while enable_userswap is set, and
 * only when it is the sole flag. Without it, any combination of
 * MREMAP_FIXED, MREMAP_MAYMOVE and MREMAP_DONTUNMAP is accepted; every
 * other bit is rejected.
 *
 * Returns true if 'flags' is acceptable, false otherwise.
 */
static inline bool uswap_validate_mremap_flags(unsigned long flags)
{
	const unsigned long known_flags = MREMAP_FIXED | MREMAP_MAYMOVE |
					  MREMAP_DONTUNMAP |
					  MREMAP_USWAP_SET_PTE;

	/* The userswap flag is exclusive and needs the feature switched on. */
	if (flags & MREMAP_USWAP_SET_PTE)
		return enable_userswap && flags == MREMAP_USWAP_SET_PTE;

	return !(flags & ~known_flags);
}

#endif /* CONFIG_USERSWAP */

#endif /* _LINUX_USERSWAP_H */
+0 −2
Original line number Diff line number Diff line
@@ -30,8 +30,6 @@
#define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */

#define MAP_REPLACE		0x1000000

#define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
					 * uninitialized */

+1 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#define MREMAP_MAYMOVE		1
#define MREMAP_FIXED		2
#define MREMAP_DONTUNMAP	4
/*
 * userswap: remap for swapping out; the PTEs of the old range are set to
 * SWP_USERSWAP_ENTRY while the pages are moved to the new range.
 */
#define MREMAP_USWAP_SET_PTE	64

#define OVERCOMMIT_GUESS		0
#define OVERCOMMIT_ALWAYS		1
+0 −205
Original line number Diff line number Diff line
@@ -49,7 +49,6 @@
#include <linux/sched/mm.h>
#include <linux/swapops.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -1623,205 +1622,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len,
{
	return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf);
}
#ifdef CONFIG_USERSWAP
/*
 * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
 * the reference of the pages and return the pages through input parameters
 * 'ppages'.
 *
 * On success 0 is returned and '*ppages' points to a kmalloc'ed array of
 * 'len / PAGE_SIZE' pages, each holding one extra reference taken by
 * follow_page(FOLL_GET); the caller must put_page() every entry and kfree()
 * the array.
 *
 * On failure every reference taken so far is dropped, the array is freed,
 * '*ppages' is set to NULL and one of the following is returned:
 *   -ENOMEM  the page array could not be allocated
 *   -EINVAL  the VMA or the page is of a kind that cannot be swapped, or a
 *            transparent huge page could not be locked/split
 *   -EAGAIN  the VMA does not have VM_UFFD_MISSING set
 *   -ENODEV  follow_page() found no page at the address
 *   -EBUSY   the page is mapped more than once or has extra references
 */
static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
				unsigned long len, struct page ***ppages)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;
	struct page **pages = NULL;
	unsigned long addr_end = addr + len;
	int ret;
	int i, page_num = 0;

	/* kmalloc_array() checks the count * size multiplication for overflow */
	pages = kmalloc_array(len / PAGE_SIZE, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	while (addr < addr_end) {
		/*
		 * NOTE(review): find_vma() may return a VMA that starts above
		 * 'addr'; confirm that an address in a hole is rejected below.
		 */
		vma = find_vma(mm, addr);
		if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
		    (vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
			ret = -EINVAL;
			goto out;
		}
		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
			ret = -EAGAIN;
			goto out;
		}
get_again:
		/* follow_page will inc page ref, dec the ref after we remap the page */
		page = follow_page(vma, addr, FOLL_GET);
		if (IS_ERR_OR_NULL(page)) {
			ret = -ENODEV;
			goto out;
		}
		/* record the page first so the 'out' path drops its reference */
		pages[page_num++] = page;
		if (!PageAnon(page) || !PageSwapBacked(page) ||
		    PageHuge(page) || PageSwapCache(page)) {
			ret = -EINVAL;
			goto out;
		} else if (PageTransCompound(page)) {
			if (trylock_page(page)) {
				if (!split_huge_page(page)) {
					/*
					 * Split succeeded: release the lock
					 * while we still hold our reference,
					 * then forget this entry and look the
					 * address up again.
					 */
					unlock_page(page);
					put_page(page);
					page_num--;
					goto get_again;
				} else {
					unlock_page(page);
					ret = -EINVAL;
					goto out;
				}
			} else {
				ret = -EINVAL;
				goto out;
			}
		}
		/* only pages mapped once, with no reference besides ours */
		if (page_mapcount(page) > 1 ||
		    page_mapcount(page) + 1 != page_count(page)) {
			ret = -EBUSY;
			goto out;
		}
		addr += PAGE_SIZE;
	}

	*ppages = pages;
	return 0;

out:
	for (i = 0; i < page_num; i++)
		put_page(pages[i]);
	kfree(pages);
	*ppages = NULL;
	return ret;
}

/*
 * In uswap situation, we use the bit 0 of the returned address to indicate
 * whether the pages are dirty.
 */
#define USWAP_PAGES_DIRTY	1

/*
 * Unmap the pages between 'addr_start ~ addr_start+len' and remap them to
 * 'new_addr ~ new_addr+len'.
 *
 * 'pages' holds one entry per PAGE_SIZE step of the old range, in address
 * order. For each old address the PTE is cleared and replaced with a
 * SWP_USERSWAP_ENTRY swap entry carrying the page's pfn, and the VMA is
 * tagged VM_USWAP. The pages are then inserted at the new range with
 * vm_insert_page().
 *
 * Returns 'new_addr', with bit 0 (USWAP_PAGES_DIRTY) set if any old PTE or
 * page was dirty. On failure a negative errno is returned as unsigned long:
 *   -EINVAL  no VMA found for an old address
 *   -ENXIO   no PMD found for an old address
 *   -ENODEV  no VMA found for a new address
 *   -EFAULT  vm_insert_page() failed
 * Errors return immediately; work already done for earlier pages is not
 * undone.
 */
static unsigned long
do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len,
	     struct page **pages, unsigned long new_addr)
{
	struct vm_area_struct *vma;
	struct page *page;
	struct mmu_notifier_range range;
	pmd_t *pmd;
	pte_t *pte, old_pte;
	spinlock_t *ptl;
	unsigned long addr;
	bool pages_dirty = false;
	int i = 0;

	addr = addr_start;
	lru_add_drain();
	while (addr < addr_start + len) {
		page = pages[i];
		vma = find_vma(mm, addr);
		if (!vma)
			return -EINVAL;

		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
				vma->vm_mm, addr, addr + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(&range);
		pmd = mm_find_pmd(mm, addr);
		if (!pmd) {
			/* close the notifier range opened above before bailing */
			mmu_notifier_invalidate_range_end(&range);
			return -ENXIO;
		}
		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		flush_cache_page(vma, addr, pte_pfn(*pte));
		old_pte = ptep_clear_flush(vma, addr, pte);
		if (pte_dirty(old_pte) || PageDirty(page))
			pages_dirty = true;
		/* leave a userswap marker holding the pfn in place of the mapping */
		set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
							page_to_pfn(page))));
		dec_mm_counter(mm, MM_ANONPAGES);
		reliable_page_counter(page, mm, -1);
		page_remove_rmap(page, false);
		put_page(page);

		pte_unmap_unlock(pte, ptl);
		mmu_notifier_invalidate_range_end(&range);
		vma->vm_flags |= VM_USWAP;
		page->mapping = NULL;
		addr += PAGE_SIZE;
		i++;
	}

	addr = new_addr;
	vma = find_vma(mm, addr);
	/* check before the loop dereferences vma->vm_end */
	if (!vma)
		return -ENODEV;
	i = 0;
	while (addr < new_addr + len) {
		/* walked past the current VMA: look up the one covering addr */
		if (addr > vma->vm_end - 1) {
			vma = find_vma(mm, addr);
			if (!vma)
				return -ENODEV;
		}

		page = pages[i++];
		if (vm_insert_page(vma, addr, page))
			return -EFAULT;

		addr += PAGE_SIZE;
	}
	vma->vm_flags |= VM_USWAP;

	if (pages_dirty)
		new_addr = new_addr | USWAP_PAGES_DIRTY;

	return new_addr;
}

/*
 * mmap() path for userswap: validate the request, pin the pages of the
 * range starting at 'addr', create the new mapping with __do_mmap() and
 * hand both ranges to do_user_swap().
 *
 * 'addr' and 'len' must be page aligned and 'len' non-zero, otherwise
 * -EINVAL is returned. Errors from pages_can_be_swapped() and __do_mmap()
 * are passed through; otherwise the result of do_user_swap() is returned.
 */
static inline unsigned long
do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len,
	      unsigned long prot, unsigned long flags, unsigned long pgoff,
	      unsigned long *populate, struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	const unsigned long src_addr = addr;
	struct page **pinned = NULL;
	unsigned long mapped_addr;
	unsigned long result;
	int idx;

	if (!len || offset_in_page(addr) || (len % PAGE_SIZE))
		return -EINVAL;

	result = pages_can_be_swapped(mm, addr, len, &pinned);
	if (result)
		return result;

	/* mark the vma as special to avoid merging with other vmas */
	mapped_addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL,
				pgoff, populate, uf);
	if (IS_ERR_VALUE(mapped_addr))
		result = mapped_addr;
	else
		result = do_user_swap(mm, src_addr, len, pinned, mapped_addr);

	/* drop the references follow_page() took in pages_can_be_swapped() */
	for (idx = 0; idx < len / PAGE_SIZE; idx++)
		put_page(pinned[idx]);
	kfree(pinned);

	return result;
}
#endif

/*
 * The caller must write-lock current->mm->mmap_lock. 
@@ -1831,11 +1631,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate, struct list_head *uf)
{
#ifdef CONFIG_USERSWAP
	if (enable_userswap && (flags & MAP_REPLACE))
		return do_uswap_mmap(file, addr, len, prot, flags, pgoff,
				     populate, uf);
#endif
	return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}

+11 −0
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
	 */
	addr = untagged_addr(addr);

#ifdef CONFIG_USERSWAP
	if (!uswap_validate_mremap_flags(flags))
		return ret;
#else
	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;
#endif

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;
@@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
	if (!new_len)
		return ret;

#ifdef CONFIG_USERSWAP
	if (flags & MREMAP_USWAP_SET_PTE)
		return uswap_mremap(addr, old_len, new_addr, new_len);
#endif

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

Loading