Commit c3e6287f authored by Guo Fan, committed by Cheng Jian

userswap: support userswap via userfaultfd



hulk inclusion
category: feature
bugzilla: 47439
CVE: NA

-------------------------------------------------

This patch modifies userfaultfd to support userswap. To check whether
the pages are dirty since the last swap-in, we make them clean when we
swap them in. Userspace may swap in a large area of which only part has
been swapped out; we need to skip the pages that were never swapped
out.
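
For illustration only (not part of this patch): a minimal userspace sketch of
registering an anonymous region with the new USWAP mode. The value of
UFFDIO_REGISTER_MODE_USWAP is repeated locally because the uapi definition is
guarded by CONFIG_USERSWAP; error handling is abbreviated, and the flow
(UFFDIO_API handshake, then UFFDIO_REGISTER) is the standard userfaultfd
sequence.

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef UFFDIO_REGISTER_MODE_USWAP
#define UFFDIO_REGISTER_MODE_USWAP	((__u64)1 << 2)	/* value defined by this patch */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 16 * page;
	void *area;
	int uffd;

	/* Anonymous mapping that the userswap agent will manage. */
	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED)
		return 1;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return 1;

	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	ioctl(uffd, UFFDIO_API, &api);

	/*
	 * Register the range with MISSING faults plus the new USWAP mode.
	 * The kernel clears the USWAP bit internally and records VM_USWAP on
	 * the whole VMA(s) overlapping the range, so the effective range may
	 * be larger than requested.
	 */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_USWAP,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
		perror("UFFDIO_REGISTER");

	return 0;
}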

Signed-off-by: Guo Fan <guofan5@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
parent e3452806
+25 −1
@@ -327,6 +327,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
#ifdef CONFIG_USERSWAP
	if ((reason & VM_USWAP) && (!pte_present(*pte)))
		ret = true;
#endif
	if (pte_none(*pte))
		ret = true;
	pte_unmap(pte);
@@ -1321,10 +1325,30 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	vm_flags = 0;
#ifdef CONFIG_USERSWAP
	/*
	 * register the whole vma overlapping with the address range to avoid
	 * splitting the vma.
	 */
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_USWAP) {
		uffdio_register.mode &= ~UFFDIO_REGISTER_MODE_USWAP;
		vm_flags |= VM_USWAP;
		end = uffdio_register.range.start + uffdio_register.range.len - 1;
		vma = find_vma(mm, uffdio_register.range.start);
		if (!vma)
			goto out;
		uffdio_register.range.start = vma->vm_start;

		vma = find_vma(mm, end);
		if (!vma)
			goto out;
		uffdio_register.range.len = vma->vm_end - uffdio_register.range.start;
	}
#endif
	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
				     UFFDIO_REGISTER_MODE_WP))
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+4 −0
@@ -47,7 +47,11 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,

static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
#ifdef CONFIG_USERSWAP
	return (vma->vm_flags & VM_UFFD_MISSING) && !(vma->vm_flags & VM_USWAP);
#else
	return vma->vm_flags & VM_UFFD_MISSING;
#endif
}

static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+3 −0
@@ -190,6 +190,9 @@ struct uffdio_register {
	struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
#ifdef CONFIG_USERSWAP
#define UFFDIO_REGISTER_MODE_USWAP		((__u64)1<<2)
#endif
	__u64 mode;

	/*
+19 −0
@@ -2769,6 +2769,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
#ifdef CONFIG_USERSWAP
	if (swp_type(entry) == SWP_USERSWAP_ENTRY) {
		/* print error if we come across a nested fault */
		if (!strncmp(current->comm, "uswap", 5)) {
			pr_err("USWAP: fault %lx is triggered by %s\n",
					vmf->address, current->comm);
			return VM_FAULT_SIGBUS;
		}
		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
			pr_err("USWAP: addr %lx flags %lx is not a user swap page",
					vmf->address, vma->vm_flags);
			goto skip_uswap;
		}
		BUG_ON(!(vma->vm_flags & VM_UFFD_MISSING));
		ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP);
		return ret;
	}
skip_uswap:
#endif
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
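
The strncmp(current->comm, "uswap", 5) check above implies a userspace
convention: the swap agent's fault-handling thread is expected to carry a name
starting with "uswap", so that a nested fault raised by the handler itself is
answered with SIGBUS instead of deadlocking on its own userfaultfd. A
hypothetical sketch of that convention (the thread name and loop structure are
assumptions, not part of the patch):

#define _GNU_SOURCE
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>

static void *uswap_handler(void *arg)
{
	int uffd = *(int *)arg;

	/* comm starts with "uswap", so do_swap_page() above can detect a
	 * nested fault from this thread and return VM_FAULT_SIGBUS. */
	pthread_setname_np(pthread_self(), "uswap-handler");

	for (;;) {
		struct uffd_msg msg;
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };

		poll(&pfd, 1, -1);
		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			continue;
		if (msg.event == UFFD_EVENT_PAGEFAULT) {
			/* fetch the data from the backing store and resolve
			 * the fault with UFFDIO_COPY (see the next sketch) */
		}
	}
	return NULL;
}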
+26 −0
@@ -60,6 +60,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
		*pagep = NULL;
	}

#ifdef CONFIG_USERSWAP
	if (dst_vma->vm_flags & VM_USWAP)
		ClearPageDirty(page);
#endif
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceeding stores to the page contents become visible before
@@ -74,6 +78,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
#ifdef CONFIG_USERSWAP
	if (dst_vma->vm_flags & VM_USWAP)
		_dst_pte = pte_mkclean(_dst_pte);
#endif

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
@@ -85,9 +93,27 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
		if (unlikely(offset >= max_off))
			goto out_release_uncharge_unlock;
	}

#ifdef CONFIG_USERSWAP
	if (!(dst_vma->vm_flags & VM_USWAP)) {
		ret = -EEXIST;
		if (!pte_none(*dst_pte))
			goto out_release_uncharge_unlock;
	} else {
		/*
		 * The userspace may swap in a large area. Part of the area is
		 * not swapped out. Skip those pages.
		 */
		ret = 0;
		if (swp_type(pte_to_swp_entry(*dst_pte)) != SWP_USERSWAP_ENTRY ||
		    pte_present(*dst_pte))
			goto out_release_uncharge_unlock;
	}
#else
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;
#endif

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
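
On the userspace side, a fault in a VM_USWAP region would be resolved with an
ordinary UFFDIO_COPY; the mcopy_atomic_pte() changes above install the page
with a clean PTE and silently skip entries in the range that are not
SWP_USERSWAP_ENTRY, so a swap-in request covering a partially resident range
does not fail. A hedged sketch (the helper name, source buffer, and alignment
handling are assumptions):

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static int uswap_fill_range(int uffd, unsigned long fault_addr,
			    const void *buf, size_t len, long page_size)
{
	struct uffdio_copy copy = {
		/* page-align the destination of the copy */
		.dst  = fault_addr & ~((unsigned long)page_size - 1),
		.src  = (unsigned long)buf,
		.len  = len,
		.mode = 0,
	};

	/*
	 * With VM_USWAP, mcopy_atomic_pte() installs the pages with a clean
	 * PTE and skips any PTE in the range that is not a SWP_USERSWAP_ENTRY,
	 * so copying a large range that is only partially swapped out is safe.
	 */
	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
		return -1;

	return copy.copy == (long long)len ? 0 : -1;
}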