Commit 3ef2eace authored by ZhangPeng's avatar ZhangPeng Committed by Peng Zhang
Browse files

mm/userswap: introduce MREMAP_USWAP_SET_PTE

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX


CVE: NA

--------------------------------

We introduce MREMAP_USWAP_SET_PTE to implement remapping in the swap-out
phase. Unmap the pages between 'addr ~ addr+old_len' and remap them to
'new_addr ~ new_addr+new_len'. During unmapping, the PTE of old_addr is
set to SWP_USERSWAP_ENTRY.

Signed-off-by: default avatarZhangPeng <zhangpeng362@huawei.com>
parent af2100e2
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -2492,6 +2492,7 @@ int set_page_dirty_lock(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

extern pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
+25 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 */

#ifndef _LINUX_USERSWAP_H
#define _LINUX_USERSWAP_H

#include <linux/mman.h>

#ifdef CONFIG_USERSWAP

extern struct static_key_false userswap_enabled;

/*
 * In uswap situation, we use the bit 0 of the returned address to indicate
 * whether the pages are dirty.
 */
#define USWAP_PAGES_DIRTY	1

unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
			   unsigned long new_addr, unsigned long new_len);

#endif /* CONFIG_USERSWAP */
#endif /* _LINUX_USERSWAP_H */
+1 −0
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#define MREMAP_MAYMOVE		1
#define MREMAP_FIXED		2
#define MREMAP_DONTUNMAP	4
#define MREMAP_USWAP_SET_PTE	64

#define OVERCOMMIT_GUESS		0
#define OVERCOMMIT_ALWAYS		1
+7 −1
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -33,7 +34,7 @@

#include "internal.h"

static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
@@ -931,6 +932,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
	 */
	addr = untagged_addr(addr);

#ifdef CONFIG_USERSWAP
	if (flags == MREMAP_USWAP_SET_PTE)
		return uswap_mremap(addr, old_len, new_addr, new_len);
#endif

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;

+380 −0
Original line number Diff line number Diff line
@@ -5,10 +5,390 @@
 * userswap core file
 */

#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/userswap.h>

#include "internal.h"

DEFINE_STATIC_KEY_FALSE(userswap_enabled);

static bool vma_uswap_compatible(struct vm_area_struct *vma)
{
	if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
	    (vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | VM_IO |
	    VM_PFNMAP | VM_HUGETLB)))
		return false;
	return true;
}

/*
 * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
 * the reference of the pages and return the pages through input parameters
 * 'ppages'.
 */
static unsigned long pages_can_be_swapped(struct mm_struct *mm,
					  unsigned long addr,
					  unsigned long len,
					  struct page ***ppages)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;
	struct page **pages = NULL;
	unsigned long addr_end = addr + len;
	unsigned long ret;
	unsigned long i, page_num = 0;
	*ppages = NULL;

	pages = kvzalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	while (addr < addr_end) {
		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_USWAP) ||
		    !vma_uswap_compatible(vma)) {
			ret = -EINVAL;
			goto out_err;
		}

		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
			ret = -EAGAIN;
			goto out_err;
		}
get_again:
		/*
		 * follow_page will inc page ref, dec the ref after we remap
		 * the page.
		 */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
		if (IS_ERR_OR_NULL(page)) {
			ret = -ENODEV;
			goto out_err;
		}

		pages[page_num++] = page;
		if (!PageAnon(page) || !PageSwapBacked(page) ||
		    PageHuge(page) || PageSwapCache(page)) {
			ret = -EINVAL;
			goto out_err;
		}

		if (PageTransCompound(page)) {
			if (trylock_page(page)) {
				if (!split_huge_page(page)) {
					unlock_page(page);
					put_page(page);
					page_num--;
					goto get_again;
				} else {
					unlock_page(page);
				}
			}
			ret = -EINVAL;
			goto out_err;
		}

		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) > 1 ||
		    page_mapcount(page) + 1 != page_count(page)) {
			ret = -EBUSY;
			goto out_err;
		}
		addr += PAGE_SIZE;
	}

	*ppages = pages;
	return 0;

out_err:
	for (i = 0; i < page_num; i++)
		put_page(pages[i]);
	kvfree(pages);
	return ret;
}

static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return false;
	else if (pud_huge(*pud))
		return true;

	pmd = pmd_offset(pud, addr);
	if (!pmd)
		return false;
	else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
		return true;

	return false;
}

static int uswap_unmap_anon_page(struct mm_struct *mm,
				 struct vm_area_struct *vma,
				 unsigned long addr, struct page *page,
				 pmd_t *pmd, pte_t *old_pte, bool set_to_swp)
{
	struct mmu_notifier_range range;
	spinlock_t *ptl;
	pte_t *pte, _old_pte;
	int ret = 0;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_present(*pte)) {
		ret = -EINVAL;
		goto out_release_unlock;
	}
	flush_cache_page(vma, addr, pte_pfn(*pte));
	_old_pte = ptep_clear_flush(vma, addr, pte);
	if (old_pte)
		*old_pte = _old_pte;
	if (set_to_swp)
		set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
			   SWP_USERSWAP_ENTRY, page_to_pfn(page))));

	dec_mm_counter(mm, MM_ANONPAGES);
	page_remove_rmap(page, vma, false);
	page->mapping = NULL;

out_release_unlock:
	pte_unmap_unlock(pte, ptl);
	mmu_notifier_invalidate_range_end(&range);
	return ret;
}

static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
					 unsigned long addr, struct page *page)
{
	struct mm_struct *mm = vma->vm_mm;
	int ret = 0;
	pte_t *pte, dst_pte;
	spinlock_t *ptl;

	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;

	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	if (!pte_none(*pte)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	inc_mm_counter(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, addr);
	dst_pte = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte));
	set_pte_at(mm, addr, pte, dst_pte);

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return ret;
}

static void uswap_map_anon_page(struct mm_struct *mm,
				struct vm_area_struct *vma,
				unsigned long addr,
				struct page *page,
				pmd_t *pmd,
				pte_t old_pte)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_cache_page(vma, addr, pte_pfn(*pte));
	set_pte_at(mm, addr, pte, old_pte);
	inc_mm_counter(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, addr);
	pte_unmap_unlock(pte, ptl);
}

static void uswapout_recover(struct mm_struct *mm,
			     unsigned long old_addr_start, unsigned long len,
			     struct page **pages, unsigned long new_addr_start,
			     pte_t *ptes)
{
	unsigned long unmap_old_addr = old_addr_start;
	unsigned long unmap_new_addr = new_addr_start;
	struct page *page;
	pmd_t *old_pmd, *new_pmd;
	pte_t pte;
	unsigned long i;

	for (i = 0; i < len; i++) {
		page = pages[i];
		pte = ptes[i];
		new_pmd = mm_find_pmd(mm, new_addr_start);
		old_pmd = mm_find_pmd(mm, unmap_old_addr);

		uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
				      unmap_new_addr, page, new_pmd, NULL,
				      false);
		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
				    unmap_old_addr, page, old_pmd, pte);
		unmap_old_addr += PAGE_SIZE;
		unmap_new_addr += PAGE_SIZE;
	}
	if (pte_val(ptes[len]) != 0) {
		page = pages[len];
		pte = ptes[len];
		old_pmd = mm_find_pmd(mm, unmap_old_addr);

		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
				    unmap_old_addr, page, old_pmd, pte);
		get_page(page);
	}
}

/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
static unsigned long do_user_swap(struct mm_struct *mm,
				  unsigned long old_addr_start,
				  unsigned long len, struct page **pages,
				  unsigned long new_addr_start)
{
	struct vm_area_struct *old_vma, *new_vma;
	unsigned long old_addr = old_addr_start;
	unsigned long new_addr = new_addr_start;
	struct page *page;
	pmd_t *pmd;
	pte_t old_pte, *ptes;
	bool pages_dirty = false;
	unsigned long i = 0, j;
	int ret;

	ptes = kvzalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
	if (!ptes)
		return -ENOMEM;
	lru_add_drain();
	for (j = 0; j < len; j += PAGE_SIZE) {
		page = pages[i];
		ret = -EINVAL;
		if (!page)
			goto out_recover;
		if (is_thp_or_huge(mm, new_addr))
			goto out_recover;
		old_vma = find_vma(mm, old_addr);
		if (!old_vma || old_addr < old_vma->vm_start)
			goto out_recover;
		new_vma = find_vma(mm, new_addr);
		if (!new_vma || new_addr < new_vma->vm_start)
			goto out_recover;
		if (!vma_uswap_compatible(new_vma))
			goto out_recover;

		ret = -EACCES;
		if (!(old_vma->vm_flags & VM_WRITE) &&
		    (new_vma->vm_flags & VM_WRITE))
			goto out_recover;

		ret = -ENXIO;
		pmd = mm_find_pmd(mm, old_addr);
		if (!pmd)
			goto out_recover;
		ret = uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
					    &old_pte, true);
		if (ret)
			goto out_recover;
		ptes[i] = old_pte;
		if (pte_dirty(old_pte)  || PageDirty(page))
			pages_dirty = true;
		put_page(page);

		ret = vm_insert_anon_page(new_vma, new_addr, page);
		if (ret)
			goto out_recover;
		get_page(page);

		old_addr += PAGE_SIZE;
		new_addr += PAGE_SIZE;
		i++;
	}

	if (pages_dirty)
		new_addr_start = new_addr_start | USWAP_PAGES_DIRTY;
	kvfree(ptes);
	return new_addr_start;

out_recover:
	uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
	kvfree(ptes);
	return ret;
}


/*
 * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall
 * mremap.
 * Unmap the pages between 'addr ~ addr + old_len' and remap them to 'new_addr
 * ~ new_addr + new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY.
 */
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
			   unsigned long new_addr, unsigned long new_len)
{
	struct page **pages = NULL;
	struct mm_struct *mm = current->mm;
	unsigned long len = old_len;
	unsigned long ret = -EINVAL;
	unsigned long i;

	if (!static_branch_unlikely(&userswap_enabled))
		goto out;

	if (offset_in_page(old_addr))
		goto out;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	if (!new_len || old_len != new_len || offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len ||
	    old_addr > TASK_SIZE - old_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (old_addr + old_len > new_addr && new_addr + new_len > old_addr)
		goto out;

	lru_add_drain_all();
	mmap_write_lock(mm);
	ret = pages_can_be_swapped(mm, old_addr, len, &pages);
	if (ret)
		goto out_release_unlock;

	ret = do_user_swap(mm, old_addr, len, pages, new_addr);
	/* follow_page() above increased the reference */
	for (i = 0; i < len / PAGE_SIZE; i++)
		if (pages[i])
			put_page(pages[i]);

	kvfree(pages);

out_release_unlock:
	mmap_write_unlock(mm);
out:
	return ret;
}

static int __init enable_userswap_setup(char *str)
{
	static_branch_enable(&userswap_enabled);