Commit 8cc5fcbb authored by Mina Almasry's avatar Mina Almasry Committed by Linus Torvalds
Browse files

mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY

On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller after
allocating a page that consumed a reservation.  In this case there may be
a fault that double consumes the reservation.  To handle this, we free the
allocated page, fix the reservations, and allocate a temporary hugetlb
page and return that to the caller.  When the caller does the copy outside
of the lock, we again check the cache, and allocate a page consuming the
reservation, and copy over the contents.

Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
        2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings. After the
test runs number of free/resv hugepages is correct.

[yuehaibing@huawei.com: remove set but not used variable 'vm_alloc_shared']
  Link: https://lkml.kernel.org/r/20210601141610.28332-1-yuehaibing@huawei.com
[almasrymina@google.com: fix allocation error check and copy func name]
  Link: https://lkml.kernel.org/r/20210605010626.1459873-1-almasrymina@google.com

Link: https://lkml.kernel.org/r/20210528005029.88088-1-almasrymina@google.com


Signed-off-by: default avatarMina Almasry <almasrymina@google.com>
Signed-off-by: default avatarYueHaibing <yuehaibing@huawei.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 22f3c951
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
				  struct page *newpage, struct page *page);
extern int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page, int extra_count);
extern void copy_huge_page(struct page *dst, struct page *src);
#else

static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
	return -ENOSYS;
}

static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_COMPACTION
+39 −9
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
			    struct page **pagep)
{
	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
	struct address_space *mapping;
	pgoff_t idx;
	struct hstate *h = hstate_vma(dst_vma);
	struct address_space *mapping = dst_vma->vm_file->f_mapping;
	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
	unsigned long size;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	struct hstate *h = hstate_vma(dst_vma);
	pte_t _dst_pte;
	spinlock_t *ptl;
	int ret;
	int ret = -ENOMEM;
	struct page *page;
	int writable;

	mapping = dst_vma->vm_file->f_mapping;
	idx = vma_hugecache_offset(h, dst_vma, dst_addr);

	if (is_continue) {
		ret = -EFAULT;
		page = find_lock_page(mapping, idx);
@@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			/* Free the allocated page which may have
			 * consumed a reservation.
			 */
			restore_reserve_on_error(h, dst_vma, dst_addr, page);
			put_page(page);

			/* Allocate a temporary page to hold the copied
			 * contents.
			 */
			page = alloc_huge_page_vma(h, dst_vma, dst_addr);
			if (!page) {
				ret = -ENOMEM;
				goto out;
			}
			*pagep = page;
			/* don't free the page */
			/* Set the outparam pagep and return to the caller to
			 * copy the contents outside the lock. Don't free the
			 * page.
			 */
			goto out;
		}
	} else {
		page = *pagep;
		if (vm_shared &&
		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
			put_page(*pagep);
			ret = -EEXIST;
			*pagep = NULL;
			goto out;
		}

		page = alloc_huge_page(dst_vma, dst_addr, 0);
		if (IS_ERR(page)) {
			ret = -ENOMEM;
			*pagep = NULL;
			goto out;
		}
		copy_huge_page(page, *pagep);
		put_page(*pagep);
		*pagep = NULL;
	}

+1 −1
Original line number Diff line number Diff line
@@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
	}
}

static void copy_huge_page(struct page *dst, struct page *src)
void copy_huge_page(struct page *dst, struct page *src)
{
	int i;
	int nr_pages;
+1 −49
Original line number Diff line number Diff line
@@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      unsigned long len,
					      enum mcopy_atomic_mode mode)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
@@ -308,7 +307,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		i_mmap_unlock_read(mapping);
		vm_alloc_shared = vm_shared;

		cond_resched();

@@ -346,54 +344,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_lock.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the HPageRestoreReserve flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear
		 * HPageRestoreReserve as no adjustments to reservation counts
		 * were made during allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case HPageRestoreReserve will be
		 * set to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and HPageRestoreReserve will not be
		 * set. When the page is freed, the global reserve count will
		 * NOT be incremented and it will appear as though we have
		 * leaked reserved page.  In this case, set HPageRestoreReserve
		 * so that the global reserve count will be incremented to
		 * match the reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetHPageRestoreReserve(page);
		else
			ClearHPageRestoreReserve(page);
	if (page)
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);