Commit 5d4b821d authored by Mike Kravetz, committed by Ze Zuo
Browse files

hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing

stable inclusion
from stable-v6.0.13
commit bb8f66f6afbbc822f61d8bfc01ecefe2a437256c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GVYW
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bb8f66f6afbbc822f61d8bfc01ecefe2a437256c

--------------------------------

commit 04ada095 upstream.

madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range.  For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final.  However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.

This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap.  Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing.  Subsequent faults on this vma were confused as
VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
not set in new pages added to the page table.  This resulted in pages that
appeared anonymous in a VM_SHARED vma and triggered the BUG.

Address the issue by adding a new zap flag ZAP_FLAG_UNMAP to indicate an
unmap call from unmap_vmas().  This is used to indicate the 'final' unmapping
of a hugetlb vma.  When called via MADV_DONTNEED, this flag is not set and
the vma_lock is not deleted.

NOTE - Prior to the introduction of the hugetlb vma_lock in v6.1, this
       issue is addressed by not clearing the VM_MAYSHARE flag when
       __unmap_hugepage_range_final is called in the MADV_DONTNEED case.

[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/

Link: https://lkml.kernel.org/r/20221114235507.294320-3-mike.kravetz@oracle.com


Fixes: 90e7e7f5 ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Wei Chen <harperchen1110@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Conflicts:
	include/linux/mm.h
	mm/memory.c

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
parent 1ea043fd
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -142,7 +142,7 @@ void unmap_hugepage_range(struct vm_area_struct *,
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			  struct vm_area_struct *vma,
			  unsigned long start, unsigned long end,
			  struct page *ref_page);
			  struct page *ref_page, zap_flags_t zap_flags);
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
				unsigned long start, unsigned long end,
				struct page *ref_page);
@@ -371,7 +371,8 @@ static inline unsigned long hugetlb_change_protection(

static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			struct vm_area_struct *vma, unsigned long start,
			unsigned long end, struct page *ref_page)
			unsigned long end, struct page *ref_page,
			zap_flags_t zap_flags)
{
	BUG();
}
+4 −0
Original line number Diff line number Diff line
@@ -1706,8 +1706,12 @@ struct zap_details {
	pgoff_t	first_index;			/* Lowest page->index to unmap */
	pgoff_t last_index;			/* Highest page->index to unmap */
	struct page *single_page;		/* Locked page to be unmapped */
	zap_flags_t zap_flags;			/* Extra flags for zapping */
};

/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define  ZAP_FLAG_UNMAP		((__force zap_flags_t) BIT(1))

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+2 −0
Original line number Diff line number Diff line
@@ -898,4 +898,6 @@ typedef struct {
	unsigned long val;
} swp_entry_t;

typedef unsigned int __bitwise zap_flags_t;

#endif /* _LINUX_MM_TYPES_H */
+16 −12
Original line number Diff line number Diff line
@@ -4512,22 +4512,26 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,

void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			  struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, struct page *ref_page)
			  unsigned long end, struct page *ref_page,
			  zap_flags_t zap_flags)
{
	__unmap_hugepage_range(tlb, vma, start, end, ref_page);

	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
		/*
	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
	 * test will fail on a vma being torn down, and not grab a page table
	 * on its way out.  We're lucky that the flag has such an appropriate
	 * name, and can in fact be safely cleared here. We could clear it
	 * before the __unmap_hugepage_range above, but all that's necessary
		 * Clear this flag so that x86's huge_pmd_share
		 * page_table_shareable test will fail on a vma being torn
		 * down, and not grab a page table on its way out.  We're lucky
		 * that the flag has such an appropriate name, and can in fact
		 * be safely cleared here. We could clear it before the
		 * __unmap_hugepage_range above, but all that's necessary
		 * is to clear it before releasing the i_mmap_rwsem. This works
	 * because in the context this is called, the VMA is about to be
	 * destroyed and the i_mmap_rwsem is held.
		 * because in the context this is called, the VMA is about to
		 * be destroyed and the i_mmap_rwsem is held.
		 */
		vma->vm_flags &= ~VM_MAYSHARE;
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, struct page *ref_page)
+7 −2
Original line number Diff line number Diff line
@@ -1518,8 +1518,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				zap_flags_t zap_flags = details ?
					details->zap_flags : 0;
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
@@ -1550,12 +1552,15 @@ void unmap_vmas(struct mmu_gather *tlb,
		unsigned long end_addr)
{
	struct mmu_notifier_range range;
	struct zap_details details = {
		.zap_flags = ZAP_FLAG_UNMAP,
	};

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
		unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
	mmu_notifier_invalidate_range_end(&range);
}