Commit eccd9064 authored by Brijesh Singh, committed by Borislav Petkov

x86/mm: Do not use set_{pud, pmd}_safe() when splitting a large page



The commit

  0a9fe8ca ("x86/mm: Validate kernel_physical_mapping_init() PTE population")

triggers this warning in SEV guests:

  WARNING: CPU: 0 PID: 0 at arch/x86/include/asm/pgalloc.h:87 phys_pmd_init+0x30d/0x386
  Call Trace:
   kernel_physical_mapping_init+0xce/0x259
   early_set_memory_enc_dec+0x10f/0x160
   kvm_smp_prepare_boot_cpu+0x71/0x9d
   start_kernel+0x1c9/0x50b
   secondary_startup_64+0xa4/0xb0

An SEV guest calls kernel_physical_mapping_init() to clear the encryption
mask from an existing mapping. While doing so, it also splits large
pages into smaller ones.
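
When the region being converted is smaller than the page backing its
mapping, the mapping must be rebuilt at a finer granularity first. A
condensed sketch of how early_set_memory_enc_dec() picks the split
target, paraphrased (not verbatim) from the mem_encrypt.c hunk below:

  /* Paraphrased sketch; see the mem_encrypt.c hunk for the real code. */
  psize = page_level_size(level);	/* size of the current mapping */
  pmask = page_level_mask(level);

  if (level == PG_LEVEL_2M)
  	split_page_size_mask = 0;		 /* rebuild with 4K entries */
  else
  	split_page_size_mask = 1 << PG_LEVEL_2M; /* 1G: allow 2M entries */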

To split a page, kernel_physical_mapping_init() allocates a new page
table page and updates the existing large-page entry to point to it.
The set_{pud,pmd}_safe() helpers trigger a warning whenever they update
an entry that is already present.
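
For reference, the _safe variants added by 0a9fe8ca wrap the plain
setters with a check for exactly this case. Paraphrased from
arch/x86/include/asm/pgalloc.h (the exact macro text may differ
slightly):

  #define set_pmd_safe(pmdp, pmd)					\
  ({									\
  	/* Overwriting a present, different entry is a bug at init. */	\
  	WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd));	\
  	set_pmd(pmdp, pmd);						\
  })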

Add a new kernel_physical_mapping_change() helper which uses the
non-safe variants of set_{pmd,pud,p4d}() and {pmd,pud,p4d}_populate()
routines when updating the entry.
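
The helper threads an init flag down through the page table walkers;
each DEFINE_ENTRY instantiation in the diff below expands to the
equivalent of (parameter names here chosen for readability):

  static inline void set_pmd_init(pmd_t *pmdp, pmd_t pmd, bool init)
  {
  	if (init)
  		set_pmd_safe(pmdp, pmd);  /* first-time population: warn on overwrite */
  	else
  		set_pmd(pmdp, pmd);       /* intentional update of a live entry */
  }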

Since kernel_physical_mapping_change() may replace an existing
entry with a new one, the caller is responsible for flushing
the TLB at the end. Change early_set_memory_enc_dec() to use
kernel_physical_mapping_change() when it wants to clear the memory
encryption mask from the page table entry.
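
A caller therefore follows the pattern below; a minimal sketch assuming
the standard __flush_tlb_all() helper, mirroring what
early_set_memory_enc_dec() does in the mem_encrypt.c hunk:

  /* Rewrite the already-present mapping without the encryption mask. */
  kernel_physical_mapping_change(__pa(vaddr & pmask),
  				 __pa((vaddr_end & pmask) + psize),
  				 split_page_size_mask);

  /* The helper does not flush stale translations; the caller must. */
  __flush_tlb_all();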

 [ bp:
   - massage commit message.
   - flesh out comment according to dhansen's request.
   - align function arguments at opening brace. ]

Fixes: 0a9fe8ca ("x86/mm: Validate kernel_physical_mapping_init() PTE population")
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Lendacky <Thomas.Lendacky@amd.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190417154102.22613-1-brijesh.singh@amd.com
parent 0e72499c
arch/x86/mm/init_64.c (+104 −40)
@@ -58,6 +58,37 @@

#include "ident_map.c"

+#define DEFINE_POPULATE(fname, type1, type2, init)		\
+static inline void fname##_init(struct mm_struct *mm,		\
+		type1##_t *arg1, type2##_t *arg2, bool init)	\
+{								\
+	if (init)						\
+		fname##_safe(mm, arg1, arg2);			\
+	else							\
+		fname(mm, arg1, arg2);				\
+}
+
+DEFINE_POPULATE(p4d_populate, p4d, pud, init)
+DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
+DEFINE_POPULATE(pud_populate, pud, pmd, init)
+DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
+
+#define DEFINE_ENTRY(type1, type2, init)			\
+static inline void set_##type1##_init(type1##_t *arg1,		\
+			type2##_t arg2, bool init)		\
+{								\
+	if (init)						\
+		set_##type1##_safe(arg1, arg2);			\
+	else							\
+		set_##type1(arg1, arg2);			\
+}
+
+DEFINE_ENTRY(p4d, p4d, init)
+DEFINE_ENTRY(pud, pud, init)
+DEFINE_ENTRY(pmd, pmd, init)
+DEFINE_ENTRY(pte, pte, init)
+
+
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
@@ -414,7 +445,7 @@ void __init cleanup_highmap(void)
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
-	      pgprot_t prot)
+	      pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
@@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
-				set_pte_safe(pte, __pte(0));
+				set_pte_init(pte, __pte(0), init);
			continue;
		}

@@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
-		set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+		set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

@@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
-	      unsigned long page_size_mask, pgprot_t prot)
+	      unsigned long page_size_mask, pgprot_t prot, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
@@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
-				set_pmd_safe(pmd, __pmd(0));
+				set_pmd_init(pmd, __pmd(0), init);
			continue;
		}

@@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
-							   paddr_end, prot);
+							   paddr_end, prot,
+							   init);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
@@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
-			set_pte_safe((pte_t *)pmd,
+			set_pte_init((pte_t *)pmd,
 				     pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
-					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+					     __pgprot(pgprot_val(prot) | _PAGE_PSE)),
+				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pte = alloc_low_page();
-		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
+		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);

		spin_lock(&init_mm.page_table_lock);
-		pmd_populate_kernel_safe(&init_mm, pmd, pte);
+		pmd_populate_kernel_init(&init_mm, pmd, pte, init);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
@@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, bool init)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
@@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
-				set_pud_safe(pud, __pud(0));
+				set_pud_init(pud, __pud(0), init);
			continue;
		}

@@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
-							   prot);
+							   prot, init);
				continue;
			}
			/*
@@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
-			set_pte_safe((pte_t *)pud,
+			set_pte_init((pte_t *)pud,
 				     pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
-					PAGE_KERNEL_LARGE));
+					     PAGE_KERNEL_LARGE),
+				     init);
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
@@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
-					   page_size_mask, prot);
+					   page_size_mask, prot, init);

		spin_lock(&init_mm.page_table_lock);
-		pud_populate_safe(&init_mm, pud, pmd);
+		pud_populate_init(&init_mm, pud, pmd, init);
		spin_unlock(&init_mm.page_table_lock);
	}

@@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, bool init)
{
	unsigned long paddr_next, paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = p4d_index(vaddr);

	if (!pgtable_l5_enabled())
-		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
+				     page_size_mask, init);

	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
		p4d_t *p4d;
@@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
-				set_p4d_safe(p4d, __p4d(0));
+				set_p4d_init(p4d, __p4d(0), init);
			continue;
		}

		if (!p4d_none(*p4d)) {
			pud = pud_offset(p4d, 0);
-			paddr_last = phys_pud_init(pud, paddr,
-					paddr_end,
-					page_size_mask);
+			paddr_last = phys_pud_init(pud, paddr, paddr_end,
+						   page_size_mask, init);
			continue;
		}

		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, paddr_end,
-					   page_size_mask);
+					   page_size_mask, init);

		spin_lock(&init_mm.page_table_lock);
-		p4d_populate_safe(&init_mm, p4d, pud);
+		p4d_populate_init(&init_mm, p4d, pud, init);
		spin_unlock(&init_mm.page_table_lock);
	}

	return paddr_last;
}

-/*
- * Create page table mapping for the physical memory for specific physical
- * addresses. The virtual and physical addresses have to be aligned on PMD level
- * down. It returns the last physical address mapped.
- */
-unsigned long __meminit
-kernel_physical_mapping_init(unsigned long paddr_start,
-			     unsigned long paddr_end,
-			     unsigned long page_size_mask)
+static unsigned long __meminit
+__kernel_physical_mapping_init(unsigned long paddr_start,
+			       unsigned long paddr_end,
+			       unsigned long page_size_mask,
+			       bool init)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start,
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
-						   page_size_mask);
+						   page_size_mask,
+						   init);
			continue;
		}

		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
-					   page_size_mask);
+					   page_size_mask, init);

		spin_lock(&init_mm.page_table_lock);
		if (pgtable_l5_enabled())
-			pgd_populate_safe(&init_mm, pgd, p4d);
+			pgd_populate_init(&init_mm, pgd, p4d, init);
 		else
-			p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+			p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
+					  (pud_t *) p4d, init);

		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}
@@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start,
	return paddr_last;
}


+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. Note that it can only be used to populate non-present entries.
+ * The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long paddr_start,
+			     unsigned long paddr_end,
+			     unsigned long page_size_mask)
+{
+	return __kernel_physical_mapping_init(paddr_start, paddr_end,
+					      page_size_mask, true);
+}
+
+/*
+ * This function is similar to kernel_physical_mapping_init() above with the
+ * exception that it uses set_{pud,pmd}() instead of the set_{pud,pmd}_safe()
+ * variants when updating the mapping. The caller is responsible for flushing
+ * the TLBs after the function returns.
+ */
+unsigned long __meminit
+kernel_physical_mapping_change(unsigned long paddr_start,
+			       unsigned long paddr_end,
+			       unsigned long page_size_mask)
+{
+	return __kernel_physical_mapping_init(paddr_start, paddr_end,
+					      page_size_mask, false);
+}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
arch/x86/mm/mem_encrypt.c (+7 −3)
@@ -301,7 +301,11 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr,
		else
			split_page_size_mask = 1 << PG_LEVEL_2M;

-		kernel_physical_mapping_init(__pa(vaddr & pmask),
-					     __pa((vaddr_end & pmask) + psize),
-					     split_page_size_mask);
+		/*
+		 * kernel_physical_mapping_change() does not flush the TLBs, so
+		 * a TLB flush is required after we exit from the for loop.
+		 */
+		kernel_physical_mapping_change(__pa(vaddr & pmask),
+					       __pa((vaddr_end & pmask) + psize),
+					       split_page_size_mask);
	}
arch/x86/mm/mm_internal.h (+3 −0)
@@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void);
unsigned long kernel_physical_mapping_init(unsigned long start,
					     unsigned long end,
					     unsigned long page_size_mask);
+unsigned long kernel_physical_mapping_change(unsigned long start,
+					     unsigned long end,
+					     unsigned long page_size_mask);
void zone_sizes_init(void);

extern int after_bootmem;