Commit 1f12096a authored by Michael Ellerman's avatar Michael Ellerman
Browse files

Merge the lockless page table walk rework into next

This merges the lockless page table walk rework series from Aneesh.
Because it touches powerpc KVM code we are sharing it with the kvm-ppc
tree in our topic/ppc-kvm branch.

This is the cover letter from Aneesh:

Avoid IPI while updating page table entries.

Problem Summary:
Slow termination of KVM guest with large guest RAM config due to a
large number of IPIs that were caused by clearing level 1 PTE
entries (THP) entries. This is shown in the stack trace below.

- qemu-system-ppc  [kernel.vmlinux]            [k] smp_call_function_many
   - smp_call_function_many
      - 36.09% smp_call_function_many
           serialize_against_pte_lookup
           radix__pmdp_huge_get_and_clear
           zap_huge_pmd
           unmap_page_range
           unmap_vmas
           unmap_region
           __do_munmap
           __vm_munmap
           sys_munmap
          system_call
           __munmap
           qemu_ram_munmap
           qemu_anon_ram_free
           reclaim_ramblock
           call_rcu_thread
           qemu_thread_start
           start_thread
           __clone

Why we need to do IPI when clearing PMD entries:
This was added as part of commit: 13bd817b ("powerpc/thp: Serialize pmd clear against a linux page table walk")

serialize_against_pte_lookup makes sure that all parallel lockless
page table walk completes before we convert a PMD pte entry to regular
pmd entry. We end up doing that conversion in the below scenarios

1) __split_huge_zero_page_pmd
2) do_huge_pmd_wp_page_fallback
3) MADV_DONTNEED running parallel to page faults.

local_irq_disable and lockless page table walk:

The lockless page table walk work with the assumption that we can
dereference the page table contents without holding a lock. For this
to work, we need to make sure we read the page table contents
atomically and page table pages are not going to be freed/released
while we are walking the table pages. We can achieve by using a rcu
based freeing for page table pages or if the architecture implements
broadcast tlbie, we can block the IPI as we walk the page table pages.

To support both the above framework, lockless page table walk is done
with irq disabled instead of rcu_read_lock()

We do have two interface for lockless page table walk, gup fast and
__find_linux_pte. This patch series makes __find_linux_pte table walk
safe against the conversion of PMD PTE to regular PMD.

gup fast:

gup fast is already safe against THP split because kernel now
differentiate between a pmd split and a compound page split. gup fast
can run parallel to a pmd split and we prevent a parallel gup fast to
a hugepage split, by freezing the page refcount and failing the
speculative page ref increment.

Similar to how gup is safe against parallel pmd split, this patch
series updates the __find_linux_pte callers to be safe against a
parallel pmd split. We do that by enforcing the following rules.

1) Don't reload the pte value, because that can be updated in
   parallel.
2) Code should be able to work with a stale PTE value and not the
   recent one. ie, the pte value that we are looking at may not be the
   latest value in the page table.
3) Before looking at pte value check for _PAGE_PTE bit. We now do this
as part of pte_present() check.

Performance:

This speeds up Qemu guest RAM del/unplug time as below
128 core, 496GB guest:

Without patch:
  munmap start: timer = 13162 ms, PID=7684
  munmap finish: timer = 95312 ms, PID=7684 - delta = 82150 ms

With patch (upto removing IPI)
  munmap start: timer = 196449 ms, PID=6681
  munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms

With patch (with adding the tlb invalidate in pmdp_huge_get_and_clear_full)
  munmap start: timer = 196345 ms, PID=6879
  munmap finish: timer = 196714 ms, PID=6879 - delta = 369ms

Link: https://lore.kernel.org/r/20200505071729.54912-1-aneesh.kumar@linux.ibm.com
parents 140777a3 75358ea3
Loading
Loading
Loading
Loading
+15 −5
Original line number Diff line number Diff line
@@ -553,6 +553,12 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
}
#endif /* CONFIG_NUMA_BALANCING */

static inline bool pte_hw_valid(pte_t pte)
{
	return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE)) ==
		cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
}

static inline int pte_present(pte_t pte)
{
	/*
@@ -561,12 +567,11 @@ static inline int pte_present(pte_t pte)
	 * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
	 * if we find _PAGE_PRESENT cleared.
	 */
	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
}

static inline bool pte_hw_valid(pte_t pte)
{
	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
	if (pte_hw_valid(pte))
		return true;
	return (pte_raw(pte) & cpu_to_be64(_PAGE_INVALID | _PAGE_PTE)) ==
		cpu_to_be64(_PAGE_INVALID | _PAGE_PTE);
}

#ifdef CONFIG_PPC_MEM_KEYS
@@ -1260,6 +1265,11 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
}
#define pmdp_collapse_flush pmdp_collapse_flush

#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmdp, int full);

#define __HAVE_ARCH_PGTABLE_DEPOSIT
static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
					      pmd_t *pmdp, pgtable_t pgtable)
+1 −2
Original line number Diff line number Diff line
@@ -113,8 +113,7 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start,
struct mmu_gather;
extern void hash__tlb_flush(struct mmu_gather *tlb);
/* Private function for use by PCI IO mapping code */
extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
				     unsigned long end);
extern void __flush_hash_table_range(unsigned long start, unsigned long end);
extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr);
#endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */
+1 −1
Original line number Diff line number Diff line
@@ -198,7 +198,7 @@ extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
			unsigned int shift,
			const struct kvm_memory_slot *memslot,
			unsigned int lpid);
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested,
				    bool writing, unsigned long gpa,
				    unsigned int lpid);
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+33 −1
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <asm/book3s/64/mmu-hash.h>
#include <asm/cpu_has_feature.h>
#include <asm/ppc-opcode.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_PPC_PSERIES
static inline bool kvmhv_on_pseries(void)
@@ -434,7 +435,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
			continue;
		}
		/* If pte is not present return None */
		if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
		if (unlikely(!pte_present(old_pte)))
			return __pte(0);

		new_pte = pte_mkyoung(old_pte);
@@ -634,6 +635,37 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
				unsigned long gpa, unsigned long hpa,
				unsigned long nbytes);

static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea,
					    unsigned *hshift)
{
	pte_t *pte;

	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
		"%s called with kvm mmu_lock not held \n", __func__);
	pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift);

	return pte;
}

static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq,
				       unsigned long ea, unsigned *hshift)
{
	pte_t *pte;

	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
		"%s called with kvm mmu_lock not held \n", __func__);

	if (mmu_notifier_retry(kvm, mmu_seq))
		return NULL;

	pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift);

	return pte;
}

extern pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
					unsigned long ea, unsigned *hshift);

#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

#endif /* __ASM_KVM_BOOK3S_64_H__ */
+0 −9
Original line number Diff line number Diff line
@@ -291,15 +291,6 @@ static inline bool early_radix_enabled(void)
}
#endif

#ifdef CONFIG_PPC_MEM_KEYS
extern u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address);
#else
static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
{
	return 0;
}
#endif /* CONFIG_PPC_MEM_KEYS */

#ifdef CONFIG_STRICT_KERNEL_RWX
static inline bool strict_kernel_rwx_enabled(void)
{
Loading