powerpc/64s/hash: add stress_hpt kernel boot option to increase hash faults (6b34a099) · Commits · EulixOS / Software / Kernel

Documentation/admin-guide/kernel-parameters.txt

+5 −0

Original line number	Diff line number	Diff line
		@@ -1042,6 +1042,11 @@
		them frequently to increase the rate of SLB faults
		on kernel addresses.

		stress_hpt [PPC]
		Limits the number of kernel HPT entries in the hash
		page table to increase the rate of hash page table
		faults on kernel addresses.

		disable= [IPV6]
		See Documentation/networking/ipv6.rst.

arch/powerpc/mm/book3s64/hash_4k.c

+5 −0

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@
		#include <asm/machdep.h>
		#include <asm/mmu.h>

		#include "internal.h"

		int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
		pte_t *ptep, unsigned long trap, unsigned long flags,
		int ssize, int subpg_prot)
		@@ -118,6 +120,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
		}
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);

		if (stress_hpt())
		hpt_do_stress(ea, hpte_group);
		}
		*ptep = __pte(new_pte & ~H_PAGE_BUSY);
		return 0;

arch/powerpc/mm/book3s64/hash_64k.c

+10 −0

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@
		#include <asm/machdep.h>
		#include <asm/mmu.h>

		#include "internal.h"

		/*
		* Return true, if the entry has a slot value which
		* the software considers as invalid.
		@@ -216,6 +218,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
		new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
		new_pte |= H_PAGE_HASHPTE;

		if (stress_hpt())
		hpt_do_stress(ea, hpte_group);

		*ptep = __pte(new_pte & ~H_PAGE_BUSY);
		return 0;
		}
		@@ -327,7 +332,12 @@ int __hash_page_64K(unsigned long ea, unsigned long access,

		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);

		if (stress_hpt())
		hpt_do_stress(ea, hpte_group);
		}

		*ptep = __pte(new_pte & ~H_PAGE_BUSY);

		return 0;
		}

arch/powerpc/mm/book3s64/hash_utils.c

+129 −1

Original line number	Diff line number	Diff line
		@@ -471,7 +471,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
		return ret;
		}

		static bool disable_1tb_segments = false;
		static bool disable_1tb_segments __ro_after_init;

		static int __init parse_disable_1tb_segments(char *p)
		{
		@@ -480,6 +480,40 @@ static int __init parse_disable_1tb_segments(char *p)
		}
		early_param("disable_1tb_segments", parse_disable_1tb_segments);

		/*
		 * Set by the "stress_hpt" early parameter and checked in
		 * htab_initialize(). Marked __initdata, so it is only meaningful
		 * during init.
		 */
		bool stress_hpt_enabled __initdata;

		/*
		 * Early-param handler for "stress_hpt": records that the option was
		 * given. The argument string @p is not examined. Always returns 0.
		 */
		static int __init parse_stress_hpt(char *p)
		{
		stress_hpt_enabled = true;
		return 0;
		}
		early_param("stress_hpt", parse_stress_hpt);

		/*
		 * Static key tested by stress_hpt(); switched on in htab_initialize()
		 * when stress_hpt_enabled is set.
		 */
		__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);

		/*
		 * per-CPU array allocated if we enable stress_hpt.
		 *
		 * Each CPU's entry remembers up to STRESS_MAX_GROUPS HPTE group values
		 * recorded by hpt_do_stress(); how many slots are actually used is
		 * given by stress_nr_groups(). A slot holding -1UL is treated as empty.
		 */
		#define STRESS_MAX_GROUPS 16
		struct stress_hpt_struct {
		unsigned long last_group[STRESS_MAX_GROUPS];
		};

		/*
		 * Number of last_group[] slots used per CPU by the stress_hpt code:
		 * STRESS_MAX_GROUPS when running under an LPAR, otherwise 1.
		 */
		static inline int stress_nr_groups(void)
		{
			/*
			 * Under an LPAR, H_REMOVE also flushes the TLB, so more than one
			 * remembered group is needed to allow practical forward progress.
			 * Bare metal uses a single group, which seems to help uncover
			 * more bugs.
			 */
			return firmware_has_feature(FW_FEATURE_LPAR) ? STRESS_MAX_GROUPS : 1;
		}

		/*
		 * Array of per-CPU stress state, indexed by CPU number. Assigned in
		 * htab_initialize() when stress_hpt is enabled.
		 */
		static struct stress_hpt_struct *stress_hpt_struct;

		static int __init htab_dt_scan_seg_sizes(unsigned long node,
		const char *uname, int depth,
		void *data)
		@@ -976,6 +1010,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
		pr_info("Partition table %p\n", partition_tb);
		}

		void hpt_clear_stress(void);
		static struct timer_list stress_hpt_timer;

		/*
		 * Periodic stress_hpt timer callback.
		 *
		 * Removes the HPTE groups remembered for the CPU the timer fired on via
		 * hpt_clear_stress() and, when not running under an LPAR, also calls
		 * tlbiel_all(). It then re-arms the global stress_hpt_timer 10ms out on
		 * the next online CPU, so the callback rotates around the online CPUs.
		 *
		 * @timer is not used directly; the function operates on the file-scope
		 * stress_hpt_timer.
		 *
		 * The function is only referenced from this file (through
		 * timer_setup()), so it has internal linkage; this also avoids a
		 * missing-prototype warning.
		 */
		static void stress_hpt_timer_fn(struct timer_list *timer)
		{
			int next_cpu;

			hpt_clear_stress();
			if (!firmware_has_feature(FW_FEATURE_LPAR))
				tlbiel_all();

			/* Move to the next online CPU, wrapping back to the first one. */
			next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
			if (next_cpu >= nr_cpu_ids)
				next_cpu = cpumask_first(cpu_online_mask);

			stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
			add_timer_on(&stress_hpt_timer, next_cpu);
		}

		static void __init htab_initialize(void)
		{
		unsigned long table;
		@@ -995,6 +1046,20 @@ static void __init htab_initialize(void)
		if (stress_slb_enabled)
		static_branch_enable(&stress_slb_key);

		if (stress_hpt_enabled) {
		unsigned long tmp;
		static_branch_enable(&stress_hpt_key);
		// Too early to use nr_cpu_ids, so use NR_CPUS
		tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
		0, 0, MEMBLOCK_ALLOC_ANYWHERE);
		memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
		stress_hpt_struct = __va(tmp);

		timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
		stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
		add_timer(&stress_hpt_timer);
		}

		/*
		* Calculate the required size of the htab. We want the number of
		* PTEGs to equal one half the number of real pages.
		@@ -1980,6 +2045,69 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
		return slot;
		}

		void hpt_clear_stress(void)
		{
		int cpu = raw_smp_processor_id();
		int g;

		for (g = 0; g < stress_nr_groups(); g++) {
		unsigned long last_group;
		last_group = stress_hpt_struct[cpu].last_group[g];

		if (last_group != -1UL) {
		int i;
		for (i = 0; i < HPTES_PER_GROUP; i++) {
		if (mmu_hash_ops.hpte_remove(last_group) == -1)
		break;
		}
		stress_hpt_struct[cpu].last_group[g] = -1;
		}
		}
		}

		void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
		{
		unsigned long last_group;
		int cpu = raw_smp_processor_id();

		last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
		if (hpte_group == last_group)
		return;

		if (last_group != -1UL) {
		int i;
		/*
		* Concurrent CPUs might be inserting into this group, so
		* give up after a number of iterations, to prevent a live
		* lock.
		*/
		for (i = 0; i < HPTES_PER_GROUP; i++) {
		if (mmu_hash_ops.hpte_remove(last_group) == -1)
		break;
		}
		stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
		}

		if (ea >= PAGE_OFFSET) {
		/*
		* We would really like to prefetch to get the TLB loaded, then
		* remove the PTE before returning from fault interrupt, to
		* increase the hash fault rate.
		*
		* Unfortunately QEMU TCG does not model the TLB in a way that
		* makes this possible, and systemsim (mambo) emulator does not
		* bring in TLBs with prefetches (although loads/stores do
		* work for non-CI PTEs).
		*
		* So remember this PTE and clear it on the next hash fault.
		*/
		memmove(&stress_hpt_struct[cpu].last_group[1],
		&stress_hpt_struct[cpu].last_group[0],
		(stress_nr_groups() - 1) * sizeof(unsigned long));
		stress_hpt_struct[cpu].last_group[0] = hpte_group;
		}
		}

		#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
		static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);

arch/powerpc/mm/book3s64/internal.h

+11 −0

Original line number	Diff line number	Diff line
		@@ -13,6 +13,17 @@ static inline bool stress_slb(void)
		return static_branch_unlikely(&stress_slb_key);
		}

		/* Set when the "stress_hpt" early parameter was given (defined in hash_utils.c). */
		extern bool stress_hpt_enabled;

		/* Static key behind stress_hpt(); false by default. */
		DECLARE_STATIC_KEY_FALSE(stress_hpt_key);

		/*
		 * Returns true when HPT stress testing is active, i.e. when
		 * stress_hpt_key has been enabled. The branch is hinted as unlikely.
		 */
		static inline bool stress_hpt(void)
		{
		return static_branch_unlikely(&stress_hpt_key);
		}

		/*
		 * Stress hook called by the hash-page insertion paths when stress_hpt()
		 * is true; @ea is the faulting effective address and @hpte_group the
		 * HPTE group the entry was inserted into.
		 */
		void hpt_do_stress(unsigned long ea, unsigned long hpte_group);

		void slb_setup_new_exec(void);

		void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);