perf/hw_breakpoint: Reduce contention with large number of tasks (0912037f) · Commits · EulixOS / Software / Kernel

kernel/events/hw_breakpoint.c

+133 −28

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@

		#include <linux/hw_breakpoint.h>

		#include <linux/atomic.h>
		#include <linux/bug.h>
		#include <linux/cpu.h>
		#include <linux/export.h>
		@@ -28,6 +29,7 @@
		#include <linux/kernel.h>
		#include <linux/mutex.h>
		#include <linux/notifier.h>
		#include <linux/percpu-rwsem.h>
		#include <linux/percpu.h>
		#include <linux/rhashtable.h>
		#include <linux/sched.h>
		@@ -41,9 +43,9 @@ struct bp_cpuinfo {
		unsigned int cpu_pinned;
		/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
		#ifdef hw_breakpoint_slots
		unsigned int tsk_pinned[hw_breakpoint_slots(0)];
		atomic_t tsk_pinned[hw_breakpoint_slots(0)];
		#else
		unsigned int *tsk_pinned;
		atomic_t *tsk_pinned;
		#endif
		};

		@@ -65,8 +67,79 @@ static const struct rhashtable_params task_bps_ht_params = {

		static bool constraints_initialized __ro_after_init;

		/* Serialize accesses to the above constraints */
		static DEFINE_MUTEX(nr_bp_mutex);
		/*
		* Synchronizes accesses to the per-CPU constraints; the locking rules are:
		*
		* 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
		* (due to bp_slots_histogram::count being atomic, no update are lost).
		*
		* 2. Holding a write-lock is required for computations that require a
		* stable snapshot of all bp_cpuinfo::tsk_pinned.
		*
		* 3. In all other cases, non-atomic accesses require the appropriately held
		* lock (read-lock for read-only accesses; write-lock for reads/writes).
		*/
		DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);

		/*
		* Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
		* rhltable synchronizes concurrent insertions/deletions, independent tasks may
		* insert/delete concurrently; therefore, a mutex per task is sufficient.
		*
		* Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
		* hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
		* that hw_breakpoint may contend with per-task perf event list management. The
		* assumption is that perf usecases involving hw_breakpoints are very unlikely
		* to result in unnecessary contention.
		*/
		static inline struct mutex get_task_bps_mutex(struct perf_event bp)
		{
		struct task_struct *tsk = bp->hw.target;

		return tsk ? &tsk->perf_event_mutex : NULL;
		}

		static struct mutex bp_constraints_lock(struct perf_event bp)
		{
		struct mutex *tsk_mtx = get_task_bps_mutex(bp);

		if (tsk_mtx) {
		mutex_lock(tsk_mtx);
		percpu_down_read(&bp_cpuinfo_sem);
		} else {
		percpu_down_write(&bp_cpuinfo_sem);
		}

		return tsk_mtx;
		}

		static void bp_constraints_unlock(struct mutex *tsk_mtx)
		{
		if (tsk_mtx) {
		percpu_up_read(&bp_cpuinfo_sem);
		mutex_unlock(tsk_mtx);
		} else {
		percpu_up_write(&bp_cpuinfo_sem);
		}
		}

		static bool bp_constraints_is_locked(struct perf_event *bp)
		{
		struct mutex *tsk_mtx = get_task_bps_mutex(bp);

		return percpu_is_write_locked(&bp_cpuinfo_sem) \|\|
		(tsk_mtx ? mutex_is_locked(tsk_mtx) :
		percpu_is_read_locked(&bp_cpuinfo_sem));
		}

		static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
		{
		struct mutex *tsk_mtx = get_task_bps_mutex(bp);

		if (tsk_mtx)
		lockdep_assert_held(tsk_mtx);
		lockdep_assert_held(&bp_cpuinfo_sem);
		}

		#ifdef hw_breakpoint_slots
		/*
		@@ -97,7 +170,7 @@ static __init int init_breakpoint_slots(void)
		for (i = 0; i < TYPE_MAX; i++) {
		struct bp_cpuinfo *info = get_bp_info(cpu, i);

		info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL);
		info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL);
		if (!info->tsk_pinned)
		goto err;
		}
		@@ -137,11 +210,19 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
		*/
		static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
		{
		unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
		atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
		int i;

		/*
		* At this point we want to have acquired the bp_cpuinfo_sem as a
		* writer to ensure that there are no concurrent writers in
		* toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
		*/
		lockdep_assert_held_write(&bp_cpuinfo_sem);

		for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
		if (tsk_pinned[i] > 0)
		ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */
		if (atomic_read(&tsk_pinned[i]) > 0)
		return i + 1;
		}

		@@ -158,6 +239,11 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
		struct perf_event *iter;
		int count = 0;

		/*
		* We need a stable snapshot of the per-task breakpoint list.
		*/
		assert_bp_constraints_lock_held(bp);

		rcu_read_lock();
		head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
		if (!head)
		@@ -214,16 +300,25 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
		static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
		enum bp_type_idx type, int weight)
		{
		unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
		atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
		int old_idx, new_idx;

		/*
		* If bp->hw.target, tsk_pinned is only modified, but not used
		* otherwise. We can permit concurrent updates as long as there are no
		* other uses: having acquired bp_cpuinfo_sem as a reader allows
		* concurrent updates here. Uses of tsk_pinned will require acquiring
		* bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
		*/
		lockdep_assert_held_read(&bp_cpuinfo_sem);

		old_idx = task_bp_pinned(cpu, bp, type) - 1;
		new_idx = old_idx + weight;

		if (old_idx >= 0)
		tsk_pinned[old_idx]--;
		atomic_dec(&tsk_pinned[old_idx]);
		if (new_idx >= 0)
		tsk_pinned[new_idx]++;
		atomic_inc(&tsk_pinned[new_idx]);
		}

		/*
		@@ -241,6 +336,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,

		/* Pinned counter cpu profiling */
		if (!bp->hw.target) {
		lockdep_assert_held_write(&bp_cpuinfo_sem);
		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
		return 0;
		}
		@@ -249,6 +345,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
		for_each_cpu(cpu, cpumask)
		toggle_bp_task_slot(bp, cpu, type, weight);

		/*
		* Readers want a stable snapshot of the per-task breakpoint list.
		*/
		assert_bp_constraints_lock_held(bp);

		if (enable)
		return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
		else
		@@ -354,14 +455,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)

		int reserve_bp_slot(struct perf_event *bp)
		{
		int ret;

		mutex_lock(&nr_bp_mutex);

		ret = __reserve_bp_slot(bp, bp->attr.bp_type);

		mutex_unlock(&nr_bp_mutex);
		struct mutex *mtx = bp_constraints_lock(bp);
		int ret = __reserve_bp_slot(bp, bp->attr.bp_type);

		bp_constraints_unlock(mtx);
		return ret;
		}

		@@ -379,12 +476,11 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)

		void release_bp_slot(struct perf_event *bp)
		{
		mutex_lock(&nr_bp_mutex);
		struct mutex *mtx = bp_constraints_lock(bp);

		arch_unregister_hw_breakpoint(bp);
		__release_bp_slot(bp, bp->attr.bp_type);

		mutex_unlock(&nr_bp_mutex);
		bp_constraints_unlock(mtx);
		}

		static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
		@@ -411,11 +507,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)

		static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
		{
		int ret;
		struct mutex *mtx = bp_constraints_lock(bp);
		int ret = __modify_bp_slot(bp, old_type, new_type);

		mutex_lock(&nr_bp_mutex);
		ret = __modify_bp_slot(bp, old_type, new_type);
		mutex_unlock(&nr_bp_mutex);
		bp_constraints_unlock(mtx);
		return ret;
		}

		@@ -426,18 +521,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
		*/
		int dbg_reserve_bp_slot(struct perf_event *bp)
		{
		if (mutex_is_locked(&nr_bp_mutex))
		int ret;

		if (bp_constraints_is_locked(bp))
		return -1;

		return __reserve_bp_slot(bp, bp->attr.bp_type);
		/* Locks aren't held; disable lockdep assert checking. */
		lockdep_off();
		ret = __reserve_bp_slot(bp, bp->attr.bp_type);
		lockdep_on();

		return ret;
		}

		int dbg_release_bp_slot(struct perf_event *bp)
		{
		if (mutex_is_locked(&nr_bp_mutex))
		if (bp_constraints_is_locked(bp))
		return -1;

		/* Locks aren't held; disable lockdep assert checking. */
		lockdep_off();
		__release_bp_slot(bp, bp->attr.bp_type);
		lockdep_on();

		return 0;
		}
		@@ -663,7 +768,7 @@ bool hw_breakpoint_is_used(void)
		return true;

		for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
		if (info->tsk_pinned[slot])
		if (atomic_read(&info->tsk_pinned[slot]))
		return true;
		}
		}