Commit f841b682 authored by Chengming Zhou, committed by Peter Zijlstra
Browse files

perf/core: Fix cgroup events tracking



We encounter perf warnings when using cgroup events like:

  cd /sys/fs/cgroup
  mkdir test
  perf stat -e cycles -a -G test

Which then triggers:

  WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0
  Call Trace:
   <TASK>
   __schedule+0x4ae/0x9f0
   ? _raw_spin_unlock_irqrestore+0x23/0x40
   ? __cond_resched+0x18/0x20
   preempt_schedule_common+0x2d/0x70
   __cond_resched+0x18/0x20
   wait_for_completion+0x2f/0x160
   ? cpu_stop_queue_work+0x9e/0x130
   affine_move_task+0x18a/0x4f0

  WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0
  Call Trace:
   <TASK>
   ? ctx_sched_out+0xb7/0x1b0
   perf_cgroup_switch+0x88/0xc0
   __schedule+0x4ae/0x9f0
   ? _raw_spin_unlock_irqrestore+0x23/0x40
   ? __cond_resched+0x18/0x20
   preempt_schedule_common+0x2d/0x70
   __cond_resched+0x18/0x20
   wait_for_completion+0x2f/0x160
   ? cpu_stop_queue_work+0x9e/0x130
   affine_move_task+0x18a/0x4f0

The two warnings above are abbreviated; I removed other unimportant
information. The problem is caused by the perf cgroup events
tracking:

  CPU0					CPU1
  perf_event_open()
    perf_event_alloc()
      account_event()
	account_event_cpu()
	  atomic_inc(perf_cgroup_events)
					  __perf_event_task_sched_out()
					    if (atomic_read(perf_cgroup_events))
					      perf_cgroup_switch()
						// kernel/events/core.c:849
						WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0)
						if (READ_ONCE(cpuctx->cgrp) == cgrp) // false
						  return
						perf_ctx_lock()
						ctx_sched_out()
						cpuctx->cgrp = cgrp
						ctx_sched_in()
						  perf_cgroup_set_timestamp()
						    // kernel/events/core.c:829
						    WARN_ON_ONCE(!ctx->nr_cgroups)
						perf_ctx_unlock()
    perf_install_in_context()
      cpu_function_call()
					  __perf_install_in_context()
					    add_event_to_ctx()
					      list_add_event()
						perf_cgroup_event_enable()
						  ctx->nr_cgroups++
						  cpuctx->cgrp = X

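We can see from the above that we wrongly use the percpu atomic
perf_cgroup_events to decide whether to call perf_cgroup_switch(), which
should only be called when we know this CPU has cgroup events enabled.
The counter is incremented in account_event(), before the event is
installed in the context, so it can be non-zero while ctx->nr_cgroups
is still 0.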

Commit bd275681 ("perf: Rewrite core context handling") changed the
code to have only one context per CPU, so we can just use cpuctx->cgrp
to check whether this CPU has cgroup events enabled.

So the percpu atomic perf_cgroup_events is no longer needed.

Fixes: bd275681 ("perf: Rewrite core context handling")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
Link: https://lkml.kernel.org/r/20221207124023.66252-1-zhouchengming@bytedance.com
parent e2d37148
Loading
Loading
Loading
Loading
+10 −32
Original line number Original line Diff line number Diff line
@@ -380,7 +380,6 @@ enum event_type_t {


/*
/*
 * perf_sched_events : >0 events exist
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
 */


static void perf_sched_delayed(struct work_struct *work);
static void perf_sched_delayed(struct work_struct *work);
@@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static atomic_t perf_sched_count;


static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);


static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
@@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task)
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_cgroup *cgrp;
	struct perf_cgroup *cgrp;


	cgrp = perf_cgroup_from_task(task, NULL);
	/*
	 * cpuctx->cgrp is set when the first cgroup event enabled,
	 * and is cleared when the last cgroup event disabled.
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;


	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

	cgrp = perf_cgroup_from_task(task, NULL);
	if (READ_ONCE(cpuctx->cgrp) == cgrp)
	if (READ_ONCE(cpuctx->cgrp) == cgrp)
		return;
		return;


@@ -3631,7 +3636,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
	 * to check if we have to switch out PMU state.
	 * to check if we have to switch out PMU state.
	 * cgroup event are system-wide mode only
	 * cgroup event are system-wide mode only
	 */
	 */
	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
	perf_cgroup_switch(next);
	perf_cgroup_switch(next);
}
}


@@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
		detach_sb_event(event);
		detach_sb_event(event);
}
}


static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
	if (event->parent)
		return;

	if (is_cgroup_event(event))
		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}

#ifdef CONFIG_NO_HZ_FULL
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
#endif
@@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event)
			schedule_delayed_work(&perf_sched_work, HZ);
			schedule_delayed_work(&perf_sched_work, HZ);
	}
	}


	unaccount_event_cpu(event, event->cpu);

	unaccount_pmu_sb_event(event);
	unaccount_pmu_sb_event(event);
}
}


@@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event)
		attach_sb_event(event);
		attach_sb_event(event);
}
}


static void account_event_cpu(struct perf_event *event, int cpu)
{
	if (event->parent)
		return;

	if (is_cgroup_event(event))
		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}

/* Freq events need the tick to stay alive (see perf_event_task_tick). */
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
static void account_freq_event_nohz(void)
{
{
@@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event)
	}
	}
enabled:
enabled:


	account_event_cpu(event, event->cpu);

	account_pmu_sb_event(event);
	account_pmu_sb_event(event);
}
}


@@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx,


	perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
	perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
		perf_remove_from_context(event, 0);
		perf_remove_from_context(event, 0);
		unaccount_event_cpu(event, cpu);
		put_pmu_ctx(event->pmu_ctx);
		put_pmu_ctx(event->pmu_ctx);
		list_add(&event->migrate_entry, events);
		list_add(&event->migrate_entry, events);


		for_each_sibling_event(sibling, event) {
		for_each_sibling_event(sibling, event) {
			perf_remove_from_context(sibling, 0);
			perf_remove_from_context(sibling, 0);
			unaccount_event_cpu(sibling, cpu);
			put_pmu_ctx(sibling->pmu_ctx);
			put_pmu_ctx(sibling->pmu_ctx);
			list_add(&sibling->migrate_entry, events);
			list_add(&sibling->migrate_entry, events);
		}
		}
@@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu,


	if (event->state >= PERF_EVENT_STATE_OFF)
	if (event->state >= PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->state = PERF_EVENT_STATE_INACTIVE;
	account_event_cpu(event, cpu);
	perf_install_in_context(ctx, event, cpu);
	perf_install_in_context(ctx, event, cpu);
}
}


@@ -13742,7 +13721,6 @@ static int __perf_cgroup_move(void *info)
	struct task_struct *task = info;
	struct task_struct *task = info;


	preempt_disable();
	preempt_disable();
	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
	perf_cgroup_switch(task);
	perf_cgroup_switch(task);
	preempt_enable();
	preempt_enable();