Commit ca6c2132 authored by Peter Zijlstra

perf: Fix missing SIGTRAPs



Marco reported:

Due to the implementation of how SIGTRAPs are delivered if
perf_event_attr::sigtrap is set, we've noticed 3 issues:

  1. Missing SIGTRAP due to a race with event_sched_out() (more
     details below).

  2. Hardware PMU events being disabled due to returning 1 from
     perf_event_overflow(). The only way to re-enable the event is
     for user space to first "properly" disable the event and then
     re-enable it.

  3. The inability to automatically disable an event after a
     specified number of overflows via PERF_EVENT_IOC_REFRESH.

The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed
as follows:

		CPU0			|	CPU1
	--------------------------------+---------------------------
	__perf_event_overflow()		|
	 perf_event_disable_inatomic()	|
	  pending_disable = CPU0	| ...
					| _perf_event_enable()
					|  event_function_call()
					|   task_function_call()
					|    /* sends IPI to CPU0 */
	<IPI>				| ...
	 __perf_event_enable()		+---------------------------
	  ctx_resched()
	   task_ctx_sched_out()
	    ctx_sched_out()
	     group_sched_out()
	      event_sched_out()
	       pending_disable = -1
	</IPI>
	<IRQ-work>
	 perf_pending_event()
	  perf_pending_event_disable()
	   /* Fails to send SIGTRAP because no pending_disable! */
	</IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but also
all future SIGTRAPs because 'event_limit' is not reset back to 1.

To fix, rework the pending delivery of SIGTRAP via IRQ-work by introducing
a separate 'pending_sigtrap', no longer using 'event_limit' and
'pending_disable' for its delivery.

Additionally, and different from Marco's proposed patch:

 - recognise that pending_disable effectively duplicates oncpu for
   the case where it is set. As such, change the irq_work handler to
   use ->oncpu to target the event and use pending_* as boolean toggles.

 - observe that SIGTRAP targets the ctx->task, so the context switch
   optimization that carries contexts between tasks is invalid. If
   the irq_work were delayed enough to hit after a context switch the
   SIGTRAP would be delivered to the wrong task.

 - observe that if the event gets scheduled out
   (rotation/migration/context-switch/...) the irq-work would be
   insufficient to deliver the SIGTRAP when the event gets scheduled
   back in (the irq-work might still be pending on the old CPU).

   Therefore have event_sched_out() convert the pending sigtrap into a
   task_work which will deliver the signal at return_to_user.

Fixes: 97ba62b2 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Debugged-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: Marco Elver <elver@google.com>
Debugged-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Marco Elver <elver@google.com>
Tested-by: Marco Elver <elver@google.com>
parent 9abf2313
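
For context (not part of this commit): a minimal user-space sketch of the delivery path the patch is about. It requests SIGTRAP on overflow of a sampling event via perf_event_attr::sigtrap; it assumes v5.13+ UAPI headers, sets remove_on_exec because sigtrap requires it, and adds a fallback TRAP_PERF define for older libc headers.

	#define _GNU_SOURCE
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <signal.h>
	#include <unistd.h>

	#ifndef TRAP_PERF
	#define TRAP_PERF 6	/* si_code for perf-induced SIGTRAP */
	#endif

	static void on_sigtrap(int sig, siginfo_t *info, void *uctx)
	{
		/* info->si_perf_data carries attr.sig_data; si_addr the sample address. */
		if (info->si_code == TRAP_PERF)
			write(2, "perf SIGTRAP\n", 13);
	}

	int main(void)
	{
		struct sigaction sa = { .sa_sigaction = on_sigtrap, .sa_flags = SA_SIGINFO };
		struct perf_event_attr attr = {
			.type		= PERF_TYPE_HARDWARE,
			.size		= sizeof(attr),
			.config		= PERF_COUNT_HW_CPU_CYCLES,
			.sample_period	= 1000000,
			.exclude_kernel	= 1,
			.remove_on_exec	= 1,		/* required by sigtrap */
			.sigtrap	= 1,		/* SIGTRAP to this task on overflow */
			.sig_data	= 0x1234,	/* surfaces as si_perf_data */
		};
		int fd;

		sigaction(SIGTRAP, &sa, NULL);
		fd = syscall(__NR_perf_event_open, &attr, /*pid=*/0, /*cpu=*/-1,
			     /*group_fd=*/-1, /*flags=*/0);
		if (fd < 0)
			return 1;

		for (volatile unsigned long i = 0; i < (1UL << 28); i++)
			;	/* burn cycles; each overflow should raise SIGTRAP */

		close(fd);
		return 0;
	}

With the race described above, a program along these lines could stop receiving SIGTRAPs entirely once a pending_disable was consumed by event_sched_out().
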
include/linux/perf_event.h: +15 −4
@@ -756,11 +756,14 @@ struct perf_event {
 	struct fasync_struct		*fasync;
 
 	/* delayed work for NMIs and such */
-	int				pending_wakeup;
-	int				pending_kill;
-	int				pending_disable;
+	unsigned int			pending_wakeup;
+	unsigned int			pending_kill;
+	unsigned int			pending_disable;
+	unsigned int			pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
-	struct irq_work			pending;
+	struct irq_work			pending_irq;
+	struct callback_head		pending_task;
+	unsigned int			pending_work;
 
 	atomic_t			event_limit;
 
@@ -877,6 +880,14 @@ struct perf_event_context {
 #endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
+
+	/*
+	 * Sum (event->pending_sigtrap + event->pending_work)
+	 *
+	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
+	 * that until the signal is delivered.
+	 */
+	local_t				nr_pending;
 };
 
 /*
kernel/events/core.c: +113 −38
@@ -54,6 +54,7 @@
 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
-	if (READ_ONCE(event->pending_disable) >= 0) {
-		WRITE_ONCE(event->pending_disable, -1);
+	if (event->pending_disable) {
+		event->pending_disable = 0;
 		perf_cgroup_event_disable(event, ctx);
 		state = PERF_EVENT_STATE_OFF;
 	}
+
+	if (event->pending_sigtrap) {
+		bool dec = true;
+
+		event->pending_sigtrap = 0;
+		if (state != PERF_EVENT_STATE_OFF &&
+		    !event->pending_work) {
+			event->pending_work = 1;
+			dec = false;
+			task_work_add(current, &event->pending_task, TWA_RESUME);
+		}
+		if (dec)
+			local_dec(&event->ctx->nr_pending);
+	}
+
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
  * hold the top-level event's child_mutex, so any descendant that
  * goes to exit will block in perf_event_exit_event().
  *
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_irq it's OK because event->ctx
  * is the current context on this CPU and preemption is disabled,
  * hence we can't get into perf_event_task_sched_out for this context.
  */
@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);

 void perf_event_disable_inatomic(struct perf_event *event)
 {
-	WRITE_ONCE(event->pending_disable, smp_processor_id());
-	/* can fail, see perf_pending_event_disable() */
-	irq_work_queue(&event->pending);
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
+			perf_pmu_disable(pmu);
+
+			/* PMIs are disabled; ctx->nr_pending is stable. */
+			if (local_read(&ctx->nr_pending) ||
+			    local_read(&next_ctx->nr_pending)) {
+				/*
+				 * Must not swap out ctx when there's pending
+				 * events that rely on the ctx->task relation.
+				 */
+				raw_spin_unlock(&next_ctx->lock);
+				rcu_read_unlock();
+				goto inside_switch;
+			}
+
 			WRITE_ONCE(ctx->task, next);
 			WRITE_ONCE(next_ctx->task, task);
 
-			perf_pmu_disable(pmu);
-
 			if (cpuctx->sched_cb_usage && pmu->sched_task)
 				pmu->sched_task(ctx, false);

@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		perf_pmu_disable(pmu);
 
+inside_switch:
 		if (cpuctx->sched_cb_usage && pmu->sched_task)
 			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,

 static void _free_event(struct perf_event *event)
 {
-	irq_work_sync(&event->pending);
+	irq_work_sync(&event->pending_irq);
 
 	unaccount_event(event);
 
@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
 		return;
 
 	/*
-	 * perf_pending_event() can race with the task exiting.
+	 * Both perf_pending_task() and perf_pending_irq() can race with the
+	 * task exiting.
 	 */
 	if (current->flags & PF_EXITING)
 		return;
@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-	int cpu = READ_ONCE(event->pending_disable);
+	int cpu = READ_ONCE(event->oncpu);
 
+	/*
+	 * If the event isn't running; we done. event_sched_out() will have
+	 * taken care of things.
+	 */
 	if (cpu < 0)
 		return;
 
+	/*
+	 * Yay, we hit home and are in the context of the event.
+	 */
 	if (cpu == smp_processor_id()) {
-		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
+		if (event->pending_sigtrap) {
+			event->pending_sigtrap = 0;
 			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
+			local_dec(&event->ctx->nr_pending);
 		}
-
-		perf_event_disable_local(event);
+		if (event->pending_disable) {
+			event->pending_disable = 0;
+			perf_event_disable_local(event);
+		}
 		return;
 	}
 
@@ -6484,33 +6523,60 @@ static void perf_pending_event_disable(struct perf_event *event)
 	 *				  irq_work_queue(); // FAILS
 	 *
 	 *  irq_work_run()
-	 *    perf_pending_event()
+	 *    perf_pending_irq()
 	 *
 	 * But the event runs on CPU-B and wants disabling there.
 	 */
-	irq_work_queue_on(&event->pending, cpu);
+	irq_work_queue_on(&event->pending_irq, cpu);
 }
 
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-	struct perf_event *event = container_of(entry, struct perf_event, pending);
+	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
 	int rctx;
 
-	rctx = perf_swevent_get_recursion_context();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
 	 */
+	rctx = perf_swevent_get_recursion_context();
 
-	perf_pending_event_disable(event);
-
+	/*
+	 * The wakeup isn't bound to the context of the event -- it can happen
+	 * irrespective of where the event is.
+	 */
 	if (event->pending_wakeup) {
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
 
+	__perf_pending_irq(event);
+
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task(struct callback_head *head)
+{
+	struct perf_event *event = container_of(head, struct perf_event, pending_task);
+	int rctx;
+
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+
+	if (event->pending_work) {
+		event->pending_work = 0;
+		perf_sigtrap(event);
+		local_dec(&event->ctx->nr_pending);
+	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
@@ -9236,16 +9302,28 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		if (!event->pending_sigtrap) {
+			event->pending_sigtrap = 1;
+			local_inc(&event->ctx->nr_pending);
+		}
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending_irq);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
-		irq_work_queue(&event->pending);
+		irq_work_queue(&event->pending_irq);
 	}
 
 	return ret;
@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,


 	init_waitqueue_head(&event->waitq);
-	event->pending_disable = -1;
-	init_irq_work(&event->pending, perf_pending_event);
+	init_irq_work(&event->pending_irq, perf_pending_irq);
+	init_task_work(&event->pending_task, perf_pending_task);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
kernel/events/ring_buffer.c: +1 −1
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 	atomic_set(&handle->rb->poll, EPOLLIN);
 
 	handle->event->pending_wakeup = 1;
-	irq_work_queue(&handle->event->pending);
+	irq_work_queue(&handle->event->pending_irq);
 }
 
 /*
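
A side note on issue (3): with 'event_limit' no longer forced to 1 for sigtrap events, the usual PERF_EVENT_IOC_REFRESH pattern should apply to them as it does to other sampling events. A hedged sketch (the helper name is ours; fd is assumed to come from perf_event_open() with attr.disabled = 1):

	#include <linux/perf_event.h>
	#include <sys/ioctl.h>

	/* Grant the event a budget of at most n further overflows; once the
	 * budget is used up the kernel disables the event again.  Calling
	 * this again re-arms the event for another n overflows.
	 */
	static int arm_for_n_overflows(int fd, int n)
	{
		if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) < 0)	/* clear the stale count */
			return -1;
		return ioctl(fd, PERF_EVENT_IOC_REFRESH, n);	/* enable + set overflow budget */
	}
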