Commit fa2c3254 authored by Valentin Schneider, committed by Peter Zijlstra
Browse files

sched/tracing: Don't re-read p->state when emitting sched_switch event



As of commit

  c6e7bd7a ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

		      p->__state = TASK_INTERRUPTIBLE;
		      __schedule()
			deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=TASK_WAKING
			trace_sched_switch()
			  __trace_sched_switch_state()
			    task_state_index()
			      return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by passing the value already read in __schedule() down to the
trace event, instead of re-reading p->__state there.

Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com
parent 49bef33e
Loading
Loading
Loading
Loading
+8 −3
Original line number Diff line number Diff line
@@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE	(TASK_REPORT + 1)
#define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)

static inline unsigned int task_state_index(struct task_struct *tsk)
static inline unsigned int __task_state_index(unsigned int tsk_state,
					      unsigned int tsk_exit_state)
{
	unsigned int tsk_state = READ_ONCE(tsk->__state);
	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
	unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;

	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

@@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
	return fls(state);
}

/*
 * task_state_index - report index for the state @tsk is in right now.
 *
 * Takes a single READ_ONCE() snapshot of tsk->__state and hands it, together
 * with tsk->exit_state, to __task_state_index() for the actual mapping.
 *
 * Callers that already hold a previously sampled state value can call
 * __task_state_index() directly so that the state is not read a second time.
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
	unsigned int cur_state = READ_ONCE(tsk->__state);

	return __task_state_index(cur_state, tsk->exit_state);
}

static inline char task_index_to_char(unsigned int state)
{
	static const char state_char[] = "RSDTtXZPI";
+7 −4
Original line number Diff line number Diff line
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
	     TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
static inline long __trace_sched_switch_state(bool preempt,
					      unsigned int prev_state,
					      struct task_struct *p)
{
	unsigned int state;

@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
	 * it for left shift operation to get the correct task->state
	 * mapping.
	 */
	state = task_state_index(p);
	state = __task_state_index(prev_state, p->exit_state);

	return state ? (1 << (state - 1)) : state;
}
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
TRACE_EVENT(sched_switch,

	TP_PROTO(bool preempt,
		 unsigned int prev_state,
		 struct task_struct *prev,
		 struct task_struct *next),

	TP_ARGS(preempt, prev, next),
	TP_ARGS(preempt, prev_state, prev, next),

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
		__entry->prev_state	= __trace_sched_switch_state(preempt, prev_state, prev);
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
+2 −2
Original line number Diff line number Diff line
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
	struct rq *rq = this_rq();
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;
	unsigned int prev_state;

	/*
	 * The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
		migrate_disable_switch(rq, prev);
		psi_sched_switch(prev, next, !task_on_rq_queued(prev));

		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
+3 −1
Original line number Diff line number Diff line
@@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)

static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
			struct task_struct *prev, struct task_struct *next)
				unsigned int prev_state,
				struct task_struct *prev,
				struct task_struct *next)
{
	unsigned long long timestamp;
	int index;
+3 −1
Original line number Diff line number Diff line
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)

static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
		    struct task_struct *prev, struct task_struct *next)
				     unsigned int prev_state,
				     struct task_struct *prev,
				     struct task_struct *next)
{
	struct trace_array *tr = data;
	struct trace_pid_list *pid_list;
Loading