Commit 1d2cc076, authored by Steve Sistare, committed by Cheng Jian
Browse files

sched/fair: Provide idle search schedstats

hulk inclusion
category: feature
bugzilla: 38261, https://bugzilla.openeuler.org/show_bug.cgi?id=23


CVE: NA

---------------------------

Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
          prev or target is idle.
field 11: # of times select_idle_sibling searched and found an idle cpu.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
          CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
           migratable.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
parent 49353d1e
Loading
Loading
Loading
Loading
+29 −2
Original line number Diff line number Diff line
@@ -2240,17 +2240,44 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
static bool __initdata __sched_schedstats = false;

unsigned long schedstat_skid;

/*
 * Estimate the overhead of reading the clock and publish it in
 * schedstat_skid.
 *
 * Takes 100 pairs of back-to-back local_clock() readings and averages the
 * differences that fall strictly between 0 and 1000; anything outside that
 * window is treated as an outlier and ignored.  If no sample qualifies the
 * skid is set to 0.  The result is also logged.
 */
static void compute_skid(void)
{
	int total = 0, samples = 0;
	int iter;

	for (iter = 0; iter < 100; iter++) {
		s64 first = local_clock();
		s64 delta = local_clock() - first;

		/* Discard readings that are not plausible clock overhead. */
		if (delta <= 0 || delta >= 1000)
			continue;

		total += (int) delta;
		samples++;
	}

	schedstat_skid = samples > 0 ? total / samples : 0;
	pr_info("schedstat_skid = %lu\n", schedstat_skid);
}

/*
 * Switch the sched_schedstats static key on or off.
 *
 * When enabling, the clock-read skid is recomputed first so that it is in
 * place before the static key starts gating the timing helpers.
 */
static void set_schedstats(bool enabled)
{
	if (enabled) {
		compute_skid();
		static_branch_enable(&sched_schedstats);
	} else {
		static_branch_disable(&sched_schedstats);
	}
}

void force_schedstat_enabled(void)
{
	if (!schedstat_enabled()) {
		compute_skid();
		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
		static_branch_enable(&sched_schedstats);
	}
+48 −6
Original line number Diff line number Diff line
@@ -3834,29 +3834,35 @@ static inline bool steal_enabled(void)
static void overload_clear(struct rq *rq)
{
	struct sparsemask *overload_cpus;
	unsigned long time;

	if (!steal_enabled())
		return;

	time = schedstat_start_time();
	rcu_read_lock();
	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
	if (overload_cpus)
		sparsemask_clear_elem(overload_cpus, rq->cpu);
	rcu_read_unlock();
	schedstat_end_time(rq->find_time, time);
}

static void overload_set(struct rq *rq)
{
	struct sparsemask *overload_cpus;
	unsigned long time;

	if (!steal_enabled())
		return;

	time = schedstat_start_time();
	rcu_read_lock();
	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
	if (overload_cpus)
		sparsemask_set_elem(overload_cpus, rq->cpu);
	rcu_read_unlock();
	schedstat_end_time(rq->find_time, time);
}

static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -6300,6 +6306,16 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
	return cpu;
}

/*
 * Increment the schedstat counter named STAT on the current CPU's runqueue.
 * The runqueue is only looked up when schedstats are enabled, and the
 * counter is left alone if that lookup yields NULL.
 */
#define SET_STAT(STAT)							\
	do {								\
		struct rq *cur_rq =					\
			schedstat_enabled() ? this_rq() : NULL;		\
									\
		if (cur_rq)						\
			__schedstat_inc(cur_rq->STAT);			\
	} while (0)

/*
 * Try and locate an idle core/thread in the LLC cache domain.
 */
@@ -6308,15 +6324,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
	struct sched_domain *sd;
	int i, recent_used_cpu;

	if (available_idle_cpu(target) || sched_idle_cpu(target))
	if (available_idle_cpu(target) || sched_idle_cpu(target)) {
		SET_STAT(found_idle_cpu_easy);
		return target;
	}

	/*
	 * If the previous CPU is cache affine and idle, don't be stupid:
	 */
	if (prev != target && cpus_share_cache(prev, target) &&
	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
	    (available_idle_cpu(prev) || sched_idle_cpu(prev))) {
		SET_STAT(found_idle_cpu_easy);
		return prev;
	}

	/* Check a recently used CPU as a potential idle candidate: */
	recent_used_cpu = p->recent_used_cpu;
@@ -6329,26 +6349,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
		 * Replace recent_used_cpu with prev as it is a potential
		 * candidate for the next wake:
		 */
		SET_STAT(found_idle_cpu_easy);
		p->recent_used_cpu = prev;
		return recent_used_cpu;
	}

	sd = rcu_dereference(per_cpu(sd_llc, target));
	if (!sd)
	if (!sd) {
		SET_STAT(nofound_idle_cpu);
		return target;
	}

	i = select_idle_core(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
	if ((unsigned)i < nr_cpumask_bits) {
		SET_STAT(found_idle_core);
		return i;
	}

	i = select_idle_cpu(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
	if ((unsigned)i < nr_cpumask_bits) {
		SET_STAT(found_idle_cpu);
		return i;
	}

	i = select_idle_smt(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
	if ((unsigned)i < nr_cpumask_bits) {
		SET_STAT(found_idle_cpu);
		return i;
	}

	SET_STAT(nofound_idle_cpu);
	return target;
}

@@ -6533,6 +6563,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
	unsigned long time = schedstat_start_time();
	struct sched_domain *tmp, *sd = NULL;
	int cpu = smp_processor_id();
	int new_cpu = prev_cpu;
@@ -6581,6 +6612,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
			current->recent_used_cpu = cpu;
	}
	rcu_read_unlock();
	schedstat_end_time(cpu_rq(cpu)->find_time, time);

	return new_cpu;
}
@@ -6827,6 +6859,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
	struct sched_entity *se;
	struct task_struct *p;
	int new_tasks;
	unsigned long time;

again:
	if (!cfs_rq->nr_running)
@@ -6938,6 +6971,8 @@ done: __maybe_unused;

idle:

	time = schedstat_start_time();

	/*
	 * We must set idle_stamp _before_ calling try_steal() or
	 * idle_balance(), such that we measure the duration as idle time.
@@ -6951,6 +6986,8 @@ done: __maybe_unused;
	if (new_tasks)
		rq_idle_stamp_clear(rq);

	schedstat_end_time(rq->find_time, time);

	/*
	 * Because try_steal() and idle_balance() release (and re-acquire)
	 * rq->lock, it is possible for any higher priority task to appear.
@@ -9964,6 +10001,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
		update_rq_clock(dst_rq);
		attach_task(dst_rq, p);
		stolen = 1;
		schedstat_inc(dst_rq->steal);
	}
	local_irq_restore(rf.flags);

@@ -9988,6 +10026,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
	int dst_cpu = dst_rq->cpu;
	bool locked = true;
	int stolen = 0;
	bool any_overload = false;
	struct sparsemask *overload_cpus;

	if (!steal_enabled())
@@ -10030,6 +10069,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
			stolen = 1;
			goto out;
		}
		any_overload = true;
	}

out:
@@ -10041,6 +10081,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
	stolen |= (dst_rq->cfs.h_nr_running > 0);
	if (dst_rq->nr_running != dst_rq->cfs.h_nr_running)
		stolen = -1;
	if (!stolen && any_overload)
		schedstat_inc(dst_rq->steal_fail);
	return stolen;
}

+9 −0
Original line number Diff line number Diff line
@@ -934,6 +934,15 @@ struct rq {
	/* try_to_wake_up() stats */
	unsigned int		ttwu_count;
	unsigned int		ttwu_local;

	/* Idle search stats */
	unsigned int		found_idle_core;
	unsigned int		found_idle_cpu;
	unsigned int		found_idle_cpu_easy;
	unsigned int		nofound_idle_cpu;
	unsigned long		find_time;
	unsigned int		steal;
	unsigned int		steal_fail;
#endif

#ifdef CONFIG_SMP
+10 −1
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@
 * Bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 16

static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -37,6 +37,15 @@ static int show_schedstat(struct seq_file *seq, void *v)
		    rq->rq_cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, " %u %u %u %u %lu %u %u",
			   rq->found_idle_cpu_easy,
			   rq->found_idle_cpu,
			   rq->found_idle_core,
			   rq->nofound_idle_cpu,
			   rq->find_time,
			   rq->steal,
			   rq->steal_fail);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
+13 −0
Original line number Diff line number Diff line
@@ -40,6 +40,17 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
#define   schedstat_val(var)		(var)
#define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
#define   schedstat_start_time()	schedstat_val_or_zero(local_clock())
/*
 * Close a timed section opened with schedstat_start_time() and add its
 * duration to @stat.
 *
 * @stat: lvalue accumulator, updated through schedstat_add().
 * @time: value returned by schedstat_start_time(); 0 means the section was
 *        not being timed and nothing is recorded.
 *
 * The elapsed time is reduced by schedstat_skid to compensate for the cost
 * of reading the clock.  Because the skid is an average, a very short
 * section can measure less than the skid; the result is clamped at zero so
 * that the unsigned subtraction cannot wrap and add a near-ULONG_MAX value
 * to @stat.  @time is evaluated once, and only when schedstats are enabled.
 */
#define   schedstat_end_time(stat, time)			\
	do {							\
		if (schedstat_enabled()) {			\
			unsigned long _ss_start = (time);	\
			unsigned long _ss_delta;		\
								\
			if (_ss_start) {			\
				_ss_delta = local_clock() - _ss_start; \
				if (_ss_delta > schedstat_skid)	\
					_ss_delta -= schedstat_skid; \
				else				\
					_ss_delta = 0;		\
				schedstat_add((stat), _ss_delta); \
			}					\
		}						\
	} while (0)
extern unsigned long schedstat_skid;

#else /* !CONFIG_SCHEDSTATS: */
static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
@@ -54,6 +65,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define   schedstat_set(var, val)	do { } while (0)
# define   schedstat_val(var)		0
# define   schedstat_val_or_zero(var)	0
# define   schedstat_start_time()	0
# define   schedstat_end_time(stat, t)	do { } while (0)
#endif /* CONFIG_SCHEDSTATS */

#ifdef CONFIG_SCHED_INFO