Commit 5e963f2b authored by Peter Zijlstra, committed by Ingo Molnar

sched/fair: Commit to EEVDF



EEVDF is a better-defined scheduling policy; as a result, it has fewer
heuristics/tunables. There is no compelling reason to keep CFS around.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org
parent e8f331bc
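
The policy this commit switches to can be summarized by the deadline rule visible in update_deadline() in the diff below: each entity gets a fixed request r_i (its slice) and a virtual deadline vd_i = ve_i + r_i / w_i. What follows is a minimal, self-contained userspace sketch of that rule, not kernel code; the entity struct, NICE_0_WEIGHT and calc_delta_vtime() are simplified stand-ins for sched_entity, NICE_0_LOAD and calc_delta_fair().

/*
 * Hedged sketch of the EEVDF deadline rule (not kernel code): once an
 * entity's vruntime reaches its deadline, the request is considered
 * consumed and a new virtual deadline vd_i = ve_i + r_i / w_i is set.
 */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_WEIGHT	1024ULL		/* stand-in for NICE_0_LOAD */

struct entity {
	const char	*name;
	uint64_t	weight;		/* w_i: scheduling weight */
	uint64_t	vruntime;	/* ve_i: virtual runtime */
	uint64_t	slice;		/* r_i: request size, in ns */
	uint64_t	deadline;	/* vd_i: virtual deadline */
};

/* calc_delta_fair() analogue: scale a wall-clock delta into virtual time. */
static uint64_t calc_delta_vtime(uint64_t delta, const struct entity *e)
{
	return delta * NICE_0_WEIGHT / e->weight;
}

/* update_deadline() analogue: re-arm the deadline once the request is used. */
static void update_deadline(struct entity *e)
{
	if ((int64_t)(e->vruntime - e->deadline) < 0)
		return;			/* request not yet consumed */

	/* EEVDF: vd_i = ve_i + r_i / w_i */
	e->deadline = e->vruntime + calc_delta_vtime(e->slice, e);
}

int main(void)
{
	struct entity heavy = { "heavy", 2048, 0, 750000, 0 };	/* twice the nice-0 weight */
	struct entity light = { "light",  512, 0, 750000, 0 };	/* half the nice-0 weight */

	update_deadline(&heavy);
	update_deadline(&light);

	/* The heavier entity earns the nearer virtual deadline for the same slice. */
	printf("%s: deadline %llu\n", heavy.name, (unsigned long long)heavy.deadline);
	printf("%s: deadline %llu\n", light.name, (unsigned long long)light.deadline);
	return 0;
}

For the same 0.75 ms request, the heavier entity ends up with the nearer virtual deadline, which is the weight-driven slope the commit message alludes to.
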
kernel/sched/debug.c  +0 −6
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif

	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);

	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -866,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
	PN(sysctl_sched_latency);
	PN(sysctl_sched_min_granularity);
	PN(sysctl_sched_idle_min_granularity);
	PN(sysctl_sched_wakeup_granularity);
	P(sysctl_sched_child_runs_first);
	P(sysctl_sched_features);
#undef PN
kernel/sched/fair.c  +38 −427
@@ -57,22 +57,6 @@
#include "stats.h"
#include "autogroup.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency			= 6000000ULL;
static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
@@ -94,37 +78,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
unsigned int sysctl_sched_min_granularity			= 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;

/*
 * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
 * Applies only when SCHED_IDLE tasks compete with normal tasks.
 *
 * (default: 0.75 msec)
 */
unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

int sched_thermal_decay_shift;
@@ -279,8 +238,6 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

@@ -888,30 +845,6 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
	return __node_2_se(left);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return __node_2_se(next);
}

static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left = __pick_first_entity(cfs_rq);

	/*
	 * If curr is set we have to see if its left of the leftmost entity
	 * still in the tree, provided there was anything in the tree at all.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;

	return left;
}

/*
 * Earliest Eligible Virtual Deadline First
 *
@@ -1008,85 +941,15 @@ int sched_update_scaling(void)
{
	unsigned int factor = get_update_sysctl_factor();

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}

static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned int nr_running = cfs_rq->nr_running;
	struct sched_entity *init_se = se;
	unsigned int min_gran;
	u64 slice;

	if (sched_feat(ALT_PERIOD))
		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;

	slice = __sched_period(nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;
		struct cfs_rq *qcfs_rq;

		qcfs_rq = cfs_rq_of(se);
		load = &qcfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = qcfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}

	if (sched_feat(BASE_SLICE)) {
		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
			min_gran = sysctl_sched_idle_min_granularity;
		else
			min_gran = sysctl_sched_min_granularity;

		slice = max_t(u64, slice, min_gran);
	}

	return slice;
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);

/*
@@ -1098,7 +961,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
	if ((s64)(se->vruntime - se->deadline) < 0)
		return;

	if (sched_feat(EEVDF)) {
	/*
	 * For EEVDF the virtual time slope is determined by w_i (iow.
	 * nice) while the request time r_i is determined by
@@ -1106,6 +968,11 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
	 */
	se->slice = sysctl_sched_min_granularity;

	/*
	 * EEVDF: vd_i = ve_i + r_i / w_i
	 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);

	/*
	 * The task has consumed its request, reschedule.
	 */
@@ -1113,20 +980,6 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
		resched_curr(rq_of(cfs_rq));
		clear_buddies(cfs_rq, se);
	}
	} else {
		/*
		 * When many tasks blow up the sched_period; it is possible
		 * that sched_slice() reports unusually large results (when
		 * many tasks are very light for example). Therefore impose a
		 * maximum.
		 */
		se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
	}

	/*
	 * EEVDF: vd_i = ve_i + r_i / w_i
	 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
}

#include "pelt.h"
@@ -5055,19 +4908,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}

#endif /* CONFIG_SMP */

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq->nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
@@ -5219,7 +5059,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

	check_schedstat_required();
	update_stats_enqueue_fair(cfs_rq, se, flags);
	check_spread(cfs_rq, se);
	if (!curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;
@@ -5241,17 +5080,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
	}
}

static void __clear_buddies_last(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->last != se)
			break;

		cfs_rq->last = NULL;
	}
}

static void __clear_buddies_next(struct sched_entity *se)
{
	for_each_sched_entity(se) {
@@ -5263,27 +5091,10 @@ static void __clear_buddies_next(struct sched_entity *se)
	}
}

static void __clear_buddies_skip(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->skip != se)
			break;

		cfs_rq->skip = NULL;
	}
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5341,45 +5152,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
		update_idle_cfs_rq_clock_pelt(cfs_rq);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long delta_exec;
	struct sched_entity *se;
	s64 delta;

	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > curr->slice) {
		resched_curr(rq_of(cfs_rq));
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > curr->slice)
		resched_curr(rq_of(cfs_rq));
}

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5418,9 +5190,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
@@ -5431,9 +5200,6 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left, *se;

	if (sched_feat(EEVDF)) {
	/*
	 * Enabling NEXT_BUDDY will affect latency but not fairness.
	 */
@@ -5444,42 +5210,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
	return pick_eevdf(cfs_rq);
}

	se = left = pick_cfs(cfs_rq, curr);

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip && cfs_rq->skip == se) {
		struct sched_entity *second;

		if (se == curr) {
			second = __pick_first_entity(cfs_rq);
		} else {
			second = __pick_next_entity(se);
			if (!second || (curr && entity_before(curr, second)))
				second = curr;
		}

		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
		/*
		 * Someone really wants this to run. If it's not unfair, run it.
		 */
		se = cfs_rq->next;
	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
		/*
		 * Prefer last buddy, try to return the CPU to a preempted task.
		 */
		se = cfs_rq->last;
	}

	return se;
}

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
@@ -5494,8 +5224,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);

	if (prev->on_rq) {
		update_stats_wait_start_fair(cfs_rq, prev);
		/* Put 'current' back into the tree. */
@@ -5536,9 +5264,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}


@@ -6610,7 +6335,6 @@ static void hrtick_update(struct rq *rq)
	if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
		return;

	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
	hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
@@ -6652,17 +6376,6 @@ static int sched_idle_rq(struct rq *rq)
			rq->nr_running);
}

/*
 * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
 * of idle_nr_running, which does not consider idle descendants of normal
 * entities.
 */
static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
{
	return cfs_rq->nr_running &&
		cfs_rq->nr_running == cfs_rq->idle_nr_running;
}

#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -8205,66 +7918,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */

static unsigned long wakeup_gran(struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;

	/*
	 * Since its curr running now, convert the gran from real-time
	 * to virtual-time in his units.
	 *
	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
	 * be smaller, again penalizing the lighter task.
	 *
	 * This is especially important for buddies when the leftmost
	 * task is higher priority than the buddy.
	 */
	return calc_delta_fair(gran, se);
}

/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(se);
	if (vdiff > gran)
		return 1;

	return 0;
}

static void set_last_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		if (SCHED_WARN_ON(!se->on_rq))
			return;
		if (se_is_idle(se))
			return;
		cfs_rq_of(se)->last = se;
	}
}

static void set_next_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se) {
@@ -8276,12 +7929,6 @@ static void set_next_buddy(struct sched_entity *se)
	}
}

static void set_skip_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se)
		cfs_rq_of(se)->skip = se;
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -8290,7 +7937,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;
	int cse_is_idle, pse_is_idle;

@@ -8306,7 +7952,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;

	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}
@@ -8354,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
	cfs_rq = cfs_rq_of(se);
	update_curr(cfs_rq);

	if (sched_feat(EEVDF)) {
	/*
	 * XXX pick_eevdf(cfs_rq) != se ?
	 */
@@ -8362,36 +8007,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
		goto preempt;

	return;
	}

	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}

	return;

preempt:
	resched_curr(rq);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can * drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons its a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}

#ifdef CONFIG_SMP
@@ -8592,8 +8210,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)

/*
 * sched_yield() is very simple
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 */
static void yield_task_fair(struct rq *rq)
{
@@ -8609,7 +8225,6 @@ static void yield_task_fair(struct rq *rq)

	clear_buddies(cfs_rq, se);

	if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
	update_rq_clock(rq);
	/*
	 * Update run-time statistics of the 'current'.
@@ -8621,11 +8236,8 @@ static void yield_task_fair(struct rq *rq)
	 * and double the fastpath cost.
	 */
	rq_clock_skip_update(rq);
	}
	if (sched_feat(EEVDF))
		se->deadline += calc_delta_fair(se->slice, se);

	set_skip_buddy(se);
	se->deadline += calc_delta_fair(se->slice, se);
}

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8873,8 +8485,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
	 * Buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
			(&p->se == cfs_rq_of(&p->se)->next ||
			 &p->se == cfs_rq_of(&p->se)->last))
	    (&p->se == cfs_rq_of(&p->se)->next))
		return 1;

	if (sysctl_sched_migration_cost == -1)
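
With the CFS buddy/skip logic removed above, pick_next_entity() now simply returns pick_eevdf(cfs_rq). As a rough illustration of the selection rule that name stands for, here is a hedged, self-contained sketch, not the kernel implementation: an entity is eligible when its vruntime is at or before the load-weighted average vruntime of the queue, and among eligible entities the earliest virtual deadline wins. avg_vruntime() and the flat array stand in for the kernel's augmented rbtree.

/*
 * Hedged sketch of EEVDF selection (not kernel code): eligibility is
 * "vruntime at or before the weighted average vruntime"; among the
 * eligible entities, pick the earliest virtual deadline.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct entity {
	const char	*name;
	uint64_t	weight;
	uint64_t	vruntime;
	uint64_t	deadline;
};

/* Load-weighted average vruntime the entities are compared against. */
static uint64_t avg_vruntime(const struct entity *q, size_t n)
{
	uint64_t sum = 0, wsum = 0;

	for (size_t i = 0; i < n; i++) {
		sum  += q[i].vruntime * q[i].weight;
		wsum += q[i].weight;
	}
	return wsum ? sum / wsum : 0;
}

/* pick_eevdf() analogue: earliest deadline among eligible entities. */
static const struct entity *pick(const struct entity *q, size_t n)
{
	uint64_t V = avg_vruntime(q, n);
	const struct entity *best = NULL;

	for (size_t i = 0; i < n; i++) {
		if (q[i].vruntime > V)
			continue;	/* not eligible: already ahead of the average */
		if (!best || q[i].deadline < best->deadline)
			best = &q[i];
	}
	return best;
}

int main(void)
{
	struct entity q[] = {
		{ "a", 1024, 100, 900 },
		{ "b", 1024, 400, 700 },
		{ "c", 1024, 800, 600 },	/* earliest deadline, but ineligible */
	};
	const struct entity *next = pick(q, 3);

	printf("next: %s\n", next ? next->name : "(none)");
	return 0;
}

In this example "c" holds the earliest deadline of all but its vruntime is already past the average, so it is not eligible and "b" is picked; that is the part of the rule the plain deadline comparison alone would miss.
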
kernel/sched/features.h  +0 −12
@@ -14,13 +14,6 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 */
SCHED_FEAT(NEXT_BUDDY, false)

/*
 * Prefer to schedule the task that ran last (when we did
 * wake-preempt) as that likely will touch the same data, increases
 * cache locality.
 */
SCHED_FEAT(LAST_BUDDY, true)

/*
 * Consider buddies to be cache hot, decreases the likeliness of a
 * cache buddy being migrated away, increases cache locality.
@@ -93,8 +86,3 @@ SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)

SCHED_FEAT(LATENCY_WARN, false)

SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)

SCHED_FEAT(EEVDF, true)
kernel/sched/sched.h  +0 −5
@@ -570,8 +570,6 @@ struct cfs_rq {
	 */
	struct sched_entity	*curr;
	struct sched_entity	*next;
	struct sched_entity	*last;
	struct sched_entity	*skip;

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int		nr_spread_over;
@@ -2508,9 +2506,6 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_min_granularity;

#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_idle_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;