Unverified Commit df9cfeee authored by openeuler-ci-bot, committed by Gitee

!844 A sched patchset to improve benchmark performance

Merge Pull Request from: @NNNNicole 
 
1. sched/pelt: Relax the sync of *_sum with *_avg (patches 1-3)
2. Adjust NUMA imbalance for multiple LLCs (patches 4-6)
3. sched: Queue task on wakelist in the same llc if the wakee cpu is idle (patch 7)
4. Clear ttwu_pending after enqueue_task (patch 8)
 
 
Link: https://gitee.com/openeuler/kernel/pulls/844

 

Reviewed-by: Zucheng Zheng <zhengzucheng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents 162d1b0b a6dcd26f
include/linux/sched/topology.h (+5 −1)
@@ -153,8 +153,12 @@ struct sched_domain {
		struct rcu_head rcu;	/* used during destruction */
	};
	struct sched_domain_shared *shared;

#ifndef __GENKSYMS__
	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */
	KABI_FILL_HOLE(unsigned int kabi_hole)
#else
	KABI_RESERVE(1)
#endif
	KABI_RESERVE(2)

	unsigned int span_weight;
kernel/sched/core.c (+25 −19)
@@ -2933,13 +2933,6 @@ void sched_ttwu_pending(void *arg)
	if (!llist)
		return;

	/*
	 * rq::ttwu_pending racy indication of out-standing wakeups.
	 * Races such that false-negatives are possible, since they
	 * are shorter lived that false-positives would be.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

@@ -2953,6 +2946,17 @@ void sched_ttwu_pending(void *arg)
		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
	}

	/*
	 * Must be after enqueueing at least once task such that
	 * idle_cpu() does not observe a false-negative -- if it does,
	 * it is possible for select_idle_siblings() to stack a number
	 * of tasks on this CPU during that window.
	 *
	 * It is ok to clear ttwu_pending when another task pending.
	 * We will receive IPI after local irq enabled and then enqueue it.
	 * Since now nr_running > 0, idle_cpu() will always get correct result.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);
	rq_unlock_irqrestore(rq, &rf);
}
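
As a rough illustration of the window described in the comment above, here is a minimal userspace sketch (not kernel code; the toy_rq and toy_idle_cpu names are made up, the real checks live in idle_cpu() and select_idle_sibling()). With the old ordering, ttwu_pending is already 0 while nr_running is still 0, so a concurrent waker can wrongly treat the CPU as idle and stack further tasks on it; with the new ordering no such point exists.

/* toy_ttwu_order.c: illustrative only, not part of the patch */
#include <stdio.h>

struct toy_rq {
	int nr_running;		/* tasks already enqueued on this CPU */
	int ttwu_pending;	/* remote wakeups queued but not yet enqueued */
};

/* Mirrors the relevant checks in idle_cpu(): idle only if nothing runs
 * and no remote wakeup is pending. */
static int toy_idle_cpu(const struct toy_rq *rq)
{
	return rq->nr_running == 0 && rq->ttwu_pending == 0;
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 0, .ttwu_pending = 1 };

	/* Old ordering: clear the flag before the enqueue loop. */
	rq.ttwu_pending = 0;
	/* A waker sampling the CPU here sees idle_cpu() == 1 even though a
	 * wakeup is still in flight, and may stack more tasks on this CPU. */
	printf("old order, mid-window: idle_cpu() = %d\n", toy_idle_cpu(&rq));
	rq.nr_running++;	/* the deferred enqueue finally happens */

	/* New ordering: enqueue first, clear the flag last. */
	rq = (struct toy_rq){ .nr_running = 0, .ttwu_pending = 1 };
	rq.nr_running++;	/* enqueue while the flag is still set */
	printf("new order, mid-window: idle_cpu() = %d\n", toy_idle_cpu(&rq));
	rq.ttwu_pending = 0;	/* clearing is now safe: nr_running > 0 */

	return 0;
}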

@@ -3026,7 +3030,7 @@ bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
	return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu);
}

static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static inline bool ttwu_queue_cond(int cpu)
{
	/*
	 * If the CPU does not share cache, then queue the task on the
@@ -3035,17 +3039,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
	if (!cpus_share_cache(smp_processor_id(), cpu))
		return true;

	if (cpu == smp_processor_id())
		return false;

	/*
	 * If the task is descheduling and the only running task on the
	 * CPU then use the wakelist to offload the task activation to
	 * the soon-to-be-idle CPU as the current CPU is likely busy.
	 * nr_running is checked to avoid unnecessary task stacking.
	 * If the wakee cpu is idle, or the task is descheduling and the
	 * only running task on the CPU, then use the wakelist to offload
	 * the task activation to the idle (or soon-to-be-idle) CPU as
	 * the current CPU is likely busy. nr_running is checked to
	 * avoid unnecessary task stacking.
	 *
	 * Note that we can only get here with (wakee) p->on_rq=0,
	 * p->on_cpu can be whatever, we've done the dequeue, so
	 * the wakee has been accounted out of ->nr_running.
	 */
	if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
	if (!cpu_rq(cpu)->nr_running)
		return true;

	return false;
@@ -3053,10 +3061,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)

static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
		if (WARN_ON_ONCE(cpu == smp_processor_id()))
			return false;

	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		__ttwu_queue_wakelist(p, cpu, wake_flags);
		return true;
@@ -3333,7 +3338,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	 * scheduling.
	 */
	if (smp_load_acquire(&p->on_cpu) &&
	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
		goto unlock;

	/*
@@ -3895,7 +3900,8 @@ static inline void prepare_task(struct task_struct *next)
	 * Claim the task as running, we do this before switching to it
	 * such that any running task will have this set.
	 *
	 * See the ttwu() WF_ON_CPU case and its ordering comment.
	 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
	 * its ordering comment.
	 */
	WRITE_ONCE(next->on_cpu, 1);
#endif
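
Summarizing patch 7: WF_ON_CPU is dropped and the wakelist is now also used when the wakee CPU is idle in the same LLC, so the (likely busy) waker sends an IPI instead of taking the remote runqueue lock. A condensed userspace restatement of the new decision follows; the toy_* names are stand-ins for the real helpers (cpus_share_cache(), cpu_rq()->nr_running, smp_processor_id()).

#include <stdbool.h>

struct toy_cpu {
	int id;
	int nr_running;		/* tasks currently on the runqueue */
	int llc_id;		/* CPUs with the same llc_id share cache */
};

/* Should the wakeup be queued on the target's wakelist (IPI path)? */
static bool toy_ttwu_queue_cond(const struct toy_cpu *waker,
				const struct toy_cpu *target)
{
	/* Different LLC: always hand the activation to the target CPU. */
	if (waker->llc_id != target->llc_id)
		return true;

	/* Never queue to ourselves; the direct path is cheaper. */
	if (target->id == waker->id)
		return false;

	/* Same LLC: only offload if the target is idle or about to be. */
	return target->nr_running == 0;
}

int main(void)
{
	struct toy_cpu waker = { .id = 0, .nr_running = 1, .llc_id = 0 };
	struct toy_cpu wakee = { .id = 3, .nr_running = 0, .llc_id = 0 };

	/* Idle sibling in the same LLC: now queued on the wakelist. */
	return toy_ttwu_queue_cond(&waker, &wakee) ? 0 : 1;
}

Compared with the old condition, the wake_flags & WF_ON_CPU qualifier is gone, and the WARN_ON_ONCE against queueing to the local CPU becomes the explicit early return for cpu == smp_processor_id().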
kernel/sched/fair.c (+75 −43)
@@ -1524,6 +1524,7 @@ struct task_numa_env {

	int src_cpu, src_nid;
	int dst_cpu, dst_nid;
	int imb_numa_nr;

	struct numa_stats src_stats, dst_stats;

@@ -1539,7 +1540,7 @@ static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
					int dst_running, int dst_weight);
					int dst_running, int imb_numa_nr);

static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1920,7 +1921,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
		dst_running = env->dst_stats.nr_running + 1;
		imbalance = max(0, dst_running - src_running);
		imbalance = adjust_numa_imbalance(imbalance, dst_running,
							env->dst_stats.weight);
						  env->imb_numa_nr);

		/* Use idle CPU if there is no imbalance */
		if (!imbalance) {
@@ -1985,8 +1986,10 @@ static int task_numa_migrate(struct task_struct *p)
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
	if (sd) {
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
		env.imb_numa_nr = sd->imb_numa_nr;
	}
	rcu_read_unlock();

	/*
@@ -3084,6 +3087,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3449,11 +3455,11 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
	u32 divider;
	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
	u32 new_sum, divider;

	/* Nothing to update */
	if (!delta)
	if (!delta_avg)
		return;

	/*
@@ -3462,23 +3468,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
	 */
	divider = get_pelt_divider(&cfs_rq->avg);


	/* Set new sched_entity's utilization */
	se->avg.util_avg = gcfs_rq->avg.util_avg;
	se->avg.util_sum = se->avg.util_avg * divider;
	new_sum = se->avg.util_avg * divider;
	delta_sum = (long)new_sum - (long)se->avg.util_sum;
	se->avg.util_sum = new_sum;

	/* Update parent cfs_rq utilization */
	add_positive(&cfs_rq->avg.util_avg, delta);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
	add_positive(&cfs_rq->avg.util_avg, delta_avg);
	add_positive(&cfs_rq->avg.util_sum, delta_sum);

	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
	u32 divider;
	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
	u32 new_sum, divider;

	/* Nothing to update */
	if (!delta)
	if (!delta_avg)
		return;

	/*
@@ -3489,19 +3502,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf

	/* Set new sched_entity's runnable */
	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
	se->avg.runnable_sum = se->avg.runnable_avg * divider;
	new_sum = se->avg.runnable_avg * divider;
	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
	se->avg.runnable_sum = new_sum;

	/* Update parent cfs_rq runnable */
	add_positive(&cfs_rq->avg.runnable_avg, delta);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	unsigned long load_avg;
	u64 load_sum = 0;
	s64 delta_sum;
	u32 divider;

	if (!runnable_sum)
@@ -3528,7 +3547,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
		 * assuming all tasks are equally runnable.
		 */
		if (scale_load_down(gcfs_rq->load.weight)) {
			load_sum = div_s64(gcfs_rq->avg.load_sum,
			load_sum = div_u64(gcfs_rq->avg.load_sum,
				scale_load_down(gcfs_rq->load.weight));
		}

@@ -3545,16 +3564,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
	runnable_sum = max(runnable_sum, running_sum);

	load_sum = (s64)se_weight(se) * runnable_sum;
	load_avg = div_s64(load_sum, divider);
	load_sum = se_weight(se) * runnable_sum;
	load_avg = div_u64(load_sum, divider);

	delta = load_avg - se->avg.load_avg;
	delta_avg = load_avg - se->avg.load_avg;
	if (!delta_avg)
		return;

	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;

	se->avg.load_sum = runnable_sum;
	se->avg.load_avg = load_avg;

	add_positive(&cfs_rq->avg.load_avg, delta);
	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
	add_positive(&cfs_rq->avg.load_avg, delta_avg);
	add_positive(&cfs_rq->avg.load_sum, delta_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3670,7 +3695,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

		r = removed_load;
		sub_positive(&sa->load_avg, r);
		sa->load_sum = sa->load_avg * divider;
		sub_positive(&sa->load_sum, r * divider);
		/* See sa->util_sum below */
		sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);

		r = removed_util;
		sub_positive(&sa->util_avg, r);
@@ -3690,7 +3717,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

		r = removed_runnable;
		sub_positive(&sa->runnable_avg, r);
		sa->runnable_sum = sa->runnable_avg * divider;
		sub_positive(&sa->runnable_sum, r * divider);
		/* See sa->util_sum above */
		sa->runnable_sum = max_t(u32, sa->runnable_sum,
					      sa->runnable_avg * PELT_MIN_DIVIDER);

		/*
		 * removed_runnable is the unweighted version of removed_load so we
@@ -3777,17 +3807,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	u32 divider = get_pelt_divider(&cfs_rq->avg);

	dequeue_load_avg(cfs_rq, se);
	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);

	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);

	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
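
Patches 1-3 share one pattern: after removing an entity's contribution, a cfs_rq's *_sum is no longer rewritten as *_avg * divider; the entity's own sum is subtracted and the result is clamped so that *_sum never drops below *_avg * PELT_MIN_DIVIDER. A minimal userspace sketch of that pattern (illustrative names; the divider value is assumed to be LOAD_AVG_MAX - 1024 = 46718 as defined in upstream pelt.h):

#include <stdio.h>

#define TOY_PELT_MIN_DIVIDER	46718U	/* assumed: LOAD_AVG_MAX (47742) - 1024 */

struct toy_avg {
	unsigned long util_avg;
	unsigned long util_sum;
};

/* sub_positive(): subtract without wrapping below zero, like the kernel helper. */
static void toy_sub_positive(unsigned long *val, unsigned long sub)
{
	*val = (*val > sub) ? *val - sub : 0;
}

/* Remove a detached entity's contribution from the cfs_rq averages. */
static void toy_detach(struct toy_avg *rq, unsigned long se_avg, unsigned long se_sum)
{
	toy_sub_positive(&rq->util_avg, se_avg);
	toy_sub_positive(&rq->util_sum, se_sum);

	/*
	 * Relaxed sync: util_sum is no longer rewritten as util_avg * divider,
	 * but it must stay >= util_avg * PELT_MIN_DIVIDER so a later decay or
	 * propagation never sees a sum too small for the average it carries.
	 */
	if (rq->util_sum < rq->util_avg * TOY_PELT_MIN_DIVIDER)
		rq->util_sum = rq->util_avg * TOY_PELT_MIN_DIVIDER;
}

int main(void)
{
	struct toy_avg rq = {
		.util_avg = 100,
		.util_sum = 100UL * TOY_PELT_MIN_DIVIDER + 5000,
	};

	/* Detach an entity whose sum is slightly "ahead" of its average. */
	toy_detach(&rq, 40, 40UL * TOY_PELT_MIN_DIVIDER + 6000);

	printf("util_avg=%lu util_sum=%lu floor=%lu\n",
	       rq.util_avg, rq.util_sum,
	       rq.util_avg * TOY_PELT_MIN_DIVIDER);
	return 0;
}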

@@ -9965,9 +9996,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
 * This is an approximation as the number of running tasks may not be
 * related to the number of busy CPUs due to sched_setaffinity.
 */
static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
	return (dst_running < (dst_weight >> 2));
	return running <= imb_numa_nr;
}

/*
@@ -10106,12 +10137,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
				return idlest;
#endif
			/*
			 * Otherwise, keep the task on this node to stay close
			 * its wakeup source and improve locality. If there is
			 * a real need of migration, periodic load balance will
			 * take care of it.
			 * Otherwise, keep the task close to the wakeup source
			 * and improve locality if the number of running tasks
			 * would remain below threshold where an imbalance is
			 * allowed. If there is a real need of migration,
			 * periodic load balance will take care of it.
			 */
			if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
				return NULL;
		}

@@ -10291,9 +10323,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
#define NUMA_IMBALANCE_MIN 2

static inline long adjust_numa_imbalance(int imbalance,
				int dst_running, int dst_weight)
				int dst_running, int imb_numa_nr)
{
	if (!allow_numa_imbalance(dst_running, dst_weight))
	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
		return imbalance;

	/*
@@ -10405,7 +10437,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
		/* Consider allowing a small imbalance between NUMA groups */
		if (env->sd->flags & SD_NUMA) {
			env->imbalance = adjust_numa_imbalance(env->imbalance,
				busiest->sum_nr_running, busiest->group_weight);
				local->sum_nr_running + 1, env->sd->imb_numa_nr);
		}

		return;
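
Taken together, the fair.c changes replace the old cutoff based on a quarter of the domain or group weight with the topology-derived imb_numa_nr. A worked userspace example of the new behaviour follows; the tail of adjust_numa_imbalance() past the threshold check is not visible in the hunk above, so the sketch assumes the usual upstream behaviour of forgiving imbalances of at most NUMA_IMBALANCE_MIN.

#include <stdio.h>
#include <stdbool.h>

#define NUMA_IMBALANCE_MIN	2

/* New rule: tolerate an imbalance while the destination node runs no more
 * tasks than its imb_numa_nr allowance. */
static bool allow_numa_imbalance(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

static long adjust_numa_imbalance(long imbalance, int dst_running, int imb_numa_nr)
{
	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
		return imbalance;

	/* Assumed upstream tail: a small (task-pair sized) imbalance is ignored. */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	int imb_numa_nr = 4;	/* e.g. a node with 4 LLCs, see topology.c below */

	/* 3 tasks on the destination node, imbalance of 2: left alone. */
	printf("lightly loaded: %ld\n", adjust_numa_imbalance(2, 3, imb_numa_nr));

	/* 6 tasks on the destination node: the imbalance is reported as-is. */
	printf("loaded:         %ld\n", adjust_numa_imbalance(2, 6, imb_numa_nr));
	return 0;
}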
kernel/sched/sched.h (+0 −1)
@@ -2085,7 +2085,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */
#define WF_FORK			0x02		/* Child wakeup after fork */
#define WF_MIGRATED		0x04		/* Internal use, task got migrated */
#define WF_ON_CPU		0x08		/* Wakee is on_cpu */

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
kernel/sched/topology.c (+53 −0)
@@ -2343,6 +2343,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
		}
	}

	/*
	 * Calculate an allowed NUMA imbalance such that LLCs do not get
	 * imbalanced.
	 */
	for_each_cpu(i, cpu_map) {
		unsigned int imb = 0;
		unsigned int imb_span = 1;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			struct sched_domain *child = sd->child;

			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
				struct sched_domain *top, *top_p;
				unsigned int nr_llcs;

				/*
				 * For a single LLC per node, allow an
				 * imbalance up to 25% of the node. This is an
				 * arbitrary cutoff based on SMT-2 to balance
				 * between memory bandwidth and avoiding
				 * premature sharing of HT resources and SMT-4
				 * or SMT-8 *may* benefit from a different
				 * cutoff.
				 *
				 * For multiple LLCs, allow an imbalance
				 * until multiple tasks would share an LLC
				 * on one node while LLCs on another node
				 * remain idle.
				 */
				nr_llcs = sd->span_weight / child->span_weight;
				if (nr_llcs == 1)
					imb = sd->span_weight >> 2;
				else
					imb = nr_llcs;
				sd->imb_numa_nr = imb;

				/* Set span based on the first NUMA domain. */
				top = sd;
				top_p = top->parent;
				while (top_p && !(top_p->flags & SD_NUMA)) {
					top = top->parent;
					top_p = top->parent;
				}
				imb_span = top_p ? top_p->span_weight : sd->span_weight;
			} else {
				int factor = max(1U, (sd->span_weight / imb_span));

				sd->imb_numa_nr = imb * factor;
			}
		}
	}
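	/*
	 * Worked example of the loop above (illustrative arithmetic only,
	 * not part of the patch): take a 2-socket machine with 32 CPUs and
	 * 4 LLCs of 8 CPUs per node, and a single SD_NUMA domain of 64 CPUs.
	 *
	 *   node level:  nr_llcs  = 32 / 8 = 4   (multiple LLCs)
	 *                imb      = nr_llcs = 4
	 *                imb_span = 64            (span of the SD_NUMA parent)
	 *
	 *   NUMA level:  factor      = max(1, 64 / 64) = 1
	 *                imb_numa_nr = imb * factor    = 4
	 *
	 * With a single 32-CPU LLC per node the first step would instead give
	 * imb = 32 >> 2 = 8, i.e. the 25% cutoff described in the comment.
	 */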

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))