Unverified Commit df9cfeee authored by openeuler-ci-bot, committed by Gitee

!844 A sched patchset to improve benchmark performance

Merge Pull Request from: @NNNNicole 
 
1. sched/pelt: Relax the sync of *_sum with *_avg (patches 1-3)
2. Adjust NUMA imbalance for multiple LLCs (patches 4-6)
3. sched: Queue task on wakelist in the same llc if the wakee cpu is idle (patch 7)
4. Clear ttwu_pending after enqueue_task (patch 8)
 
 
Link: https://gitee.com/openeuler/kernel/pulls/844

 

Reviewed-by: Zucheng Zheng <zhengzucheng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents 162d1b0b a6dcd26f
include/linux/sched/topology.h (+5 −1)
@@ -153,8 +153,12 @@ struct sched_domain {
		struct rcu_head rcu;	/* used during destruction */
	};
	struct sched_domain_shared *shared;

#ifndef __GENKSYMS__
	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */
	KABI_FILL_HOLE(unsigned int kabi_hole)
#else
	KABI_RESERVE(1)
#endif
	KABI_RESERVE(2)

	unsigned int span_weight;
kernel/sched/core.c (+25 −19)
@@ -2933,13 +2933,6 @@ void sched_ttwu_pending(void *arg)
	if (!llist)
		return;

	/*
	 * rq::ttwu_pending racy indication of out-standing wakeups.
	 * Races such that false-negatives are possible, since they
	 * are shorter lived that false-positives would be.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

@@ -2953,6 +2946,17 @@ void sched_ttwu_pending(void *arg)
		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
	}

	/*
	 * Must be after enqueueing at least once task such that
	 * idle_cpu() does not observe a false-negative -- if it does,
	 * it is possible for select_idle_siblings() to stack a number
	 * of tasks on this CPU during that window.
	 *
	 * It is ok to clear ttwu_pending when another task pending.
	 * We will receive IPI after local irq enabled and then enqueue it.
	 * Since now nr_running > 0, idle_cpu() will always get correct result.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);
	rq_unlock_irqrestore(rq, &rf);
}
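
As a rough illustration of the window described in the comment above, here is a minimal userspace sketch (not kernel code; the toy_rq and toy_idle_cpu names are made up, the real checks live in idle_cpu() and select_idle_sibling()). With the old ordering, ttwu_pending is already 0 while nr_running is still 0, so a concurrent waker can wrongly treat the CPU as idle and stack further tasks on it; with the new ordering no such point exists.

/* toy_ttwu_order.c: illustrative only, not part of the patch */
#include <stdio.h>

struct toy_rq {
	int nr_running;		/* tasks already enqueued on this CPU */
	int ttwu_pending;	/* remote wakeups queued but not yet enqueued */
};

/* Mirrors the relevant checks in idle_cpu(): idle only if nothing runs
 * and no remote wakeup is pending. */
static int toy_idle_cpu(const struct toy_rq *rq)
{
	return rq->nr_running == 0 && rq->ttwu_pending == 0;
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 0, .ttwu_pending = 1 };

	/* Old ordering: clear the flag before the enqueue loop. */
	rq.ttwu_pending = 0;
	/* A waker sampling the CPU here sees idle_cpu() == 1 even though a
	 * wakeup is still in flight, and may stack more tasks on this CPU. */
	printf("old order, mid-window: idle_cpu() = %d\n", toy_idle_cpu(&rq));
	rq.nr_running++;	/* the deferred enqueue finally happens */

	/* New ordering: enqueue first, clear the flag last. */
	rq = (struct toy_rq){ .nr_running = 0, .ttwu_pending = 1 };
	rq.nr_running++;	/* enqueue while the flag is still set */
	printf("new order, mid-window: idle_cpu() = %d\n", toy_idle_cpu(&rq));
	rq.ttwu_pending = 0;	/* clearing is now safe: nr_running > 0 */

	return 0;
}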

@@ -3026,7 +3030,7 @@ bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
	return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu);
}

static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static inline bool ttwu_queue_cond(int cpu)
{
	/*
	 * If the CPU does not share cache, then queue the task on the
@@ -3035,17 +3039,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
	if (!cpus_share_cache(smp_processor_id(), cpu))
		return true;

	if (cpu == smp_processor_id())
		return false;

	/*
	 * If the task is descheduling and the only running task on the
	 * CPU then use the wakelist to offload the task activation to
	 * the soon-to-be-idle CPU as the current CPU is likely busy.
	 * nr_running is checked to avoid unnecessary task stacking.
	 * If the wakee cpu is idle, or the task is descheduling and the
	 * only running task on the CPU, then use the wakelist to offload
	 * the task activation to the idle (or soon-to-be-idle) CPU as
	 * the current CPU is likely busy. nr_running is checked to
	 * avoid unnecessary task stacking.
	 *
	 * Note that we can only get here with (wakee) p->on_rq=0,
	 * p->on_cpu can be whatever, we've done the dequeue, so
	 * the wakee has been accounted out of ->nr_running.
	 */
	if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
	if (!cpu_rq(cpu)->nr_running)
		return true;

	return false;
@@ -3053,10 +3061,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)

static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
		if (WARN_ON_ONCE(cpu == smp_processor_id()))
			return false;

	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		__ttwu_queue_wakelist(p, cpu, wake_flags);
		return true;
@@ -3333,7 +3338,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	 * scheduling.
	 */
	if (smp_load_acquire(&p->on_cpu) &&
	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
		goto unlock;

	/*
@@ -3895,7 +3900,8 @@ static inline void prepare_task(struct task_struct *next)
	 * Claim the task as running, we do this before switching to it
	 * such that any running task will have this set.
	 *
	 * See the ttwu() WF_ON_CPU case and its ordering comment.
	 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
	 * its ordering comment.
	 */
	WRITE_ONCE(next->on_cpu, 1);
#endif
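
Summarizing patch 7: WF_ON_CPU is dropped and the wakelist is now also used when the wakee CPU is idle in the same LLC, so the (likely busy) waker sends an IPI instead of taking the remote runqueue lock. A condensed userspace restatement of the new decision follows; the toy_* names are stand-ins for the real helpers (cpus_share_cache(), cpu_rq()->nr_running, smp_processor_id()).

#include <stdbool.h>

struct toy_cpu {
	int id;
	int nr_running;		/* tasks currently on the runqueue */
	int llc_id;		/* CPUs with the same llc_id share cache */
};

/* Should the wakeup be queued on the target's wakelist (IPI path)? */
static bool toy_ttwu_queue_cond(const struct toy_cpu *waker,
				const struct toy_cpu *target)
{
	/* Different LLC: always hand the activation to the target CPU. */
	if (waker->llc_id != target->llc_id)
		return true;

	/* Never queue to ourselves; the direct path is cheaper. */
	if (target->id == waker->id)
		return false;

	/* Same LLC: only offload if the target is idle or about to be. */
	return target->nr_running == 0;
}

int main(void)
{
	struct toy_cpu waker = { .id = 0, .nr_running = 1, .llc_id = 0 };
	struct toy_cpu wakee = { .id = 3, .nr_running = 0, .llc_id = 0 };

	/* Idle sibling in the same LLC: now queued on the wakelist. */
	return toy_ttwu_queue_cond(&waker, &wakee) ? 0 : 1;
}

Compared with the old condition, the wake_flags & WF_ON_CPU qualifier is gone, and the WARN_ON_ONCE against queueing to the local CPU becomes the explicit early return for cpu == smp_processor_id().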
kernel/sched/fair.c (+75 −43)
@@ -1524,6 +1524,7 @@ struct task_numa_env {

	int src_cpu, src_nid;
	int dst_cpu, dst_nid;
	int imb_numa_nr;

	struct numa_stats src_stats, dst_stats;

@@ -1539,7 +1540,7 @@ static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
					int dst_running, int dst_weight);
					int dst_running, int imb_numa_nr);

static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1920,7 +1921,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
		dst_running = env->dst_stats.nr_running + 1;
		imbalance = max(0, dst_running - src_running);
		imbalance = adjust_numa_imbalance(imbalance, dst_running,
							env->dst_stats.weight);
						  env->imb_numa_nr);

		/* Use idle CPU if there is no imbalance */
		if (!imbalance) {
@@ -1985,8 +1986,10 @@ static int task_numa_migrate(struct task_struct *p)
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
	if (sd) {
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
		env.imb_numa_nr = sd->imb_numa_nr;
	}
	rcu_read_unlock();

	/*
@@ -3084,6 +3087,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3449,11 +3455,11 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
	u32 divider;
	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
	u32 new_sum, divider;

	/* Nothing to update */
	if (!delta)
	if (!delta_avg)
		return;

	/*
@@ -3462,23 +3468,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
	 */
	divider = get_pelt_divider(&cfs_rq->avg);


	/* Set new sched_entity's utilization */
	se->avg.util_avg = gcfs_rq->avg.util_avg;
	se->avg.util_sum = se->avg.util_avg * divider;
	new_sum = se->avg.util_avg * divider;
	delta_sum = (long)new_sum - (long)se->avg.util_sum;
	se->avg.util_sum = new_sum;

	/* Update parent cfs_rq utilization */
	add_positive(&cfs_rq->avg.util_avg, delta);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
	add_positive(&cfs_rq->avg.util_avg, delta_avg);
	add_positive(&cfs_rq->avg.util_sum, delta_sum);

	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
	u32 divider;
	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
	u32 new_sum, divider;

	/* Nothing to update */
	if (!delta)
	if (!delta_avg)
		return;

	/*
@@ -3489,19 +3502,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf

	/* Set new sched_entity's runnable */
	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
	se->avg.runnable_sum = se->avg.runnable_avg * divider;
	new_sum = se->avg.runnable_avg * divider;
	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
	se->avg.runnable_sum = new_sum;

	/* Update parent cfs_rq runnable */
	add_positive(&cfs_rq->avg.runnable_avg, delta);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	unsigned long load_avg;
	u64 load_sum = 0;
	s64 delta_sum;
	u32 divider;

	if (!runnable_sum)
@@ -3528,7 +3547,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
		 * assuming all tasks are equally runnable.
		 */
		if (scale_load_down(gcfs_rq->load.weight)) {
			load_sum = div_s64(gcfs_rq->avg.load_sum,
			load_sum = div_u64(gcfs_rq->avg.load_sum,
				scale_load_down(gcfs_rq->load.weight));
		}

@@ -3545,16 +3564,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
	runnable_sum = max(runnable_sum, running_sum);

	load_sum = (s64)se_weight(se) * runnable_sum;
	load_avg = div_s64(load_sum, divider);
	load_sum = se_weight(se) * runnable_sum;
	load_avg = div_u64(load_sum, divider);

	delta = load_avg - se->avg.load_avg;
	delta_avg = load_avg - se->avg.load_avg;
	if (!delta_avg)
		return;

	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;

	se->avg.load_sum = runnable_sum;
	se->avg.load_avg = load_avg;

	add_positive(&cfs_rq->avg.load_avg, delta);
	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
	add_positive(&cfs_rq->avg.load_avg, delta_avg);
	add_positive(&cfs_rq->avg.load_sum, delta_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3670,7 +3695,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

		r = removed_load;
		sub_positive(&sa->load_avg, r);
		sa->load_sum = sa->load_avg * divider;
		sub_positive(&sa->load_sum, r * divider);
		/* See sa->util_sum below */
		sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);

		r = removed_util;
		sub_positive(&sa->util_avg, r);
@@ -3690,7 +3717,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

		r = removed_runnable;
		sub_positive(&sa->runnable_avg, r);
		sa->runnable_sum = sa->runnable_avg * divider;
		sub_positive(&sa->runnable_sum, r * divider);
		/* See sa->util_sum above */
		sa->runnable_sum = max_t(u32, sa->runnable_sum,
					      sa->runnable_avg * PELT_MIN_DIVIDER);

		/*
		 * removed_runnable is the unweighted version of removed_load so we
@@ -3777,17 +3807,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	u32 divider = get_pelt_divider(&cfs_rq->avg);

	dequeue_load_avg(cfs_rq, se);
	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);

	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
	/* See update_cfs_rq_load_avg() */
	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);

	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
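
Patches 1-3 share one pattern: after removing an entity's contribution, a cfs_rq's *_sum is no longer rewritten as *_avg * divider; the entity's own sum is subtracted and the result is clamped so that *_sum never drops below *_avg * PELT_MIN_DIVIDER. A minimal userspace sketch of that pattern (illustrative names; the divider value is assumed to be LOAD_AVG_MAX - 1024 = 46718 as defined in upstream pelt.h):

#include <stdio.h>

#define TOY_PELT_MIN_DIVIDER	46718U	/* assumed: LOAD_AVG_MAX (47742) - 1024 */

struct toy_avg {
	unsigned long util_avg;
	unsigned long util_sum;
};

/* sub_positive(): subtract without wrapping below zero, like the kernel helper. */
static void toy_sub_positive(unsigned long *val, unsigned long sub)
{
	*val = (*val > sub) ? *val - sub : 0;
}

/* Remove a detached entity's contribution from the cfs_rq averages. */
static void toy_detach(struct toy_avg *rq, unsigned long se_avg, unsigned long se_sum)
{
	toy_sub_positive(&rq->util_avg, se_avg);
	toy_sub_positive(&rq->util_sum, se_sum);

	/*
	 * Relaxed sync: util_sum is no longer rewritten as util_avg * divider,
	 * but it must stay >= util_avg * PELT_MIN_DIVIDER so a later decay or
	 * propagation never sees a sum too small for the average it carries.
	 */
	if (rq->util_sum < rq->util_avg * TOY_PELT_MIN_DIVIDER)
		rq->util_sum = rq->util_avg * TOY_PELT_MIN_DIVIDER;
}

int main(void)
{
	struct toy_avg rq = {
		.util_avg = 100,
		.util_sum = 100UL * TOY_PELT_MIN_DIVIDER + 5000,
	};

	/* Detach an entity whose sum is slightly "ahead" of its average. */
	toy_detach(&rq, 40, 40UL * TOY_PELT_MIN_DIVIDER + 6000);

	printf("util_avg=%lu util_sum=%lu floor=%lu\n",
	       rq.util_avg, rq.util_sum,
	       rq.util_avg * TOY_PELT_MIN_DIVIDER);
	return 0;
}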

@@ -9965,9 +9996,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
 * This is an approximation as the number of running tasks may not be
 * related to the number of busy CPUs due to sched_setaffinity.
 */
static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
	return (dst_running < (dst_weight >> 2));
	return running <= imb_numa_nr;
}

/*
@@ -10106,12 +10137,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
				return idlest;
#endif
			/*
			 * Otherwise, keep the task on this node to stay close
			 * its wakeup source and improve locality. If there is
			 * a real need of migration, periodic load balance will
			 * take care of it.
			 * Otherwise, keep the task close to the wakeup source
			 * and improve locality if the number of running tasks
			 * would remain below threshold where an imbalance is
			 * allowed. If there is a real need of migration,
			 * periodic load balance will take care of it.
			 */
			if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
				return NULL;
		}

@@ -10291,9 +10323,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
#define NUMA_IMBALANCE_MIN 2

static inline long adjust_numa_imbalance(int imbalance,
				int dst_running, int dst_weight)
				int dst_running, int imb_numa_nr)
{
	if (!allow_numa_imbalance(dst_running, dst_weight))
	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
		return imbalance;

	/*
@@ -10405,7 +10437,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
		/* Consider allowing a small imbalance between NUMA groups */
		if (env->sd->flags & SD_NUMA) {
			env->imbalance = adjust_numa_imbalance(env->imbalance,
				busiest->sum_nr_running, busiest->group_weight);
				local->sum_nr_running + 1, env->sd->imb_numa_nr);
		}

		return;
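
Taken together, the fair.c changes replace the old cutoff based on a quarter of the domain or group weight with the topology-derived imb_numa_nr. A worked userspace example of the new behaviour follows; the tail of adjust_numa_imbalance() past the threshold check is not visible in the hunk above, so the sketch assumes the usual upstream behaviour of forgiving imbalances of at most NUMA_IMBALANCE_MIN.

#include <stdio.h>
#include <stdbool.h>

#define NUMA_IMBALANCE_MIN	2

/* New rule: tolerate an imbalance while the destination node runs no more
 * tasks than its imb_numa_nr allowance. */
static bool allow_numa_imbalance(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

static long adjust_numa_imbalance(long imbalance, int dst_running, int imb_numa_nr)
{
	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
		return imbalance;

	/* Assumed upstream tail: a small (task-pair sized) imbalance is ignored. */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	int imb_numa_nr = 4;	/* e.g. a node with 4 LLCs, see topology.c below */

	/* 3 tasks on the destination node, imbalance of 2: left alone. */
	printf("lightly loaded: %ld\n", adjust_numa_imbalance(2, 3, imb_numa_nr));

	/* 6 tasks on the destination node: the imbalance is reported as-is. */
	printf("loaded:         %ld\n", adjust_numa_imbalance(2, 6, imb_numa_nr));
	return 0;
}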
kernel/sched/sched.h (+0 −1)
@@ -2085,7 +2085,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */
#define WF_FORK			0x02		/* Child wakeup after fork */
#define WF_MIGRATED		0x04		/* Internal use, task got migrated */
#define WF_ON_CPU		0x08		/* Wakee is on_cpu */

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
kernel/sched/topology.c (+53 −0)
@@ -2343,6 +2343,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
		}
	}

	/*
	 * Calculate an allowed NUMA imbalance such that LLCs do not get
	 * imbalanced.
	 */
	for_each_cpu(i, cpu_map) {
		unsigned int imb = 0;
		unsigned int imb_span = 1;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			struct sched_domain *child = sd->child;

			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
				struct sched_domain *top, *top_p;
				unsigned int nr_llcs;

				/*
				 * For a single LLC per node, allow an
				 * imbalance up to 25% of the node. This is an
				 * arbitrary cutoff based on SMT-2 to balance
				 * between memory bandwidth and avoiding
				 * premature sharing of HT resources and SMT-4
				 * or SMT-8 *may* benefit from a different
				 * cutoff.
				 *
				 * For multiple LLCs, allow an imbalance
				 * until multiple tasks would share an LLC
				 * on one node while LLCs on another node
				 * remain idle.
				 */
				nr_llcs = sd->span_weight / child->span_weight;
				if (nr_llcs == 1)
					imb = sd->span_weight >> 2;
				else
					imb = nr_llcs;
				sd->imb_numa_nr = imb;

				/* Set span based on the first NUMA domain. */
				top = sd;
				top_p = top->parent;
				while (top_p && !(top_p->flags & SD_NUMA)) {
					top = top->parent;
					top_p = top->parent;
				}
				imb_span = top_p ? top_p->span_weight : sd->span_weight;
			} else {
				int factor = max(1U, (sd->span_weight / imb_span));

				sd->imb_numa_nr = imb * factor;
			}
		}
	}
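	/*
	 * Worked example of the loop above (illustrative arithmetic only,
	 * not part of the patch): take a 2-socket machine with 32 CPUs and
	 * 4 LLCs of 8 CPUs per node, and a single SD_NUMA domain of 64 CPUs.
	 *
	 *   node level:  nr_llcs  = 32 / 8 = 4   (multiple LLCs)
	 *                imb      = nr_llcs = 4
	 *                imb_span = 64            (span of the SD_NUMA parent)
	 *
	 *   NUMA level:  factor      = max(1, 64 / 64) = 1
	 *                imb_numa_nr = imb * factor    = 4
	 *
	 * With a single 32-CPU LLC per node the first step would instead give
	 * imb = 32 >> 2 = 8, i.e. the 25% cutoff described in the comment.
	 */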

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))