Commit 89bf80a4 authored by Song Zhang

sched: Introduce priority load balance for qos scheduler

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MWDD


CVE: NA

-------------------------------------------------

Add a new sysctl interface:
/proc/sys/kernel/sched_prio_load_balance_enabled

0: default behavior (disabled)
1: enable priority load balance for the QoS scheduler
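
For example, on a kernel built with CONFIG_QOS_SCHED_PRIO_LB=y:

  echo 1 > /proc/sys/kernel/sched_prio_load_balance_enabled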

For task co-location with the QoS scheduler, it is reasonable for CFS
load balancing to prefer migrating online (latency-sensitive) tasks.
The CFS load-balance behavior is therefore changed as follows:

The cfs_tasks list now holds only online tasks.
A new cfs_offline_tasks list holds the offline tasks.
Load balancing prefers to migrate online tasks from the cfs_tasks list
to the dst rq, and falls back to offline tasks only afterwards.
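
As a minimal user-space sketch of this pick order (illustrative only,
not the kernel code: the task names, the simple singly linked lists and
the imbalance counter are stand-ins for the rq lists and lb_env), load
balance drains the online list first and touches the offline list only
while imbalance remains, mirroring the loop/goto-again structure in the
detach_tasks() hunk below:

  #include <stdio.h>

  struct task { const char *name; struct task *next; };

  /* Pop the first task off a singly linked list, or return NULL. */
  static struct task *pick_one(struct task **list)
  {
  	struct task *t = *list;

  	if (t)
  		*list = t->next;
  	return t;
  }

  int main(void)
  {
  	struct task off2 = { "offline-2", NULL };
  	struct task off1 = { "offline-1", &off2 };
  	struct task on1  = { "online-1",  NULL };
  	struct task *online = &on1, *offline = &off1;
  	int imbalance = 2;	/* pretend two tasks must be moved */

  	/* Pass 1 scans online tasks; pass 2 runs only if imbalance remains. */
  	for (int pass = 0; pass < 2 && imbalance > 0; pass++) {
  		struct task **tasks = pass ? &offline : &online;
  		struct task *t;

  		while (imbalance > 0 && (t = pick_one(tasks))) {
  			printf("migrate %s\n", t->name);
  			imbalance--;
  		}
  	}
  	return 0;
  }
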
Consider the hyperthread-interference scenario with the SMT expeller
feature enabled: CPU A and CPU B are two hyperthreads of one physical
core, CPU A runs online tasks while CPU B has only offline tasks. The
offline tasks on CPU B are expelled by the online tasks on CPU A and
cannot be scheduled. When load balance is triggered, however, the load
on the two CPUs may already be balanced before CPU B can migrate any
online tasks from CPU A. As a result, CPU B cannot run online tasks,
and online tasks are not evenly distributed among the CPUs. Migrating
online tasks first lets CPU B pull runnable online work instead of
being left with expelled offline tasks.

Signed-off-by: Song Zhang <zhangsong34@huawei.com>
parent f31cd96b
+1 −0
@@ -166,6 +166,7 @@ CONFIG_CGROUP_WRITEBACK=y
CONFIG_CGROUP_V1_WRITEBACK=y
CONFIG_CGROUP_SCHED=y
CONFIG_QOS_SCHED=y
CONFIG_QOS_SCHED_PRIO_LB=y
CONFIG_FAIR_GROUP_SCHED=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
+1 −0
@@ -188,6 +188,7 @@ CONFIG_CGROUP_WRITEBACK=y
CONFIG_CGROUP_V1_WRITEBACK=y
CONFIG_CGROUP_SCHED=y
CONFIG_QOS_SCHED=y
CONFIG_QOS_SCHED_PRIO_LB=y
CONFIG_FAIR_GROUP_SCHED=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
+9 −0
@@ -1031,6 +1031,15 @@ config QOS_SCHED

	  If in doubt, say N.

config QOS_SCHED_PRIO_LB
	bool "Priority load balance for Qos scheduler"
	depends on QOS_SCHED
	default n
	help
	  This feature enables priority load balance for the QoS
	  scheduler: when balancing, online tasks are preferred for
	  migration and offline tasks are migrated only afterwards.

config FAIR_GROUP_SCHED
	bool "Group scheduling for SCHED_OTHER"
	depends on CGROUP_SCHED
+3 −0
@@ -10045,6 +10045,9 @@ void __init sched_init(void)
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);
#ifdef CONFIG_QOS_SCHED_PRIO_LB
		INIT_LIST_HEAD(&rq->cfs_offline_tasks);
#endif

		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
+82 −1
@@ -147,6 +147,10 @@ static int hundred_thousand = 100000;
static int unthrottle_qos_cfs_rqs(int cpu);
#endif

#ifdef CONFIG_QOS_SCHED_PRIO_LB
unsigned int sysctl_sched_prio_load_balance_enabled;
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -234,6 +238,17 @@ static struct ctl_table sched_fair_sysctls[] = {
		.extra1         = SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
#endif
#ifdef CONFIG_QOS_SCHED_PRIO_LB
	{
		.procname	= "sched_prio_load_balance_enabled",
		.data		= &sysctl_sched_prio_load_balance_enabled,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif
	{}
};
@@ -3585,6 +3600,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)

#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_QOS_SCHED_PRIO_LB
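/*
 * Queue the entity on cfs_offline_tasks when priority load balance is
 * enabled and its task group is offline (qos_level == -1); otherwise
 * keep it on the normal cfs_tasks list.
 */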
static void
adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
	struct rq *rq,
	struct sched_entity *se)
{
	struct task_group *tg = task_group(task_of(se));

	if (sysctl_sched_prio_load_balance_enabled && tg->qos_level == -1)
		(*list_op)(&se->group_node, &rq->cfs_offline_tasks);
	else
		(*list_op)(&se->group_node, &rq->cfs_tasks);
}
#endif

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3594,7 +3624,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
		struct rq *rq = rq_of(cfs_rq);

		account_numa_enqueue(rq, task_of(se));
#ifdef CONFIG_QOS_SCHED_PRIO_LB
		adjust_rq_cfs_tasks(list_add, rq, se);
#else
		list_add(&se->group_node, &rq->cfs_tasks);
#endif
	}
#endif
	cfs_rq->nr_running++;
@@ -8885,7 +8919,11 @@ done: __maybe_unused;
	 * the list, so our cfs_tasks list becomes MRU
	 * one.
	 */
#ifdef CONFIG_QOS_SCHED_PRIO_LB
	adjust_rq_cfs_tasks(list_move, rq, &p->se);
#else
	list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
#endif

	if (hrtick_enabled_fair(rq))
@@ -9233,6 +9271,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
	    (&p->se == cfs_rq_of(&p->se)->next))
		return 1;

#ifdef CONFIG_QOS_SCHED_PRIO_LB
	/* Ignore migration cost when the task can preempt a cache-sharing, SCHED_IDLE-only dst CPU */
	if (sysctl_sched_prio_load_balance_enabled &&
	    cpus_share_cache(env->src_cpu, env->dst_cpu) &&
	    sched_idle_cpu(env->dst_cpu))
		return 0;
#endif

	if (sysctl_sched_migration_cost == -1)
		return 1;

@@ -9432,11 +9478,18 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
static struct task_struct *detach_one_task(struct lb_env *env)
{
	struct task_struct *p;
	struct list_head *tasks = &env->src_rq->cfs_tasks;
#ifdef CONFIG_QOS_SCHED_PRIO_LB
	int loop = 0;
#endif

	lockdep_assert_rq_held(env->src_rq);

#ifdef CONFIG_QOS_SCHED_PRIO_LB
again:
#endif
	list_for_each_entry_reverse(p,
			tasks, se.group_node) {
		if (!can_migrate_task(p, env))
			continue;

@@ -9451,6 +9504,15 @@ static struct task_struct *detach_one_task(struct lb_env *env)
		schedstat_inc(env->sd->lb_gained[env->idle]);
		return p;
	}
#ifdef CONFIG_QOS_SCHED_PRIO_LB
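	/* No migratable online task was found: retry once over the offline list */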
	if (sysctl_sched_prio_load_balance_enabled) {
		loop++;
		if (loop == 1) {
			tasks = &env->src_rq->cfs_offline_tasks;
			goto again;
		}
	}
#endif
	return NULL;
}

@@ -9466,6 +9528,9 @@ static int detach_tasks(struct lb_env *env)
	unsigned long util, load;
	struct task_struct *p;
	int detached = 0;
#ifdef CONFIG_QOS_SCHED_PRIO_LB
	int loop = 0;
#endif

	lockdep_assert_rq_held(env->src_rq);

@@ -9481,6 +9546,9 @@ static int detach_tasks(struct lb_env *env)
	if (env->imbalance <= 0)
		return 0;

#ifdef CONFIG_QOS_SCHED_PRIO_LB
again:
#endif
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -9586,6 +9654,15 @@ static int detach_tasks(struct lb_env *env)
		list_move(&p->se.group_node, tasks);
	}

#ifdef CONFIG_QOS_SCHED_PRIO_LB
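	/* The online pass left imbalance: retry once over the offline list */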
	if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) {
		loop++;
		if (loop == 1) {
			tasks = &env->src_rq->cfs_offline_tasks;
			goto again;
		}
	}
#endif
	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
@@ -13212,7 +13289,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
		 * Move the next running task to the front of the list, so our
		 * cfs_tasks list becomes MRU one.
		 */
#ifdef CONFIG_QOS_SCHED_PRIO_LB
		adjust_rq_cfs_tasks(list_move, rq, se);
#else
		list_move(&se->group_node, &rq->cfs_tasks);
#endif
	}
#endif
