Commit 23c8dbb3 authored by zhangwei123171's avatar zhangwei123171
Browse files

sched/fair: introduce cgroup level smt expell

jingdong inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NJY2



-----------------------------------------------------

This feature supports enabling SMT expell at cgroup granularity.
Compared with system-level SMT expell, on the one hand it can
guarantee the SMT expell requirements of specific online services;
on the other hand, it can also improve the CPU utilization of the
whole machine to a certain extent.

system-level smt has 2 per-cpu smt status:
1. QOS_LEVEL_ONLINE
   the current task is not an offline task; for example, all online
   containers and systemd.
2. QOS_LEVEL_OFFLINE
   the current task is an offline task, or the CPU is idle.

cgroup level smt expell has 3 per-cpu smt status:
1. SMT_EXPELLER
   the current task is an online task with the SMT expeller capability.
2. SMT_EXPELLED
   the current task is an offline task, or the CPU is idle.
3. SMT_NONE
   the current task is any other online task (without the expeller capability).

enable:
echo  1 > /sys/fs/cgroup/cpu/xx/cpu.smt_expell

disable:
echo  0 > /sys/fs/cgroup/cpu/xx/cpu.smt_expell

Signed-off-by: default avatarzhangwei123171 <zhangwei123171@jd.com>
Reviewed-by: default avatarzhaoxiaoqiang11 <zhaoxiaoqiang11@jd.com>
parent 3feff602
Loading
Loading
Loading
Loading
+44 −0
Original line number Diff line number Diff line
@@ -8206,6 +8206,9 @@ void __init sched_init(void)
		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
		root_task_group.smt_expell = TG_SMT_EXPELL;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
@@ -8560,6 +8563,9 @@ static inline int alloc_qos_sched_group(struct task_group *tg,
{
	tg->qos_level = parent->qos_level;

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
	tg->smt_expell = parent->smt_expell;
#endif
	return 1;
}

@@ -9463,6 +9469,36 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
/*
 * cgroupfs write handler for cpu.smt_expell: accepts only TG_SMT_NONE (0)
 * or TG_SMT_EXPELL (1) and records the value on the task group.
 * Returns 0 on success, -EINVAL for the root group or an invalid value.
 */
static int cpu_smt_expell_write(struct cgroup_subsys_state *css,
			 struct cftype *cftype, s64 smt_expell)
{
	struct task_group *tg = css_tg(css);

	/* The root task group has no sched entities; reject it. */
	if (tg->se[0] == NULL)
		return -EINVAL;

	/* Only the two defined states are accepted. */
	switch (smt_expell) {
	case TG_SMT_NONE:
	case TG_SMT_EXPELL:
		break;
	default:
		return -EINVAL;
	}

	/*
	 * a. This attribute takes effect with a delay, and it will not take
	 *    effect until the next cfs task is selected.
	 * b. This property creates temporary state inconsistencies, which may
	 *    result in an invalid smt expell, but overall works fine.
	 */
	tg->smt_expell = smt_expell;

	return 0;
}

static inline s64 cpu_smt_expell_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return css_tg(css)->smt_expell;
}
#endif

#ifdef CONFIG_QOS_SCHED
static int tg_change_scheduler(struct task_group *tg, void *data)
{
@@ -9670,6 +9706,14 @@ static struct cftype cpu_legacy_files[] = {
		.write_s64 = cpu_qos_write,
	},
#endif
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
	{
		.name = "smt_expell",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_smt_expell_read,
		.write_s64 = cpu_smt_expell_write,
	},
#endif
#ifdef CONFIG_BPF_SCHED
	{
		.name = "tag",
+27 −21
Original line number Diff line number Diff line
@@ -7948,8 +7948,7 @@ static bool qos_smt_check_siblings_status(int this_cpu)
	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
		if (cpu == this_cpu)
			continue;

		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE)
		if (per_cpu(qos_smt_status, cpu) == CPU_SMT_EXPELLER)
			return true;
	}

@@ -7985,10 +7984,12 @@ static __always_inline bool qos_smt_expelled(int this_cpu)

static bool qos_smt_update_status(struct task_struct *p)
{
	int status = QOS_LEVEL_OFFLINE;
	int status = CPU_SMT_EXPELLED;

	if (p != NULL && !is_offline_level(task_group(p)->qos_level))
		status = QOS_LEVEL_ONLINE;
	if (p != NULL && is_expeller_level(task_group(p)))
		status = CPU_SMT_EXPELLER;
	else if (p != NULL && !is_offline_level(task_group(p)->qos_level))
		status = CPU_SMT_NONE;

	if (__this_cpu_read(qos_smt_status) == status)
		return false;
@@ -8013,11 +8014,13 @@ static void qos_smt_send_ipi(int this_cpu)
		rq = cpu_rq(cpu);

		/*
		* There are two cases where current don't need to send scheduler_ipi:
		* a) The qos_smt_status of siblings cpu is online;
		 * There are two cases where current don't need to send
		 * scheduler_ipi:
		 * a) The qos_smt_status of siblings cpu is SMT_EXPELLER or SMT_NONE
		 * b) The cfs.h_nr_running of siblings cpu is 0.
		 */
		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE ||
		if (per_cpu(qos_smt_status, cpu) == CPU_SMT_EXPELLER ||
			per_cpu(qos_smt_status, cpu) == CPU_SMT_NONE ||
			rq->cfs.h_nr_running == 0)
			continue;

@@ -8061,20 +8064,23 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
			continue;

		/*
		* There are two cases rely on the set need_resched to drive away
		 * There are two cases rely on the set need_resched to drive away or pick up
		 * offline task:
		* a) The qos_smt_status of siblings cpu is online, the task of current cpu is offline;
		* b) The qos_smt_status of siblings cpu is offline, the task of current cpu is idle,
		*    and current cpu only has SCHED_IDLE tasks enqueued.
		 * a) The qos_smt_status of siblings cpu is online with expeller capability,
		 *    the task of current cpu is offline;
		 * b) The qos_smt_status of siblings cpu is offline or online without smt
		 *    capability. the task of current cpu is idle, and current cpu only
		 *    has offline tasks enqueued.
		 */
		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
		if (per_cpu(qos_smt_status, cpu) == CPU_SMT_EXPELLER &&
		    is_offline_level(task_group(current)->qos_level)) {
			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
			return true;
		}

		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
		    rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) {
		if ((per_cpu(qos_smt_status, cpu) == CPU_SMT_EXPELLED ||
			per_cpu(qos_smt_status, cpu) == CPU_SMT_NONE)
			&& rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) {
			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
			return true;
		}
@@ -8146,7 +8152,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
again:
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
	if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) {
		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
		__this_cpu_write(qos_smt_status, CPU_SMT_EXPELLED);

		if (!qos_timer_is_activated(this_cpu))
			start_qos_hrtimer(this_cpu);
+26 −0
Original line number Diff line number Diff line
@@ -460,7 +460,12 @@ struct task_group {
#else
	KABI_RESERVE(1)
#endif

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
	KABI_USE(2, long smt_expell)
#else
	KABI_RESERVE(2)
#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)
};
@@ -1199,6 +1204,19 @@ enum task_qos_level {
void init_qos_hrtimer(int cpu);
#endif

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
enum tg_smt_status {
	TG_SMT_NONE = 0, /* task group without the smt expell capability */
	TG_SMT_EXPELL = 1, /* online task group with the smt expell capability */
};

enum cpu_smt_status {
	CPU_SMT_NONE = 0,     /* current task is online, without the smt expell capability */
	CPU_SMT_EXPELLED = 1, /* current task is offline, or the CPU is idle */
	CPU_SMT_EXPELLER = 2, /* current task is online, with the smt expell capability */
};
#endif

struct sched_group;
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -3064,6 +3082,14 @@ static inline int is_offline_level(long qos_level)
}
#endif

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
/* True when @tg is an online group that has the smt expell capability. */
static inline int is_expeller_level(struct task_group *tg)
{
	if (is_offline_level(tg->qos_level))
		return 0;

	return tg->smt_expell == TG_SMT_EXPELL;
}
#endif

#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
static __always_inline int task_has_qos_idle_policy(struct task_struct *p)
{