Commit 713cfd26 authored by Hui Tang, committed by Zhang Changzhong

sched: Introduce smart grid scheduling strategy for cfs

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7BQZ0


CVE: NA

----------------------------------------

We want to dynamically expand or shrink the affinity range of tasks
along the CPU topology levels while still meeting their minimum
resource requirements.

We divide the affinity domains into several levels according to the
sched domains:

level4   * SOCKET  [                                                  ]
level3   * DIE     [                             ]
level2   * MC      [             ] [             ]
level1   * SMT     [     ] [     ] [     ] [     ]
level0   * CPU      0   1   2   3   4   5   6   7
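
As a rough illustration, the per-level cpumasks can be derived by
walking the sched domain hierarchy upwards from a CPU. The sketch
below is illustrative only: init_affinity_domains() and AD_LEVEL_MAX
are hypothetical names, not necessarily the exact helpers this patch
adds.

	/* Minimal sketch: callers would hold the RCU/hotplug locks that
	 * for_each_domain() requires. */
	static void init_affinity_domains(int cpu, struct affinity_domain *ad)
	{
		struct sched_domain *sd;
		int level = 0;

		/* Level 0 is the CPU itself. */
		cpumask_copy(ad->domains[level++], cpumask_of(cpu));

		/* Each parent sched domain spans a wider range of CPUs. */
		for_each_domain(cpu, sd) {
			if (level >= AD_LEVEL_MAX)
				break;
			cpumask_copy(ad->domains[level++], sched_domain_span(sd));
		}

		ad->dcount = level;
		ad->curr_level = 0;	/* start from the narrowest range */
	}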

Whether users prefer power saving or performance affects the strategy
for adjusting affinity. When the power saving mode is selected, we
choose a more appropriate affinity based on the energy model to reduce
power consumption, while still considering the QoS of resources such
as CPU and memory. For instance, if the CPU load of a task group is
lower than required, smart grid decides, according to the energy
model, whether to aggregate its tasks into a smaller range.
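
As a sketch of that decision (tg_try_adjust_level() is a hypothetical
name and capacity_of() is used as in fair.c; this is not the exact
policy the patch implements), the group could simply pick the
narrowest level whose capacity still covers its current utilization:

	/* Hypothetical policy: choose the smallest affinity level that
	 * still satisfies the group's current CPU utilization. */
	static void tg_try_adjust_level(struct auto_affinity *auto_affi,
					unsigned long group_util)
	{
		struct affinity_domain *ad = &auto_affi->ad;
		int level, cpu;

		for (level = 0; level < ad->dcount; level++) {
			unsigned long cap = 0;

			for_each_cpu(cpu, ad->domains[level])
				cap += capacity_of(cpu);

			/* Aggregate tasks into the smallest range that fits. */
			if (group_util < cap) {
				ad->curr_level = level;
				return;
			}
		}
	}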

The main difference from EAS is that we pay more attention to the
power consumption impact of mechanisms such as cpuidle and DVFS, and
we classify tasks to reduce interference and to ensure resource QoS
within each divided unit, which is more suitable for general-purpose
workloads on non-heterogeneous CPUs.

        --------        --------        --------
       | group0 |      | group1 |      | group2 |
        --------        --------        --------
            |               |               |
            v               |               v
       ---------------------+------     -----------------
      |                  ---v---   |   |                 |
      |       DIE0      |  MC1  |  |   |      DIE1       |
      |                  -------   |   |                 |
       ----------------------------     -----------------

We regularly account the resource satisfaction of each group and
adjust its affinity accordingly; scheduling balance and memory
migration are considered based on memory location to better meet the
resource requirements.
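
The periodic accounting maps naturally onto an hrtimer armed with
auto_affi->period. The callback below is a sketch under that
assumption; the period_timer field and affinity_domain_update() are
illustrative names, not necessarily those added by this patch.

	static enum hrtimer_restart affinity_period_timer(struct hrtimer *timer)
	{
		struct auto_affinity *auto_affi =
			container_of(timer, struct auto_affinity, period_timer);

		/* Re-evaluate resource satisfaction and adjust curr_level. */
		affinity_domain_update(auto_affi);

		hrtimer_forward_now(timer, auto_affi->period);
		return HRTIMER_RESTART;
	}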

Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
parent aaf2ccb4
+13 −0
@@ -386,6 +386,16 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
		   cpumask_pr_args(&task->cpus_allowed));
}

#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Cpus_preferred:\t%*pb\n",
		   cpumask_pr_args(task->prefer_cpus));
	seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
		   cpumask_pr_args(task->prefer_cpus));
}
#endif

static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
{
	seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
@@ -414,6 +424,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
	task_cpus_allowed(m, task);
	cpuset_task_status_allowed(m, task);
	task_context_switch_counts(m, task);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
	task_cpus_preferred(m, task);
#endif
	return 0;
}

+13 −0
@@ -2000,4 +2000,17 @@ int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
#endif

#ifdef CONFIG_QOS_SCHED_SMART_GRID
extern struct static_key __smart_grid_used;
static inline bool smart_grid_used(void)
{
	return static_key_false(&__smart_grid_used);
}
#else
static inline bool smart_grid_used(void)
{
	return false;
}
#endif

#endif
+4 −0
@@ -36,6 +36,10 @@ extern unsigned int sysctl_sched_child_runs_first;
extern int sysctl_sched_util_low_pct;
#endif

#ifdef CONFIG_QOS_SCHED_SMART_GRID
extern int sysctl_affinity_adjust_delay_ms;
#endif

enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,
	SCHED_TUNABLESCALING_LOG,
+13 −0
@@ -834,6 +834,19 @@ config QOS_SCHED_DYNAMIC_AFFINITY
	 of taskgroup is below the set threshold, otherwise make taskgroup use
	 cpus allowed.

config QOS_SCHED_SMART_GRID
	bool "qos smart grid scheduler"
	depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY
	default n
	help
	 This feature is used for power consumption tuning in server scenarios.
	 It can be divided into the following aspects:
	  1. User interface to manage user requirements.
	  2. Collect tasks' features to ensure key tasks' QoS.
	  3. Weaken the impact of CPU frequency and cpuidle adjustment
	     on tasks.
	  4. Dock with the EAS (Energy Aware Scheduling) model.

config CGROUP_PIDS
	bool "PIDs controller"
	help
+147 −0
@@ -5842,6 +5842,7 @@ int sched_cpu_activate(unsigned int cpu)
		static_branch_inc_cpuslocked(&sched_smt_present);
#endif
	set_cpu_active(cpu, true);
	tg_update_affinity_domains(cpu, 1);

	if (sched_smp_initialized) {
		sched_domains_numa_masks_set(cpu);
@@ -5900,6 +5901,7 @@ int sched_cpu_deactivate(unsigned int cpu)
		return ret;
	}
	sched_domains_numa_masks_clear(cpu);
	tg_update_affinity_domains(cpu, 0);
	return 0;
}

@@ -5970,6 +5972,8 @@ void __init sched_init_smp(void)
	init_sched_dl_class();

	sched_smp_initialized = true;

	init_auto_affinity(&root_task_group);
}

static int __init migration_init(void)
@@ -6530,6 +6534,9 @@ void sched_move_task(struct task_struct *tsk)
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq_flags rf;
	struct rq *rq;
#ifdef CONFIG_QOS_SCHED_SMART_GRID
	struct affinity_domain *ad;
#endif

	rq = task_rq_lock(tsk, &rf);
	update_rq_clock(rq);
@@ -6550,6 +6557,14 @@ void sched_move_task(struct task_struct *tsk)
		set_curr_task(rq, tsk);

	task_rq_unlock(rq, tsk, &rf);

#ifdef CONFIG_QOS_SCHED_SMART_GRID
	if (smart_grid_used()) {
		ad = &task_group(tsk)->auto_affinity->ad;
		set_prefer_cpus_ptr(tsk, ad->domains[ad->curr_level]);
	}
#endif

}

static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -6969,6 +6984,117 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_QOS_SCHED_SMART_GRID
int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
{
	struct auto_affinity *auto_affi = tg->auto_affinity;
	int ret = 0;

	raw_spin_lock_irq(&auto_affi->lock);

	/* auto mode */
	if (mode == 1) {
		start_auto_affinity(auto_affi);
	} else if (mode == 0) {
		stop_auto_affinity(auto_affi);
	} else {
		raw_spin_unlock_irq(&auto_affi->lock);
		return -EINVAL;
	}

	auto_affi->mode = mode;
	raw_spin_unlock_irq(&auto_affi->lock);

	return ret;
}

static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
					      struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg->auto_affinity->mode;
}

static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, u64 mode)
{
	return tg_set_dynamic_affinity_mode(css_tg(css), mode);
}

int tg_set_affinity_period(struct task_group *tg, u64 period_ms)
{
	if (period_ms > U64_MAX / NSEC_PER_MSEC)
		return -EINVAL;

	raw_spin_lock_irq(&tg->auto_affinity->lock);
	tg->auto_affinity->period = ms_to_ktime(period_ms);
	raw_spin_unlock_irq(&tg->auto_affinity->lock);
	return 0;
}

u64 tg_get_affinity_period(struct task_group *tg)
{
	return ktime_to_ms(tg->auto_affinity->period);
}

static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css,
					  struct cftype *cftype, u64 period)
{
	return tg_set_affinity_period(css_tg(css), period);
}

static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return tg_get_affinity_period(css_tg(css));
}

static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css,
					      struct cftype *cftype,
					      u64 mask)
{
	struct task_group *tg = css_tg(css);
	struct affinity_domain *ad = &tg->auto_affinity->ad;
	u16 full = (1 << ad->dcount) - 1;

	if (mask > full)
		return -EINVAL;

	raw_spin_lock_irq(&tg->auto_affinity->lock);
	ad->domain_mask = mask;
	raw_spin_unlock_irq(&tg->auto_affinity->lock);
	return 0;
}

static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css,
					     struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg->auto_affinity->ad.domain_mask;
}

static int cpu_affinity_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct auto_affinity *auto_affi = tg->auto_affinity;
	struct affinity_domain *ad = &auto_affi->ad;
	int i;

	seq_printf(sf, "period_active %d\n", auto_affi->period_active);
	seq_printf(sf, "dcount %d\n", ad->dcount);
	seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask);
	seq_printf(sf, "curr_level %d\n", ad->curr_level);
	for (i = 0; i < ad->dcount; i++)
		seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n",
			i, cpumask_pr_args(ad->domains[i]),
			schedstat_val(ad->stay_cnt[i]));

	return 0;
}
#endif /* CONFIG_QOS_SCHED_SMART_GRID */

#ifdef CONFIG_QOS_SCHED
static int tg_change_scheduler(struct task_group *tg, void *data)
{
@@ -7073,6 +7199,27 @@ static struct cftype cpu_legacy_files[] = {
		.read_s64 = cpu_qos_read,
		.write_s64 = cpu_qos_write,
	},
#endif
#ifdef CONFIG_QOS_SCHED_SMART_GRID
	{
		.name = "dynamic_affinity_mode",
		.read_u64 = cpu_affinity_mode_read_u64,
		.write_u64 = cpu_affinity_mode_write_u64,
	},
	{
		.name = "affinity_period_ms",
		.read_u64 = cpu_affinity_period_read_uint,
		.write_u64 = cpu_affinity_period_write_uint,
	},
	{
		.name = "affinity_domain_mask",
		.read_u64 = cpu_affinity_domain_mask_read_u64,
		.write_u64 = cpu_affinity_domain_mask_write_u64,
	},
	{
		.name = "affinity_stat",
		.seq_show = cpu_affinity_stat_show,
	},
#endif
	{ }	/* Terminate */
};
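
Usage example for the new cgroup files (paths are illustrative and
assume the cgroup v1 cpu controller is mounted at /sys/fs/cgroup/cpu):

	# enable auto affinity adjustment for a group
	echo 1 > /sys/fs/cgroup/cpu/mygrp/cpu.dynamic_affinity_mode
	# adjust every 100ms, and allow only the first three domain levels
	echo 100 > /sys/fs/cgroup/cpu/mygrp/cpu.affinity_period_ms
	echo 0x7 > /sys/fs/cgroup/cpu/mygrp/cpu.affinity_domain_mask
	# inspect dcount, curr_level and per-level stay counts
	cat /sys/fs/cgroup/cpu/mygrp/cpu.affinity_stat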