Commit 926f2241 authored by Waiman Long, committed by Chen Ridong

cgroup/cpuset: Fix remote root partition creation problem

mainline inclusion
from mainline-v6.11-rc1
commit ccac8e8de99cbcf5e7f53251ebce917bf7bcc29c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAOPW2

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ccac8e8de99cbcf5e7f53251ebce917bf7bcc29c



----------------------------------------------------------------------

Since commit 181c8e091aae ("cgroup/cpuset: Introduce remote partition"),
a remote partition can be created underneath a non-partition root cpuset
as long as its exclusive_cpus are set to distribute exclusive CPUs down
to its children. The generate_sched_domains() function, however, doesn't
take into account this new behavior and hence will fail to create the
sched domain needed for a remote root (non-isolated) partition.
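
As an illustration (not from the patch itself), a remote root partition can be set up from user space roughly as follows. The mount point, group names and CPU list are hypothetical, and the exact prerequisites (e.g. whether cpuset.cpus must also be populated) depend on the kernel version:

  /* sketch: create a remote root partition under a non-partition cpuset */
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/stat.h>

  static void write_file(const char *path, const char *val)
  {
  	FILE *f = fopen(path, "w");

  	if (!f || fputs(val, f) == EOF) {
  		perror(path);
  		exit(1);
  	}
  	fclose(f);
  }

  int main(void)
  {
  	/* let child cgroups use the cpuset controller */
  	write_file("/sys/fs/cgroup/cgroup.subtree_control", "+cpuset");

  	/* "mid" is NOT a partition root; "remote" will become one */
  	mkdir("/sys/fs/cgroup/mid", 0755);
  	write_file("/sys/fs/cgroup/mid/cgroup.subtree_control", "+cpuset");
  	mkdir("/sys/fs/cgroup/mid/remote", 0755);

  	/* distribute exclusive CPUs down to the target cpuset ... */
  	write_file("/sys/fs/cgroup/mid/cpuset.cpus.exclusive", "2-3");
  	write_file("/sys/fs/cgroup/mid/remote/cpuset.cpus.exclusive", "2-3");

  	/* ... and turn it into a (remote) root partition */
  	write_file("/sys/fs/cgroup/mid/remote/cpuset.cpus.partition", "root");
  	return 0;
  }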

There are two issues related to remote partition support. First of
all, generate_sched_domains() has a fast path that is activated if
root_load_balance is true and top_cpuset.nr_subparts is non-zero. The
latter condition isn't quite correct for remote partitions as nr_subparts
just shows the number of local child partitions underneath it. There
can be no local child partition under top_cpuset even if there are
remote partitions further down the hierarchy. Fix that by checking
for subpartitions_cpus which contains exclusive CPUs allocated to both
local and remote partitions.
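
To make the distinction concrete, here is a tiny stand-alone sketch (toy masks and variable names only loosely mirror the kernel fields; it is not kernel code). With zero local child partitions but a remote partition holding CPUs 2-3, the old test takes the single-domain fast path while the new one does not:

  #include <stdio.h>

  int main(void)
  {
  	int nr_subparts = 0;			/* local child partitions of top_cpuset */
  	unsigned long subpartitions_cpus = 0xc;	/* CPUs 2-3 owned by a remote partition */
  	int root_load_balance = 1;

  	/* old fast-path test: wrongly claims "no partitions anywhere" */
  	if (root_load_balance && !nr_subparts)
  		printf("old test: single sched domain (remote partition missed)\n");

  	/* new fast-path test: sees the remote partition's exclusive CPUs */
  	if (root_load_balance && subpartitions_cpus == 0)
  		printf("new test: single sched domain\n");
  	else
  		printf("new test: build one domain per partition root\n");
  	return 0;
  }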

Secondly, the valid partition check for subtree skipping in the csa[]
generation loop isn't enough as a remote partition does not need to
have a partition root parent. Fix this problem by breaking csa[] array
generation loop of generate_sched_domains() into v1 and v2 specific parts
and checking a cpuset's exclusive_cpus before skipping its subtree in
the v2 case.
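
A stand-alone toy model of that skip rule (illustrative field names and a single-child hierarchy, not kernel code): a remote root partition sits below a plain cpuset that merely has exclusive CPUs set aside; the old rule skips the whole subtree and misses it, while the new rule descends:

  #include <stdio.h>

  struct cs {
  	const char *name;
  	int valid_partition_root;	/* PRS_ROOT-like state */
  	unsigned long exclusive_cpus;	/* CPUs set aside for descendants */
  	struct cs *child;		/* one child is enough for the demo */
  };

  /* pre-order walk counting reachable partition roots under a skip rule */
  static int walk(const struct cs *c, int check_exclusive)
  {
  	int found = 0;

  	if (!c)
  		return 0;
  	if (c->valid_partition_root)
  		found++;
  	/*
  	 * Old rule: skip the subtree of any non-partition-root cpuset.
  	 * New rule: also require that no exclusive CPUs are set aside.
  	 */
  	if (!c->valid_partition_root &&
  	    (!check_exclusive || c->exclusive_cpus == 0))
  		return found;			/* skip @c's subtree */
  	return found + walk(c->child, check_exclusive);
  }

  int main(void)
  {
  	struct cs remote = { "remote", 1, 0xc, NULL };		/* remote root partition, CPUs 2-3 */
  	struct cs mid    = { "mid",    0, 0xc, &remote };	/* not a partition root */

  	printf("old rule finds %d partition root(s)\n", walk(&mid, 0));
  	printf("new rule finds %d partition root(s)\n", walk(&mid, 1));
  	return 0;
  }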

Also simplify generate_sched_domains() for cgroup v2: only non-isolating
partition roots should be included when building the cpuset array, and
none of the v1 scheduling attributes are supported, other than a
different way of creating an isolated partition.
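
The net effect for cgroup v2 can be sketched with toy masks (8 CPUs and one remote root partition on CPUs 2-3 are assumed purely for illustration): each non-isolated partition root contributes one sched domain built from its effective_cpus, all with default attributes:

  #include <stdio.h>

  int main(void)
  {
  	unsigned long online = 0xff;	/* CPUs 0-7 */
  	unsigned long remote = 0x0c;	/* remote root partition: CPUs 2-3 */

  	/* top_cpuset's effective_cpus lose the CPUs granted to the partition */
  	printf("domain 0 (top_cpuset): 0x%02lx\n", online & ~remote);	/* 0xf3 -> CPUs 0-1,4-7 */
  	printf("domain 1 (remote)   : 0x%02lx\n", remote);		/* 0x0c -> CPUs 2-3 */
  	return 0;
  }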

Fixes: 027bac95 ("cgroup/cpuset: Introduce remote partition")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
parent 8799cb6d
+42 −13
@@ -180,7 +180,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
 	int nr_subparts;
 
 	/* partition root state */
@@ -1114,13 +1114,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
 single_root_domain:
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
@@ -1149,16 +1150,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
+
+		if (cgrpv2)
+			goto v2;
+
 		/*
+		 * v1:
 		 * Continue traversing beyond @cp iff @cp has some CPUs and
 		 * isn't load balancing.  The former is obvious.  The
 		 * latter: All child cpusets contain a subset of the
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -1166,16 +1169,28 @@ static int generate_sched_domains(cpumask_var_t **domains,
 					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+		if (is_sched_load_balance(cp) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
 		continue;
 
-		if (is_sched_load_balance(cp) &&
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
 		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
@@ -1229,6 +1244,20 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 			      GFP_KERNEL);
 
+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
 		struct cpumask *dp;
@@ -1388,7 +1417,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -4679,7 +4708,7 @@ static void cpuset_handle_hotplug(void)
 	 * In the rare case that hotplug removes all the cpus in
 	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
 		cpus_updated = true;
 
 	/* For v1, synchronize cpus_allowed to cpu_active_mask */