Commit 0083242c authored by Valentin Schneider, committed by Peter Zijlstra

sched/topology: Skip updating masks for non-online nodes

The scheduler currently expects NUMA node distances to be stable from
init onwards, and as a consequence builds the related data structures
once-and-for-all at init (see sched_init_numa()).

Unfortunately, on some architectures node distance is unreliable for
offline nodes and may very well change upon onlining.

Skip over offline nodes during sched_init_numa(). Track nodes that have
been onlined at least once, and trigger a build of a node's NUMA masks
when it is first onlined post-init.
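
Illustration (not part of the patch): a minimal userspace C sketch of the
"onlined at least once" tracking pattern this change introduces. All names
here (NR_NODES, node_is_online, onlined_once, toy_*) are hypothetical
stand-ins; the kernel equivalent is the sched_numa_onlined_nodes bitmap
manipulated with bitmap_alloc()/bitmap_set()/test_bit() in the diff below.

	#include <limits.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define NR_NODES	4
	#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

	/* Toy stand-in for the kernel's online-node state at init time. */
	static bool node_is_online[NR_NODES] = { true, true, false, false };

	/* Tracks nodes that have been onlined at least once. */
	static unsigned long onlined_once[(NR_NODES + BITS_PER_LONG - 1) / BITS_PER_LONG];

	static bool toy_test_bit(int n, const unsigned long *map)
	{
		return (map[n / BITS_PER_LONG] >> (n % BITS_PER_LONG)) & 1;
	}

	static void toy_set_bit(int n, unsigned long *map)
	{
		map[n / BITS_PER_LONG] |= 1UL << (n % BITS_PER_LONG);
	}

	/* Build @node's NUMA masks only on its first bringup. */
	static void toy_masks_set(int node)
	{
		if (toy_test_bit(node, onlined_once))
			return;	/* already built once, nothing to do */

		toy_set_bit(node, onlined_once);
		printf("node %d onlined for the first time: building its masks\n", node);
	}

	int main(void)
	{
		/* "sched_init_numa()": record nodes that are already online. */
		for (int n = 0; n < NR_NODES; n++)
			if (node_is_online[n])
				toy_set_bit(n, onlined_once);

		toy_masks_set(2);	/* first bringup: masks get built */
		toy_masks_set(2);	/* later re-online: early return */
		return 0;
	}

The point of the bitmap is that the deferred mask build runs once per node,
on its first bringup, and is skipped on every subsequent online.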

Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210818074333.48645-1-srikar@linux.vnet.ibm.com
parent 746f5ea9
kernel/sched/topology.c +65 −0
@@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
static int			*sched_domains_numa_distance;
static struct cpumask		***sched_domains_numa_masks;
int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;

static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif

/*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
			sched_domains_numa_masks[i][j] = mask;

			for_each_node(k) {
				/*
				 * Distance information can be unreliable for
				 * offline nodes, defer building the node
				 * masks to its bringup.
				 * This relies on all unique distance values
				 * still being visible at init time.
				 */
				if (!node_online(j))
					continue;

				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
					sched_numa_warn("Node-distance not symmetric");

@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];

	init_numa_topology_type();

	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
	if (!sched_numa_onlined_nodes)
		return;

	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
	for_each_online_node(i)
		bitmap_set(sched_numa_onlined_nodes, i, 1);
}

static void __sched_domains_numa_masks_set(unsigned int node)
{
	int i, j;

	/*
	 * NUMA masks are not built for offline nodes in sched_init_numa().
	 * Thus, when a CPU of a never-onlined-before node gets plugged in,
	 * adding that new CPU to the right NUMA masks is not sufficient: the
	 * masks of that CPU's node must also be updated.
	 */
	if (test_bit(node, sched_numa_onlined_nodes))
		return;

	bitmap_set(sched_numa_onlined_nodes, node, 1);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (!node_online(j) || node == j)
				continue;

			if (node_distance(j, node) > sched_domains_numa_distance[i])
				continue;

			/* Add remote nodes in our masks */
			cpumask_or(sched_domains_numa_masks[i][node],
				   sched_domains_numa_masks[i][node],
				   sched_domains_numa_masks[0][j]);
		}
	}

	/*
	 * A new node has been brought up, potentially changing the topology
	 * classification.
	 *
	 * Note that this is racy vs any use of sched_numa_topology_type :/
	 */
	init_numa_topology_type();
}

void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
	int node = cpu_to_node(cpu);
	int i, j;

	__sched_domains_numa_masks_set(node);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (!node_online(j))
				continue;

			/* Set ourselves in the remote node's masks */
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
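
The last hunk is cut off above. For orientation, here is a hedged userspace
sketch of the two complementary updates it performs when a node's first CPU
comes up: __sched_domains_numa_masks_set() ORs remote nodes' level-0 masks
into the new node's masks ("Add remote nodes in our masks"), while
sched_domains_numa_masks_set() sets the new CPU in every close-enough remote
node's masks ("Set ourselves in the remote node's masks"). The toy_* names,
the 3-node distance table, and the merged single loop are illustrative
simplifications, not the kernel code.

	#include <stdio.h>

	#define NR_NODES	3
	#define NR_LEVELS	2

	/* Toy cpumask: one bit per CPU, one CPU per node (cpu == node). */
	static unsigned int masks[NR_LEVELS][NR_NODES];	/* masks[level][node] */

	static const int toy_distance[NR_NODES][NR_NODES] = {
		{ 10, 20, 40 },
		{ 20, 10, 20 },
		{ 40, 20, 10 },
	};
	/* Per-level distance cutoffs, mirroring sched_domains_numa_distance[]. */
	static const int level_cutoff[NR_LEVELS] = { 10, 20 };

	/* Both update directions for @node (with CPU @cpu) coming online. */
	static void toy_masks_update(int node, int cpu)
	{
		for (int i = 0; i < NR_LEVELS; i++) {
			for (int j = 0; j < NR_NODES; j++) {
				if (j == node)
					continue;
				if (toy_distance[j][node] > level_cutoff[i])
					continue;
				/* Add the remote node's CPUs to our masks (cpumask_or()). */
				masks[i][node] |= masks[0][j];
				/* Set ourselves in the remote node's masks (cpumask_set_cpu()). */
				masks[i][j] |= 1u << cpu;
			}
		}
	}

	int main(void)
	{
		/* Nodes 0 and 1 online at init: each sees itself at every level. */
		for (int i = 0; i < NR_LEVELS; i++)
			for (int j = 0; j < 2; j++)
				masks[i][j] = 1u << j;
		/* At level 1 (cutoff 20), nodes 0 and 1 already see each other. */
		masks[1][0] |= 1u << 1;
		masks[1][1] |= 1u << 0;

		masks[0][2] = masks[1][2] = 1u << 2;	/* node 2 sees itself */
		toy_masks_update(2, 2);			/* node 2 comes online */

		for (int i = 0; i < NR_LEVELS; i++)
			for (int j = 0; j < NR_NODES; j++)
				printf("level %d node %d mask 0x%x\n", i, j, masks[i][j]);
		return 0;
	}

In mainline kernels of this era, sched_domains_numa_masks_set() is reached
from the CPU hotplug path via sched_cpu_activate(), which is what lets the
mask build deferred out of sched_init_numa() happen on a node's first
bringup.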