Commit f4128ae8 authored by Tim Chen, committed by Aubrey Li

sched: Add cluster scheduler level for x86

mainline inclusion
from mainline-v5.16-rc1
commit 66558b73
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I8LVOX


CVE: N/A

--------------------------------

Intel-SIG: commit 66558b73 sched: Add cluster scheduler level
for x86.
Cluster scheduler feature backport.

--------------------------------

There are x86 CPU architectures (e.g. Jacobsville) where L2 cache is
shared among a cluster of cores instead of being exclusive to one
single core.

To prevent oversubscription of L2 cache, load should be balanced
between such L2 clusters, especially for tasks with no shared data.
On benchmarks such as the SPECrate mcf test, this change provides a
performance boost, especially on medium-load systems. On a
Jacobsville with 24 Atom cores, arranged into 6 clusters of 4
cores each, the benchmark numbers are as follows:

 Improvement over baseline kernel for mcf_r
 copies         run time        base rate
 1              -0.1%           -0.2%
 6              25.1%           25.1%
 12             18.8%           19.0%
 24             0.3%            0.3%

So this looks pretty good. In terms of the system's task distribution,
the vanilla kernel without the L2 cluster domain shows some pretty bad
clumping in the 6- and 12-copy cases. With the extra cluster domain,
the load does get evened out between the clusters.

Note this patch isn't a universal win, as spreading isn't necessarily
a win, particularly for workloads that can benefit from packing.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210924085104.44806-4-21cnbao@gmail.com


Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
parent 3ece7213
+11 −0
@@ -1010,6 +1010,17 @@ config NR_CPUS
 	  This is purely to save memory: each supported CPU adds about 8KB
 	  to the kernel image.
 
+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	depends on SMP
+	default y
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs.
+	  Cluster usually means a couple of CPUs which are placed closely
+	  by sharing mid-level caches, last-level cache tags or internal
+	  busses.
+
 config SCHED_SMT
 	def_bool y if SMP

+7 −0
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 	return per_cpu(cpu_llc_shared_map, cpu);
 }
 
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+	return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
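
For context: cpu_l2c_shared_map is populated during sibling-map setup in
arch/x86/kernel/smpboot.c. That part of the mainline commit is not shown
in this excerpt; a sketch of the matching helper, reconstructed from the
mainline change, looks roughly like this:

static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

	/* Do not match if we do not have a valid APICID for cpu: */
	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
		return false;

	/* Do not match if the L2 cache IDs differ: */
	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
		return false;

	return topology_sane(c, o, "l2c");
}

CPUs that match are then linked into each other's cpu_l2c_shared_map via
link_mask(), the same way the LLC shared mask is built.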
+3 −0
@@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { }
 #include <asm-generic/topology.h>
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 
 #define topology_logical_package_id(cpu)	(cpu_data(cpu).logical_proc_id)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
@@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 extern unsigned int __max_die_per_package;
 
 #ifdef CONFIG_SMP
+#define topology_cluster_id(cpu)		(per_cpu(cpu_l2c_id, cpu))
 #define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu)		(cpu_clustergroup_mask(cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 
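The cpu_clustergroup_mask() declared above is what ties the new mask to
the scheduler. Its definition and the new CLS sched-domain level live in
the mainline commit's smpboot.c changes (not shown here); roughly:

const struct cpumask *cpu_clustergroup_mask(int cpu)
{
	return cpu_l2c_shared_mask(cpu);
}

/* New entry in the x86 sched_domain_topology_level tables,
 * between the SMT and MC levels: */
#ifdef CONFIG_SCHED_CLUSTER
	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
#endif

where x86_cluster_flags() combines cpu_cluster_flags() with the usual
x86 ITMT scheduling flags.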
+1 −0
@@ -865,6 +865,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		l2 = new_l2;
 #ifdef CONFIG_SMP
 		per_cpu(cpu_llc_id, cpu) = l2_id;
+		per_cpu(cpu_l2c_id, cpu) = l2_id;
 #endif
 	}
 
+5 −0
@@ -85,6 +85,7 @@ EXPORT_SYMBOL(smp_num_siblings);
 
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+EXPORT_SYMBOL(cpu_llc_id);
 
 u16 get_llc_id(unsigned int cpu)
 {
@@ -92,6 +93,10 @@ u16 get_llc_id(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(get_llc_id);
 
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+EXPORT_SYMBOL(cpu_l2c_id);
+
 /* correctly size the local cpu masks */
 void __init setup_cpu_local_masks(void)
 {
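
With topology_cluster_id() and topology_cluster_cpumask() wired up, the
generic topology code can expose the cluster layout to userspace; the
companion patches in this series add cluster_id and cluster_cpus_list
under /sys/devices/system/cpu/cpuN/topology/. A minimal userspace check
(a sketch, assuming those sysfs files are present) might look like:

#include <stdio.h>

int main(void)
{
	char path[128], buf[256];
	FILE *f;
	int cpu;

	for (cpu = 0; ; cpu++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list",
			 cpu);
		f = fopen(path, "r");
		if (!f)
			break;	/* no such CPU, or no cluster support */
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d: cluster siblings %s", cpu, buf);
		fclose(f);
	}
	return 0;
}

On the 24-core Jacobsville described above, this should print six groups
of four CPUs, one group per shared L2.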