Unverified Commit 0314681b authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!24 Intel SPR: intel_idle support

The intel_idle support patches for 5.10, especially for SPR C1E/C1 support.

Intel-kernel issue:
https://gitee.com/open_euler/dashboard?issue_id=I5BECY

New changes:

SnowRidge Cstate table
ICX C6 support
ICXD support
SKX C6
SPR support including C1E/C1 support

Test steps:

Check that intel_idle is the default driver on these x86 platforms and that the C-states are listed in /sys/devices/system/cpu/cpu0/cpuidle/state*/*
On SPR, C1E and C1 can be selected with the new module argument intel_idle.preferred_cstates.
parents 52c7b522 ca838b2f
Loading
Loading
Loading
Loading
+202 −12
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@
 */

/* un-comment DEBUG to enable pr_debug() statements */
#define DEBUG
/* #define DEBUG */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

@@ -64,11 +64,17 @@ static struct cpuidle_driver intel_idle_driver = {
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = CPUIDLE_STATE_MAX - 1;
static unsigned int disabled_states_mask;
static unsigned int preferred_states_mask;

static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;

static unsigned long auto_demotion_disable_flags;
static bool disable_promotion_to_c1e;

/*
 * Policy for the "C1E promotion" bit in MSR_IA32_POWER_CTL, applied per-CPU
 * from intel_idle_cpu_init(): leave the firmware setting alone (the default),
 * set the bit, or clear it.
 */
static enum {
	C1E_PROMOTION_PRESERVE,
	C1E_PROMOTION_ENABLE,
	C1E_PROMOTION_DISABLE
} c1e_promotion = C1E_PROMOTION_PRESERVE;

struct idle_cpu {
	struct cpuidle_state *state_table;
@@ -88,6 +94,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata;

static unsigned int mwait_substates __initdata;

/*
 * Enable interrupts before entering the C-state. On some platforms and for
 * some C-states, this may measurably decrease interrupt latency.
 */
#define CPUIDLE_FLAG_IRQ_ENABLE		BIT(14)

/*
 * Enable this state by default even if the ACPI _CST does not list it.
 */
@@ -115,9 +127,6 @@ static unsigned int mwait_substates __initdata;
 * If the local APIC timer is not known to be reliable in the target idle state,
 * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
 *
 * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
 * flushing user TLBs.
 *
 * Must be called under local_irq_disable().
 */
static __cpuidle int intel_idle(struct cpuidle_device *dev,
@@ -127,6 +136,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
	unsigned long eax = flg2MWAIT(state->flags);
	unsigned long ecx = 1; /* break on interrupt flag */

	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE)
		local_irq_enable();

	mwait_idle_with_hints(eax, ecx);

	return index;
@@ -698,7 +710,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
@@ -727,7 +739,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
@@ -744,8 +756,48 @@ static struct cpuidle_state icx_cstates[] __initdata = {
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 128,
		.target_residency = 384,
		.exit_latency = 170,
		.target_residency = 600,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};

/*
 * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
 * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
 * MSR_IA32_POWER_CTL. But in this case there is effectively no C1, because C1
 * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
 * both C1 and C1E requests end up with C1, so there is effectively no C1E.
 *
 * By default we enable C1 and disable C1E by marking it with
 * 'CPUIDLE_FLAG_UNUSABLE'.
 */
static struct cpuidle_state spr_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
					   CPUIDLE_FLAG_UNUSABLE,
		.exit_latency = 2,
		.target_residency = 4,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 290,
		.target_residency = 800,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
@@ -963,6 +1015,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = {
		.enter = NULL }
};

/*
 * Note, depending on HW and FW revision, SnowRidge SoC may or may not support
 * C6, and this is indicated in the CPUID mwait leaf.
 */
static struct cpuidle_state snr_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		/* Register C1E by default even if ACPI _CST does not list it. */
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 15,
		.target_residency = 25,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 130,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		/* NULL .enter terminates the table. */
		.enter = NULL }
};

static const struct idle_cpu idle_cpu_nehalem __initconst = {
	.state_table = nehalem_cstates,
	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1062,6 +1147,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
	.use_acpi = true,
};

/*
 * Sapphire Rapids Xeon (SPR): use the spr_cstates table, clear the "C1E
 * promotion" bit by default (intel_idle_init() maps disable_promotion_to_c1e
 * to C1E_PROMOTION_DISABLE), and also extract C-state data from ACPI _CST.
 */
static const struct idle_cpu idle_cpu_spr __initconst = {
	.state_table = spr_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_avn __initconst = {
	.state_table = avn_cstates,
	.disable_promotion_to_c1e = true,
@@ -1084,6 +1175,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = {
	.use_acpi = true,
};

/*
 * SnowRidge (SNR): use the snr_cstates table, disable "C1E promotion" by
 * default, and also extract C-state data from ACPI _CST.
 */
static const struct idle_cpu idle_cpu_snr __initconst = {
	.state_table = snr_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct x86_cpu_id intel_idle_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP,		&idle_cpu_nhx),
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM,		&idle_cpu_nehalem),
@@ -1117,12 +1214,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&idle_cpu_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&idle_cpu_spr),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&idle_cpu_bxt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&idle_cpu_dnv),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_dnv),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_snr),
	{}
};

@@ -1444,6 +1543,68 @@ static void __init sklh_idle_state_table_update(void)
	skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE;	/* C9-SKL */
}

/**
 * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
 * idle states table.
 *
 * Read the package C-state limit from MSR_PKG_CST_CONFIG_CONTROL; if the
 * limit does not allow package C6, replace the C6 state's latency and
 * residency with core-C6-only (PC6-disabled) numbers.
 */
static void __init skx_idle_state_table_update(void)
{
	unsigned long long msr;

	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);

	/*
	 * Package C-state limit encoding (low 3 bits of the MSR):
	 * 000b: C0/C1 (no package C-state support)
	 * 001b: C2
	 * 010b: C6 (non-retention)
	 * 011b: C6 (retention)
	 * 111b: No Package C state limits.
	 */
	if ((msr & 0x7) < 2) {
		/*
		 * Uses the CC6 + PC0 latency and 3 times of
		 * latency for target_residency if the PC6
		 * is disabled in BIOS. This is consistent
		 * with how intel_idle driver uses _CST
		 * to set the target_residency.
		 */
		skx_cstates[2].exit_latency = 92;
		skx_cstates[2].target_residency = 276;
	}
}

/**
 * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
 *
 * Two init-time adjustments:
 *  1. If "intel_idle.preferred_cstates" selects C1E (bit 2) but not C1
 *     (bit 1), swap which of the two mutually exclusive states is usable
 *     and arrange for the "C1E promotion" bit to be set on each CPU.
 *  2. If the package C-state limit in MSR_PKG_CST_CONFIG_CONTROL forbids
 *     PC6, switch the C6 state to core-C6 latency/residency numbers.
 */
static void __init spr_idle_state_table_update(void)
{
	unsigned long long msr;

	/* Check if user prefers C1E over C1. */
	if ((preferred_states_mask & BIT(2)) &&
	    !(preferred_states_mask & BIT(1))) {
		/* Disable C1 and enable C1E. */
		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;

		/* Enable C1E using the "C1E promotion" bit. */
		c1e_promotion = C1E_PROMOTION_ENABLE;
	}

	/*
	 * By default, the C6 state assumes the worst-case scenario of package
	 * C6. However, if PC6 is disabled, we update the numbers to match
	 * core C6.
	 */
	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);

	/* Limit value 2 and above allow for PC6. */
	if ((msr & 0x7) < 2) {
		spr_cstates[2].exit_latency = 190;
		spr_cstates[2].target_residency = 600;
	}
}

static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
{
	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1475,6 +1636,12 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
	case INTEL_FAM6_SKYLAKE:
		sklh_idle_state_table_update();
		break;
	case INTEL_FAM6_SKYLAKE_X:
		skx_idle_state_table_update();
		break;
	case INTEL_FAM6_SAPPHIRERAPIDS_X:
		spr_idle_state_table_update();
		break;
	}

	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
@@ -1547,6 +1714,15 @@ static void auto_demotion_disable(void)
	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
}

/*
 * Set the "C1E promotion" bit (bit 1, 0x2) in MSR_IA32_POWER_CTL on the
 * current CPU, so that C1 requests are promoted to C1E by the hardware.
 */
static void c1e_promotion_enable(void)
{
	unsigned long long msr_bits;

	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
	msr_bits |= 0x2;	/* "C1E promotion" enable bit */
	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
}

static void c1e_promotion_disable(void)
{
	unsigned long long msr_bits;
@@ -1578,7 +1754,9 @@ static int intel_idle_cpu_init(unsigned int cpu)
	if (auto_demotion_disable_flags)
		auto_demotion_disable();

	if (disable_promotion_to_c1e)
	if (c1e_promotion == C1E_PROMOTION_ENABLE)
		c1e_promotion_enable();
	else if (c1e_promotion == C1E_PROMOTION_DISABLE)
		c1e_promotion_disable();

	return 0;
@@ -1657,7 +1835,8 @@ static int __init intel_idle_init(void)
	if (icpu) {
		cpuidle_state_table = icpu->state_table;
		auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
		disable_promotion_to_c1e = icpu->disable_promotion_to_c1e;
		if (icpu->disable_promotion_to_c1e)
			c1e_promotion = C1E_PROMOTION_DISABLE;
		if (icpu->use_acpi || force_use_acpi)
			intel_idle_acpi_cst_extract();
	} else if (!intel_idle_acpi_cst_extract()) {
@@ -1716,3 +1895,14 @@ module_param(max_cstate, int, 0444);
 */
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
/*
 * Some platforms come with mutually exclusive C-states, so that if one is
 * enabled, the other C-states must not be used. Example: C1 and C1E on
 * Sapphire Rapids platform. This parameter allows for selecting the
 * preferred C-states among the groups of mutually exclusive C-states - the
 * selected C-states will be registered, the other C-states from the mutually
 * exclusive group won't be registered. If the platform has no mutually
 * exclusive C-states, this parameter has no effect.
 */
module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");