Commit 636f64db authored by Linus Torvalds
Browse files

Merge tag 'ras_core_for_v5.18_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Borislav Petkov:

 - More noinstr fixes

 - Add an erratum workaround for Intel CPUs which, in certain
   circumstances, end up consuming an unrelated uncorrectable memory
   error when using fast string copy insns

 - Remove the MCE tolerance level control as it is not really needed or
   used anymore

* tag 'ras_core_for_v5.18_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Remove the tolerance level control
  x86/mce: Work around an erratum on fast string copy instructions
  x86/mce: Use arch atomic and bit helpers
parents ebcb577a 7f1b8e0d
Loading
Loading
Loading
Loading
+37 −0
Original line number Original line Diff line number Diff line
What:		/sys/devices/system/machinecheck/machinecheckX/tolerant
Contact:	Borislav Petkov <bp@suse.de>
Date:		Dec, 2021
Description:
		Unused and obsolete since the advent of recoverable machine
		checks (see the last sentence below), which have been available
		since 2010 (Nehalem).

		Original description:

		The entries appear for each CPU, but they are truly shared
		between all CPUs.

		Tolerance level. When a machine check exception occurs for a
		non corrected machine check the kernel can take different
		actions.

		Since machine check exceptions can happen any time, it is
		sometimes risky for the kernel to kill a process because it
		defies normal kernel locking rules. The tolerance level
		configures how hard the kernel tries to recover even at some
		risk of deadlock. Higher tolerant values trade potentially
		better uptime with the risk of a crash or even corruption
		(for tolerant >= 3).

		==  ===========================================================
		 0  always panic on uncorrected errors, log corrected errors
		 1  panic or SIGBUS on uncorrected errors, log corrected errors
		 2  SIGBUS or log uncorrected errors, log corrected errors
		 3  never panic or SIGBUS, log all errors (for testing only)
		==  ===========================================================

		Default: 1

		Note this only makes a difference if the CPU allows recovery
		from a machine check exception. Current x86 CPUs generally
		do not.
+0 −32
Original line number Original line Diff line number Diff line
@@ -53,38 +53,6 @@ Description:
		(but some corrected errors might be still reported
		(but some corrected errors might be still reported
		in other ways)
		in other ways)


What:		/sys/devices/system/machinecheck/machinecheckX/tolerant
Contact:	Andi Kleen <ak@linux.intel.com>
Date:		Feb, 2007
Description:
		The entries appear for each CPU, but they are truly shared
		between all CPUs.

		Tolerance level. When a machine check exception occurs for a
		non corrected machine check the kernel can take different
		actions.

		Since machine check exceptions can happen any time it is
		sometimes risky for the kernel to kill a process because it
		defies normal kernel locking rules. The tolerance level
		configures how hard the kernel tries to recover even at some
		risk of	deadlock. Higher tolerant values trade potentially
		better uptime with the risk of a crash or even corruption
		(for tolerant >= 3).

		==  ===========================================================
		 0  always panic on uncorrected errors, log corrected errors
		 1  panic or SIGBUS on uncorrected errors, log corrected errors
		 2  SIGBUS or log uncorrected errors, log corrected errors
		 3  never panic or SIGBUS, log all errors (for testing only)
		==  ===========================================================

		Default: 1

		Note this only makes a difference if the CPU allows recovery
		from a machine check exception. Current x86 CPUs generally
		do not.

What:		/sys/devices/system/machinecheck/machinecheckX/trigger
What:		/sys/devices/system/machinecheck/machinecheckX/trigger
Contact:	Andi Kleen <ak@linux.intel.com>
Contact:	Andi Kleen <ak@linux.intel.com>
Date:		Feb, 2007
Date:		Feb, 2007
+0 −2
Original line number Original line Diff line number Diff line
@@ -60,8 +60,6 @@ There are two (actually three) modes memory failure recovery can be in:


vm.memory_failure_recovery sysctl set to zero:
vm.memory_failure_recovery sysctl set to zero:
	All memory failures cause a panic. Do not attempt recovery.
	All memory failures cause a panic. Do not attempt recovery.
	(on x86 this can be also affected by the tolerant level of the
	MCE subsystem)


early kill
early kill
	(can be controlled globally and per process)
	(can be controlled globally and per process)
+1 −8
Original line number Original line Diff line number Diff line
@@ -47,14 +47,7 @@ Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
		in a reboot. On Intel systems it is enabled by default.
		in a reboot. On Intel systems it is enabled by default.
   mce=nobootlog
   mce=nobootlog
		Disable boot machine check logging.
		Disable boot machine check logging.
   mce=tolerancelevel[,monarchtimeout] (number,number)
   mce=monarchtimeout (number)
		tolerance levels:
		0: always panic on uncorrected errors, log corrected errors
		1: panic or SIGBUS on uncorrected errors, log corrected errors
		2: SIGBUS or log uncorrected errors, log corrected errors
		3: never panic or SIGBUS, log all errors (for testing only)
		Default is 1
		Can be also set using sysfs which is preferable.
		monarchtimeout:
		monarchtimeout:
		Sets the time in us to wait for other CPUs on machine checks. 0
		Sets the time in us to wait for other CPUs on machine checks. 0
		to disable.
		to disable.
+103 −72
Original line number Original line Diff line number Diff line
@@ -86,14 +86,6 @@ struct mce_vendor_flags mce_flags __read_mostly;


struct mca_config mca_cfg __read_mostly = {
struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
	.monarch_timeout = -1
};
};


@@ -168,27 +160,6 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);


/*
 * Translate a (bank, register kind) pair into the MSR number to access.
 *
 * When mce_flags.smca is set the MSR_AMD64_SMCA_MCx_* numbering is used,
 * otherwise the MSR_IA32_MCx_* numbering. A register kind other than
 * MCA_CTL, MCA_ADDR, MCA_MISC or MCA_STATUS yields 0.
 */
u32 mca_msr_reg(int bank, enum mca_msr reg)
{
	/* Non-SMCA case first: plain IA32 MCx register numbering. */
	if (!mce_flags.smca) {
		switch (reg) {
		case MCA_CTL:
			return MSR_IA32_MCx_CTL(bank);
		case MCA_ADDR:
			return MSR_IA32_MCx_ADDR(bank);
		case MCA_MISC:
			return MSR_IA32_MCx_MISC(bank);
		case MCA_STATUS:
			return MSR_IA32_MCx_STATUS(bank);
		}

		return 0;
	}

	/* SMCA numbering for the same four register kinds. */
	switch (reg) {
	case MCA_CTL:
		return MSR_AMD64_SMCA_MCx_CTL(bank);
	case MCA_ADDR:
		return MSR_AMD64_SMCA_MCx_ADDR(bank);
	case MCA_MISC:
		return MSR_AMD64_SMCA_MCx_MISC(bank);
	case MCA_STATUS:
		return MSR_AMD64_SMCA_MCx_STATUS(bank);
	}

	/* Unknown register kind. */
	return 0;
}

static void __print_mce(struct mce *m)
static void __print_mce(struct mce *m)
{
{
	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
@@ -769,7 +740,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
			goto clear_it;
			goto clear_it;


		mce_read_aux(&m, i);
		mce_read_aux(&m, i);
		m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false);
		m.severity = mce_severity(&m, NULL, NULL, false);
		/*
		/*
		 * Don't get the IP here because it's unlikely to
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 * have anything to do with the actual error location.
@@ -809,7 +780,8 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
 * the severity assessment code. Pretend that EIPV was set, and take the
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
{
	if (bank != 0)
	if (bank != 0)
		return;
		return;
@@ -829,11 +801,64 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
	m->cs = regs->cs;
	m->cs = regs->cs;
}
}


/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the CPU affected. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on CPU has been disabled.
 * Returns false, without writing any MSR, in every other case.
 */
static noinstr bool quirk_skylake_repmov(void)
{
	/* Snapshot both MSRs up front; bank 1 is only read if the checks below pass. */
	u64 mcgstatus   = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
	u64 mc1_status;

	/*
	 * Apply the quirk only to local machine checks, i.e., no broadcast
	 * sync is needed.
	 *
	 * Also bail out when the fast-string enable bit is already clear:
	 * there is nothing left for this quirk to turn off.
	 */
	if (!(mcgstatus & MCG_STATUS_LMCES) ||
	    !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
		return false;

	mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));

	/*
	 * Check for a software-recoverable data fetch error.
	 *
	 * The left-hand mask selects the bits of interest; the right-hand
	 * value is the required pattern: VAL, UC, EN, ADDRV, MISCV, AR and S
	 * must all be set, while OVER and PCC must both be clear.
	 */
	if ((mc1_status &
	     (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
	      MCI_STATUS_AR | MCI_STATUS_S)) ==
	     (MCI_STATUS_VAL |                   MCI_STATUS_UC | MCI_STATUS_EN |
	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
	      MCI_STATUS_AR | MCI_STATUS_S)) {
		/* Clear the fast-string enable bit and write it back ... */
		misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
		mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
		/* ... then zero the bank 1 status register. */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);

		/*
		 * This function is noinstr, so the print is bracketed by
		 * instrumentation_begin()/instrumentation_end().
		 */
		instrumentation_begin();
		pr_err_once("Erratum detected, disable fast string copy instructions.\n");
		instrumentation_end();

		return true;
	}

	return false;
}

/*
/*
 * Do a quick check if any of the events requires a panic.
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 * This decides if we keep the events around or clear them.
 */
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
					  struct pt_regs *regs)
					  struct pt_regs *regs)
{
{
	char *tmp = *msg;
	char *tmp = *msg;
@@ -844,12 +869,12 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
		if (!(m->status & MCI_STATUS_VAL))
		if (!(m->status & MCI_STATUS_VAL))
			continue;
			continue;


		__set_bit(i, validp);
		arch___set_bit(i, validp);
		if (mce_flags.snb_ifu_quirk)
		if (mce_flags.snb_ifu_quirk)
			quirk_sandybridge_ifu(i, m, regs);
			quirk_sandybridge_ifu(i, m, regs);


		m->bank = i;
		m->bank = i;
		if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
		if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
			mce_read_aux(m, i);
			mce_read_aux(m, i);
			*msg = tmp;
			*msg = tmp;
			return 1;
			return 1;
@@ -897,12 +922,11 @@ static noinstr int mce_timed_out(u64 *t, const char *msg)
	if (!mca_cfg.monarch_timeout)
	if (!mca_cfg.monarch_timeout)
		goto out;
		goto out;
	if ((s64)*t < SPINUNIT) {
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1) {
		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
			pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
			pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
				 cpumask_pr_args(&mce_missing_cpus));
				 cpumask_pr_args(&mce_missing_cpus));
		mce_panic(msg, NULL, NULL);
		mce_panic(msg, NULL, NULL);
		}

		ret = 1;
		ret = 1;
		goto out;
		goto out;
	}
	}
@@ -966,9 +990,9 @@ static void mce_reign(void)
	 * This dumps all the mces in the log buffer and stops the
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 * other CPUs.
	 */
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
	if (m && global_worst >= MCE_PANIC_SEVERITY) {
		/* call mce_severity() to get "msg" for panic */
		/* call mce_severity() to get "msg" for panic */
		mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
		mce_severity(m, NULL, &msg, true);
		mce_panic("Fatal machine check", m, msg);
		mce_panic("Fatal machine check", m, msg);
	}
	}


@@ -982,7 +1006,7 @@ static void mce_reign(void)
	 * No machine check event found. Must be some external
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 * source or one CPU is hung. Panic.
	 */
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
	if (global_worst <= MCE_KEEP_SEVERITY)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);
		mce_panic("Fatal machine check from unknown source", NULL, NULL);


	/*
	/*
@@ -1010,13 +1034,13 @@ static noinstr int mce_start(int *no_way_out)
	if (!timeout)
	if (!timeout)
		return ret;
		return ret;


	atomic_add(*no_way_out, &global_nwo);
	arch_atomic_add(*no_way_out, &global_nwo);
	/*
	/*
	 * Rely on the implied barrier below, such that global_nwo
	 * Rely on the implied barrier below, such that global_nwo
	 * is updated before mce_callin.
	 * is updated before mce_callin.
	 */
	 */
	order = atomic_inc_return(&mce_callin);
	order = arch_atomic_inc_return(&mce_callin);
	cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
	arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);


	/* Enable instrumentation around calls to external facilities */
	/* Enable instrumentation around calls to external facilities */
	instrumentation_begin();
	instrumentation_begin();
@@ -1024,10 +1048,10 @@ static noinstr int mce_start(int *no_way_out)
	/*
	/*
	 * Wait for everyone.
	 * Wait for everyone.
	 */
	 */
	while (atomic_read(&mce_callin) != num_online_cpus()) {
	while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
		if (mce_timed_out(&timeout,
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			atomic_set(&global_nwo, 0);
			arch_atomic_set(&global_nwo, 0);
			goto out;
			goto out;
		}
		}
		ndelay(SPINUNIT);
		ndelay(SPINUNIT);
@@ -1042,7 +1066,7 @@ static noinstr int mce_start(int *no_way_out)
		/*
		/*
		 * Monarch: Starts executing now, the others wait.
		 * Monarch: Starts executing now, the others wait.
		 */
		 */
		atomic_set(&mce_executing, 1);
		arch_atomic_set(&mce_executing, 1);
	} else {
	} else {
		/*
		/*
		 * Subject: Now start the scanning loop one by one in
		 * Subject: Now start the scanning loop one by one in
@@ -1050,10 +1074,10 @@ static noinstr int mce_start(int *no_way_out)
		 * This way when there are any shared banks it will be
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		 */
		while (atomic_read(&mce_executing) < order) {
		while (arch_atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				atomic_set(&global_nwo, 0);
				arch_atomic_set(&global_nwo, 0);
				goto out;
				goto out;
			}
			}
			ndelay(SPINUNIT);
			ndelay(SPINUNIT);
@@ -1063,7 +1087,7 @@ static noinstr int mce_start(int *no_way_out)
	/*
	/*
	 * Cache the global no_way_out state.
	 * Cache the global no_way_out state.
	 */
	 */
	*no_way_out = atomic_read(&global_nwo);
	*no_way_out = arch_atomic_read(&global_nwo);


	ret = order;
	ret = order;


@@ -1148,12 +1172,12 @@ static noinstr int mce_end(int order)
	return ret;
	return ret;
}
}


static void mce_clear_state(unsigned long *toclear)
static __always_inline void mce_clear_state(unsigned long *toclear)
{
{
	int i;
	int i;


	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (test_bit(i, toclear))
		if (arch_test_bit(i, toclear))
			mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
			mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
	}
	}
}
}
@@ -1203,8 +1227,8 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
	int severity, i, taint = 0;
	int severity, i, taint = 0;


	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		__clear_bit(i, toclear);
		arch___clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
		if (!arch_test_bit(i, valid_banks))
			continue;
			continue;


		if (!mce_banks[i].ctl)
		if (!mce_banks[i].ctl)
@@ -1229,7 +1253,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
		/* Set taint even when machine check was not enabled. */
		/* Set taint even when machine check was not enabled. */
		taint++;
		taint++;


		severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
		severity = mce_severity(m, regs, NULL, true);


		/*
		/*
		 * When machine check was for corrected/deferred handler don't
		 * When machine check was for corrected/deferred handler don't
@@ -1239,7 +1263,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;
			continue;


		__set_bit(i, toclear);
		arch___set_bit(i, toclear);


		/* Machine check event was not enabled. Clear, but ignore. */
		/* Machine check event was not enabled. Clear, but ignore. */
		if (severity == MCE_NO_SEVERITY)
		if (severity == MCE_NO_SEVERITY)
@@ -1389,7 +1413,6 @@ noinstr void do_machine_check(struct pt_regs *regs)
	int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
	int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	struct mce m, *final;
	char *msg = NULL;
	char *msg = NULL;


@@ -1400,6 +1423,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
	else if (unlikely(!mca_cfg.initialized))
	else if (unlikely(!mca_cfg.initialized))
		return unexpected_machine_check(regs);
		return unexpected_machine_check(regs);


	if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
		goto clear;

	/*
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 * check handler.
@@ -1408,7 +1434,7 @@ noinstr void do_machine_check(struct pt_regs *regs)


	/*
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 * MCE.
	 */
	 */
	no_way_out = 0;
	no_way_out = 0;


@@ -1442,7 +1468,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
	 * severity is MCE_AR_SEVERITY we have other options.
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
		kill_current_task = 1;
	/*
	/*
	 * Check if this MCE is signaled to only this logical processor,
	 * Check if this MCE is signaled to only this logical processor,
	 * on Intel, Zhaoxin only.
	 * on Intel, Zhaoxin only.
@@ -1459,7 +1485,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
	 * to see it will clear it.
	 * to see it will clear it.
	 */
	 */
	if (lmce) {
	if (lmce) {
		if (no_way_out && cfg->tolerant < 3)
		if (no_way_out)
			mce_panic("Fatal local machine check", &m, msg);
			mce_panic("Fatal local machine check", &m, msg);
	} else {
	} else {
		order = mce_start(&no_way_out);
		order = mce_start(&no_way_out);
@@ -1479,7 +1505,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
			if (!no_way_out)
			if (!no_way_out)
				no_way_out = worst >= MCE_PANIC_SEVERITY;
				no_way_out = worst >= MCE_PANIC_SEVERITY;


			if (no_way_out && cfg->tolerant < 3)
			if (no_way_out)
				mce_panic("Fatal machine check on current CPU", &m, msg);
				mce_panic("Fatal machine check on current CPU", &m, msg);
		}
		}
	} else {
	} else {
@@ -1491,8 +1517,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
		 * fatal error. We call "mce_severity()" again to
		 * fatal error. We call "mce_severity()" again to
		 * make sure we have the right "msg".
		 * make sure we have the right "msg".
		 */
		 */
		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
		if (worst >= MCE_PANIC_SEVERITY) {
			mce_severity(&m, regs, cfg->tolerant, &msg, true);
			mce_severity(&m, regs, &msg, true);
			mce_panic("Local fatal machine check!", &m, msg);
			mce_panic("Local fatal machine check!", &m, msg);
		}
		}
	}
	}
@@ -1542,6 +1568,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
out:
out:
	instrumentation_end();
	instrumentation_end();


clear:
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
}
EXPORT_SYMBOL_GPL(do_machine_check);
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1855,6 +1882,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)


		if (c->x86 == 6 && c->x86_model == 45)
		if (c->x86 == 6 && c->x86_model == 45)
			mce_flags.snb_ifu_quirk = 1;
			mce_flags.snb_ifu_quirk = 1;

		/*
		 * Skylake, Cascade Lake and Cooper Lake require a quirk on
		 * rep movs.
		 */
		if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
			mce_flags.skx_repmov_quirk = 1;
	}
	}


	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
@@ -2220,10 +2254,9 @@ static int __init mcheck_enable(char *str)
		cfg->bios_cmci_threshold = 1;
		cfg->bios_cmci_threshold = 1;
	else if (!strcmp(str, "recovery"))
	else if (!strcmp(str, "recovery"))
		cfg->recovery = 1;
		cfg->recovery = 1;
	else if (isdigit(str[0])) {
	else if (isdigit(str[0]))
		if (get_option(&str, &cfg->tolerant) == 2)
		get_option(&str, &(cfg->monarch_timeout));
		get_option(&str, &(cfg->monarch_timeout));
	} else {
	else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
		return 0;
	}
	}
@@ -2473,7 +2506,6 @@ static ssize_t store_int_with_restart(struct device *s,
	return ret;
	return ret;
}
}


static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
@@ -2494,7 +2526,6 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
};
};


static struct device_attribute *mce_device_attrs[] = {
static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
#ifdef CONFIG_X86_MCELOG_LEGACY
	&dev_attr_trigger,
	&dev_attr_trigger,
Loading