Commit 6e8746cb authored by Qiuxu Zhuo, committed by Tony Luck
Browse files

EDAC/skx_common: Enable EDAC support for the "near" memory



The current {skx,i10nm}_edac drivers lack EDAC support for decoding errors
from the 1st level memory (the fast "near" memory used as cache) of the
2-level memory system. Introduce a helper function skx_error_in_mem(),
called at the beginning of skx_mce_check_error(), to check whether errors
are from memory.

As long as the errors are from memory (either the 1-level memory system
or the 2-level memory system), decode the errors.

Reported-and-tested-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com
parent 8d8fcc39
Loading
Loading
Loading
Loading
+12 −6
Original line number Diff line number Diff line
@@ -632,12 +632,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m)
	if (!skx_mem_cfg_2lm)
		return false;

	errcode = GET_BITFIELD(m->status, 0, 15);
	errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

	if ((errcode & 0xef80) != 0x280)
		return false;
	return errcode == MCACOD_EXT_MEM_ERR;
}

	return true;
/*
 * Report whether the error code in @m->status identifies a memory error.
 *
 * The field taken from m->status with GET_BITFIELD(..., 0, 15) is reduced
 * with MCACOD_MEM_ERR_MASK and then matched against the two memory error
 * codes: MCACOD_MEM_CTL_ERR and MCACOD_EXT_MEM_ERR.
 *
 * Returns true when the masked code equals either of them, false otherwise.
 */
static bool skx_error_in_mem(const struct mce *m)
{
	u32 mcacod = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

	if (mcacod == MCACOD_MEM_CTL_ERR)
		return true;

	return mcacod == MCACOD_EXT_MEM_ERR;
}

int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@@ -651,8 +657,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
	if (mce->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	/* ignore unless this is memory related with an address */
	if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
	/* Ignore unless this is memory related with an address */
	if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
		return NOTIFY_DONE;

	memset(&res, 0, sizeof(res));
+24 −0
Original line number Diff line number Diff line
@@ -56,6 +56,30 @@
#define MCI_MISC_ECC_MODE(m)	(((m) >> 59) & 15)
#define MCI_MISC_ECC_DDRT	8	/* read from DDRT */

/*
 * According to Intel Architecture spec vol 3B,
 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
 * memory errors should fit one of these masks:
 *	000f 0000 1mmm cccc (binary)
 *	000f 0010 1mmm cccc (binary)	[RAM used as cache]
 * where:
 *	f = Correction Report Filtering Bit. If 1, subsequent errors
 *	    won't be shown
 *	mmm = error type
 *	cccc = channel
 *
 * MCACOD_MEM_ERR_MASK (0xef80 = 1110 1111 1000 0000 binary) clears the
 * "f" bit (bit 12) and the "mmm cccc" bits (bits 6:0), so a masked error
 * code can be compared directly against the two values defined below.
 */
#define MCACOD_MEM_ERR_MASK	0xef80
/*
 * Errors from either the memory of the 1-level memory system or the
 * 2nd level memory (the slow "far" memory) of the 2-level memory system.
 * Corresponds to the first pattern above (000f 0000 1mmm cccc) after
 * masking.
 */
#define MCACOD_MEM_CTL_ERR	0x80
/*
 * Errors from the 1st level memory (the fast "near" memory as cache)
 * of the 2-level memory system.
 * Corresponds to the second pattern above (000f 0010 1mmm cccc) after
 * masking.
 */
#define MCACOD_EXT_MEM_ERR	0x280

/*
 * Each cpu socket contains some pci devices that provide global
 * information, and also some that are local to each of the two