Unverified Commit e2888060 authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!742 SPR: EDAC driver enhance for driver decode and 2LM

Merge Pull Request from: @youquan_song 
 
[Description]​
SPR: EDAC driver enhance for driver decode and 2LM

Currently, i10nm_edac only supports the firmware decoder (ACPI DSM methods) for Sapphire Rapids. Add the driver decoder (which decodes DDR memory errors by extracting error information from the IMC MC error codes) for Sapphire Rapids to get better decoding performance.
Enhance the SPR HBM EDAC driver to decode errors from the 1st level memory (the fast "near" memory as cache) of the 2-level memory system.

This avoids triggering an SMI to call the firmware decoder, which is especially valuable when CEs (Correctable Errors) are triggered frequently on DDR memory.

6e8746cb EDAC/skx_common: Enable EDAC support for the "near" memory.
221aa03f EDAC/i10nm: Add driver decoder for Sapphire Rapids server.

[Testing]
download: https://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git
# echo 1 > /sys/module/i10nm_edac/parameters/decoding_via_mca
# modprobe einj
# cd /home/ras-tools/
# ./cmcistorm 1
0: vaddr = 0x117b490 paddr = 4884ef490
# dmesg and check the address was decoded.
 
 
Link:https://gitee.com/openeuler/kernel/pulls/742

 

Reviewed-by: Jason Zeng <jason.zeng@intel.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
parents 9b40194a 31401034
Loading
Loading
Loading
Loading
+69 −33
Original line number Diff line number Diff line
@@ -338,11 +338,13 @@ static bool i10nm_check_2lm(struct res_config *cfg)
}

/*
 * Check whether the error comes from DDRT by ICX/Tremont model specific error code.
 * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS.
 * Check whether the error comes from DDRT by ICX/Tremont/SPR model specific error code.
 * Refer to SDM vol3B 17.11.3/17.13.2 Intel IMC MC error codes for IA32_MCi_STATUS.
 */
static bool i10nm_mscod_is_ddrt(u32 mscod)
{
	switch (res_cfg->type) {
	case I10NM:
		switch (mscod) {
		case 0x0106: case 0x0107:
		case 0x0800: case 0x0804:
@@ -354,11 +356,29 @@ static bool i10nm_mscod_is_ddrt(u32 mscod)
			return true;
		}

		break;
	case SPR:
		switch (mscod) {
		case 0x0800: case 0x0804:
		case 0x0806 ... 0x0808:
		case 0x080a ... 0x080e:
		case 0x0810: case 0x0811:
		case 0x0816: case 0x081e:
		case 0x081f:
			return true;
		}

		break;
	default:
		return false;
	}

	return false;
}

static bool i10nm_mc_decode_available(struct mce *mce)
{
#define ICX_IMCx_CHy		0x06666000
	u8 bank;

	if (!decoding_via_mca || mem_cfg_2lm)
@@ -372,8 +392,17 @@ static bool i10nm_mc_decode_available(struct mce *mce)

	switch (res_cfg->type) {
	case I10NM:
		if (bank < 13 || bank > 26)
		/* Check whether the bank is one of {13,14,17,18,21,22,25,26} */
		if (!(ICX_IMCx_CHy & (1 << bank)))
			return false;
		break;
	case SPR:
		if (bank < 13 || bank > 20)
			return false;
		break;
	default:
		return false;
	}

	/* DDRT errors can't be decoded from MCA bank registers */
	if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT)
@@ -382,11 +411,7 @@ static bool i10nm_mc_decode_available(struct mce *mce)
	if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status)))
		return false;

		/* Check whether one of {13,14,17,18,21,22,25,26} */
		return ((bank - 13) & BIT(1)) == 0;
	default:
		return false;
	}
	return true;
}

static bool i10nm_mc_decode(struct decoded_addr *res)
@@ -411,6 +436,26 @@ static bool i10nm_mc_decode(struct decoded_addr *res)
		bank              = m->bank - 13;
		res->imc          = bank / 4;
		res->channel      = bank % 2;
		res->column       = GET_BITFIELD(m->misc, 9, 18) << 2;
		res->row          = GET_BITFIELD(m->misc, 19, 39);
		res->bank_group   = GET_BITFIELD(m->misc, 40, 41);
		res->bank_address = GET_BITFIELD(m->misc, 42, 43);
		res->bank_group  |= GET_BITFIELD(m->misc, 44, 44) << 2;
		res->rank         = GET_BITFIELD(m->misc, 56, 58);
		res->dimm         = res->rank >> 2;
		res->rank         = res->rank % 4;
		break;
	case SPR:
		bank              = m->bank - 13;
		res->imc          = bank / 2;
		res->channel      = bank % 2;
		res->column       = GET_BITFIELD(m->misc, 9, 18) << 2;
		res->row          = GET_BITFIELD(m->misc, 19, 36);
		res->bank_group   = GET_BITFIELD(m->misc, 37, 38);
		res->bank_address = GET_BITFIELD(m->misc, 39, 40);
		res->bank_group  |= GET_BITFIELD(m->misc, 41, 41) << 2;
		res->rank         = GET_BITFIELD(m->misc, 57, 57);
		res->dimm         = GET_BITFIELD(m->misc, 58, 58);
		break;
	default:
		return false;
@@ -422,15 +467,6 @@ static bool i10nm_mc_decode(struct decoded_addr *res)
		return false;
	}

	res->column       = GET_BITFIELD(m->misc, 9, 18) << 2;
	res->row          = GET_BITFIELD(m->misc, 19, 39);
	res->bank_group   = GET_BITFIELD(m->misc, 40, 41);
	res->bank_address = GET_BITFIELD(m->misc, 42, 43);
	res->bank_group  |= GET_BITFIELD(m->misc, 44, 44) << 2;
	res->rank         = GET_BITFIELD(m->misc, 56, 58);
	res->dimm         = res->rank >> 2;
	res->rank         = res->rank % 4;

	return true;
}

+12 −6
Original line number Diff line number Diff line
@@ -632,12 +632,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m)
	if (!skx_mem_cfg_2lm)
		return false;

	errcode = GET_BITFIELD(m->status, 0, 15);
	errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

	if ((errcode & 0xef80) != 0x280)
		return false;
	return errcode == MCACOD_EXT_MEM_ERR;
}

	return true;
static bool skx_error_in_mem(const struct mce *m)
{
	u32 errcode;

	errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

	return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
}

int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@@ -651,8 +657,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
	if (mce->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	/* ignore unless this is memory related with an address */
	if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
	/* Ignore unless this is memory related with an address */
	if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
		return NOTIFY_DONE;

	memset(&res, 0, sizeof(res));
+24 −0
Original line number Diff line number Diff line
@@ -56,6 +56,30 @@
#define MCI_MISC_ECC_MODE(m)	(((m) >> 59) & 15)
#define MCI_MISC_ECC_DDRT	8	/* read from DDRT */

/*
 * According to Intel Architecture spec vol 3B,
 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
 * memory errors should fit one of these masks:
 *	000f 0000 1mmm cccc (binary)
 *	000f 0010 1mmm cccc (binary)	[RAM used as cache]
 * where:
 *	f = Correction Report Filtering Bit. If 1, subsequent errors
 *	    won't be shown
 *	mmm = error type
 *	cccc = channel
 */
#define MCACOD_MEM_ERR_MASK	0xef80
/*
 * Errors from either the memory of the 1-level memory system or the
 * 2nd level memory (the slow "far" memory) of the 2-level memory system.
 */
#define MCACOD_MEM_CTL_ERR	0x80
/*
 * Errors from the 1st level memory (the fast "near" memory as cache)
 * of the 2-level memory system.
 */
#define MCACOD_EXT_MEM_ERR	0x280

/*
 * Each cpu socket contains some pci devices that provide global
 * information, and also some that are local to each of the two