Unverified Commit 4ef5a878 authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!171 SPR: HBM retry_rd_err_log support

Merge Pull Request from: @youquan_song 
 
[Description]​
https://gitee.com/openeuler/intel-kernel/issues/I5V3SJ

An HBM memory channel is divided into two pseudo channels. Each
pseudo channel has its own retry_rd_err_log registers. Retrieve and
print retry_rd_err_log registers of the HBM pseudo channel if the
memory error is from HBM.

14646de4 EDAC/skx_common: Add ChipSelect ADXL component
acd4cf68 EDAC/i10nm: Retrieve and print retry_rd_err_log registers for HBM
d5f5e499 EDAC/i10nm: Print an extra register set of retry_rd_err_log

[Testing]
1.Add kernel options in grub: efi=nosoftreserve i10nm_edac.retry_rd_err_log=1
2.numactl -H
node distances:
node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0: 10 12 12 12 21 21 21 21 13 14 14 14 23 23 23 23
1: 12 10 12 12 21 21 21 21 14 13 14 14 23 23 23 23
2: 12 12 10 12 21 21 21 21 14 14 13 14 23 23 23 23
3: 12 12 12 10 21 21 21 21 14 14 14 13 23 23 23 23
4: 21 21 21 21 10 12 12 12 23 23 23 23 13 14 14 14
5: 21 21 21 21 12 10 12 12 23 23 23 23 14 13 14 14
6: 21 21 21 21 12 12 10 12 23 23 23 23 14 14 13 14
7: 21 21 21 21 12 12 12 10 23 23 23 23 14 14 14 13
8: 13 14 14 14 23 23 23 23 10 14 14 14 23 23 23 23
9: 14 13 14 14 23 23 23 23 14 10 14 14 23 23 23 23
10: 14 14 13 14 23 23 23 23 14 14 10 14 23 23 23 23
11: 14 14 14 13 23 23 23 23 14 14 14 10 23 23 23 23
12: 23 23 23 23 13 14 14 14 23 23 23 23 10 14 14 14
13: 23 23 23 23 14 13 14 14 23 23 23 23 14 10 14 14
14: 23 23 23 23 14 14 13 14 23 23 23 23 14 14 10 14
15: 23 23 23 23 14 14 14 13 23 23 23 23 14 14 14 10
3. #modprobe einj
4. git clone https://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git, build it
5. # numactl --cpunodebind=0 --membind=13 /home/ras-tools/cmcistorm 1
0: vaddr = 0x130d490 paddr = d87ff42490
6. #dmesg: output retry_rd_err_log registers value.

[83086.997090] EDAC MC29: 0 CE memory read error on CPU_SrcID#1_HBMC#9_Chan#1 (channel:1 page:0xd87ff42 offset:0x480 grain:32 syndrome:0x0 - err_code:0x0000:0x009f SystemAddress:0xd87ff42480 ProcessorSocketId:0x1 MemoryControllerId:0x9 ChannelAddress:0x7ffe8480 ChannelId:0x1 RankAddress:0x3fff4240 PhysicalRankId:0x0 Row:0x7ffe Column:0x14 Bank:0x0 BankGroup:0x0 ChipSelect:0x2 ChipId:0x1 retry_rd_err_log[08928208 00000000 0000000000010000 00500081 80007ffe 000000d87ff42480] correrrcnt[0001 0000 0001 0000 0000 0000 0000 0000]) 
 
Link:https://gitee.com/openeuler/kernel/pulls/171

 
Reviewed-by: default avatarChen Wei <chenwei@xfusion.com>
Reviewed-by: default avatarXiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: default avatarZheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: default avatarJun Tian <jun.j.tian@intel.com>
Signed-off-by: default avatarZheng Zengkai <zhengzengkai@huawei.com>
parents 425c0a7b 6cc54473
Loading
Loading
Loading
Loading
+131 −22
Original line number Diff line number Diff line
@@ -79,28 +79,42 @@ static bool mem_cfg_2lm;

static u32 offsets_scrub_icx[]  = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
static u32 offsets_scrub_spr[]  = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8};
static u32 offsets_scrub_spr_hbm0[]  = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8};
static u32 offsets_scrub_spr_hbm1[]  = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8};
static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0};
static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10};
static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0};
static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0};

static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable,
				      u32 *offsets_scrub, u32 *offsets_demand,
				      u32 *offsets_demand2)
{
	u32 s, d;
	u32 s, d, d2;

	if (!imc->mbase)
		return;

	s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
	d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
	s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]);
	d = I10NM_GET_REG32(imc, chan, offsets_demand[0]);
	if (offsets_demand2)
		d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]);

	if (enable) {
		/* Save default configurations */
		imc->chan[chan].retry_rd_err_log_s = s;
		imc->chan[chan].retry_rd_err_log_d = d;
		if (offsets_demand2)
			imc->chan[chan].retry_rd_err_log_d2 = d2;

		s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
		s |=  RETRY_RD_ERR_LOG_EN;
		d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
		d |=  RETRY_RD_ERR_LOG_EN;

		if (offsets_demand2) {
			d2 &= ~RETRY_RD_ERR_LOG_UC;
			d2 |=  RETRY_RD_ERR_LOG_NOOVER;
			d2 |=  RETRY_RD_ERR_LOG_EN;
		}
	} else {
		/* Restore default configurations */
		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
@@ -115,23 +129,55 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable
			d |=  RETRY_RD_ERR_LOG_NOOVER;
		if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
			d &= ~RETRY_RD_ERR_LOG_EN;

		if (offsets_demand2) {
			if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC)
				d2 |=  RETRY_RD_ERR_LOG_UC;
			if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER))
				d2 &=  ~RETRY_RD_ERR_LOG_NOOVER;
			if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN))
				d2 &= ~RETRY_RD_ERR_LOG_EN;
		}
	}

	I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
	I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
	I10NM_SET_REG32(imc, chan, offsets_scrub[0], s);
	I10NM_SET_REG32(imc, chan, offsets_demand[0], d);
	if (offsets_demand2)
		I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2);
}

static void enable_retry_rd_err_log(bool enable)
{
	struct skx_imc *imc;
	struct skx_dev *d;
	int i, j;

	edac_dbg(2, "\n");

	list_for_each_entry(d, i10nm_edac_list, list)
		for (i = 0; i < I10NM_NUM_IMC; i++)
			for (j = 0; j < I10NM_NUM_CHANNELS; j++)
				__enable_retry_rd_err_log(&d->imc[i], j, enable);
		for (i = 0; i < I10NM_NUM_IMC; i++) {
			imc = &d->imc[i];
			if (!imc->mbase)
				continue;

			for (j = 0; j < I10NM_NUM_CHANNELS; j++) {
				if (imc->hbm_mc) {
					__enable_retry_rd_err_log(imc, j, enable,
								  res_cfg->offsets_scrub_hbm0,
								  res_cfg->offsets_demand_hbm0,
								  NULL);
					__enable_retry_rd_err_log(imc, j, enable,
								  res_cfg->offsets_scrub_hbm1,
								  res_cfg->offsets_demand_hbm1,
								  NULL);
				} else {
					__enable_retry_rd_err_log(imc, j, enable,
								  res_cfg->offsets_scrub,
								  res_cfg->offsets_demand,
								  res_cfg->offsets_demand2);
				}
			}
	}
}

static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
@@ -140,14 +186,33 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
	struct skx_imc *imc = &res->dev->imc[res->imc];
	u32 log0, log1, log2, log3, log4;
	u32 corr0, corr1, corr2, corr3;
	u32 lxg0, lxg1, lxg3, lxg4;
	u32 *xffsets = NULL;
	u64 log2a, log5;
	u64 lxg2a, lxg5;
	u32 *offsets;
	int n;
	int n, pch;

	if (!imc->mbase)
		return;

	offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
	if (imc->hbm_mc) {
		pch = res->cs & 1;

		if (pch)
			offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 :
					      res_cfg->offsets_demand_hbm1;
		else
			offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 :
					      res_cfg->offsets_demand_hbm0;
	} else {
		if (scrub_err) {
			offsets = res_cfg->offsets_scrub;
		} else {
			offsets = res_cfg->offsets_demand;
			xffsets = res_cfg->offsets_demand2;
		}
	}

	log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
	log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
@@ -155,20 +220,52 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
	log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
	log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);

	if (xffsets) {
		lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]);
		lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]);
		lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]);
		lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]);
		lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]);
	}

	if (res_cfg->type == SPR) {
		log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]);
		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]",
		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx",
			     log0, log1, log2a, log3, log4, log5);

		if (len - n > 0) {
			if (xffsets) {
				lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]);
				n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]",
					     lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5);
			} else {
				n += snprintf(msg + n, len - n, "]");
			}
		}
	} else {
		log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
			     log0, log1, log2, log3, log4, log5);
	}

	if (imc->hbm_mc) {
		if (pch) {
			corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18);
			corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c);
			corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20);
			corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24);
		} else {
			corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818);
			corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c);
			corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820);
			corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824);
		}
	} else {
		corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
		corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
		corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
		corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
	}

	if (len - n > 0)
		snprintf(msg + n, len - n,
@@ -179,10 +276,17 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
			 corr3 & 0xffff, corr3 >> 16);

	/* Clear status bits */
	if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
	if (retry_rd_err_log == 2) {
		if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) {
			log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
			I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
		}

		if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
			lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
			I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0);
		}
	}
}

static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
@@ -519,7 +623,12 @@ static struct res_config spr_cfg = {
	.sad_all_devfn		= PCI_DEVFN(10, 0),
	.sad_all_offset		= 0x300,
	.offsets_scrub		= offsets_scrub_spr,
	.offsets_scrub_hbm0	= offsets_scrub_spr_hbm0,
	.offsets_scrub_hbm1	= offsets_scrub_spr_hbm1,
	.offsets_demand		= offsets_demand_spr,
	.offsets_demand2	= offsets_demand2_spr,
	.offsets_demand_hbm0	= offsets_demand_spr_hbm0,
	.offsets_demand_hbm1	= offsets_demand_spr_hbm1,
};

static const struct x86_cpu_id i10nm_cpuids[] = {
+5 −0
Original line number Diff line number Diff line
@@ -27,9 +27,11 @@ static const char * const component_names[] = {
	[INDEX_MEMCTRL]		= "MemoryControllerId",
	[INDEX_CHANNEL]		= "ChannelId",
	[INDEX_DIMM]		= "DimmSlotId",
	[INDEX_CS]		= "ChipSelect",
	[INDEX_NM_MEMCTRL]	= "NmMemoryControllerId",
	[INDEX_NM_CHANNEL]	= "NmChannelId",
	[INDEX_NM_DIMM]		= "NmDimmSlotId",
	[INDEX_NM_CS]		= "NmChipSelect",
};

static int component_indices[ARRAY_SIZE(component_names)];
@@ -139,10 +141,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
			       (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1;
		res->dimm    = (adxl_nm_bitmap & BIT_NM_DIMM) ?
			       (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1;
		res->cs      = (adxl_nm_bitmap & BIT_NM_CS) ?
			       (int)adxl_values[component_indices[INDEX_NM_CS]] : -1;
	} else {
		res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
		res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
		res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
		res->cs      = (int)adxl_values[component_indices[INDEX_CS]];
	}

	if (res->imc > NUM_IMC - 1 || res->imc < 0) {
+10 −0
Original line number Diff line number Diff line
@@ -86,6 +86,7 @@ struct skx_dev {
			struct pci_dev	*edev;
			u32 retry_rd_err_log_s;
			u32 retry_rd_err_log_d;
			u32 retry_rd_err_log_d2;
			struct skx_dimm {
				u8 close_pg;
				u8 bank_xor_enable;
@@ -112,16 +113,19 @@ enum {
	INDEX_MEMCTRL,
	INDEX_CHANNEL,
	INDEX_DIMM,
	INDEX_CS,
	INDEX_NM_FIRST,
	INDEX_NM_MEMCTRL = INDEX_NM_FIRST,
	INDEX_NM_CHANNEL,
	INDEX_NM_DIMM,
	INDEX_NM_CS,
	INDEX_MAX
};

#define BIT_NM_MEMCTRL	BIT_ULL(INDEX_NM_MEMCTRL)
#define BIT_NM_CHANNEL	BIT_ULL(INDEX_NM_CHANNEL)
#define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)
#define BIT_NM_CS	BIT_ULL(INDEX_NM_CS)

struct decoded_addr {
	struct mce *mce;
@@ -134,6 +138,7 @@ struct decoded_addr {
	int	sktways;
	int	chanways;
	int	dimm;
	int	cs;
	int	rank;
	int	channel_rank;
	u64	rank_address;
@@ -160,7 +165,12 @@ struct res_config {
	int sad_all_offset;
	/* Offsets of retry_rd_err_log registers */
	u32 *offsets_scrub;
	u32 *offsets_scrub_hbm0;
	u32 *offsets_scrub_hbm1;
	u32 *offsets_demand;
	u32 *offsets_demand2;
	u32 *offsets_demand_hbm0;
	u32 *offsets_demand_hbm1;
};

typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,