Commit b520ca5d authored by Ohad Sharabi's avatar Ohad Sharabi Committed by Oded Gabbay
Browse files

habanalabs/gaudi: use HBM_ECC_EN bit for ECC ERR



driver should use ECC info from FW only if HBM ECC CAP is set.
otherwise, try to fetch the data from MC regs only if security is
disabled.

Signed-off-by: default avatarOhad Sharabi <osharabi@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent e52606d2
Loading
Loading
Loading
Loading
+15 −3
Original line number Diff line number Diff line
@@ -7105,7 +7105,9 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
	u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
	int err = 0;

	if (!hdev->asic_prop.fw_security_disabled) {
	if (hdev->asic_prop.fw_security_status_valid &&
			(hdev->asic_prop.fw_app_security_map &
				CPU_BOOT_DEV_STS0_HBM_ECC_EN)) {
		if (!hbm_ecc_data) {
			dev_err(hdev->dev, "No FW ECC data");
			return 0;
@@ -7127,14 +7129,24 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
				le32_to_cpu(hbm_ecc_data->hbm_ecc_info));

		dev_err(hdev->dev,
			"HBM%d pc%d ECC: TYPE=%d, WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
			device, ch, type, wr_par, rd_par, ca_par, serr, derr);
			"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
			device, ch, wr_par, rd_par, ca_par, serr, derr);
		dev_err(hdev->dev,
			"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%u, SEC_CNT=%d, DEC_CNT=%d\n",
			device, ch, hbm_ecc_data->first_addr, type,
			hbm_ecc_data->sec_cont_cnt, hbm_ecc_data->sec_cnt,
			hbm_ecc_data->dec_cnt);

		err = 1;

		return 0;
	}

	if (!hdev->asic_prop.fw_security_disabled) {
		dev_info(hdev->dev, "Cannot access MC regs for ECC data while security is enabled\n");
		return 0;
	}

	base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET;
	for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) {
		val = RREG32_MASK(base + ch * 0x1000 + 0x06C, 0x0000FFFF);