Commit 2de949ab authored by Haoyue Xu, committed by Leon Romanovsky
Browse files

RDMA/hns: Recover 1bit-ECC error of RAM on chip

Since ECC memory keeps a memory system immune to single-bit errors,
add support for correcting 1bit-ECC errors, which prevents a 1bit-ECC
error from becoming an uncorrectable error. A 1bit-ECC error in the
internal RAM of the ROCE engine, such as the QPC table, is raised by a
read, but the ROCE engine only corrects 1bit-ECC errors on a write. The
driver therefore reads the affected entry and writes it back.

Link: https://lore.kernel.org/r/20220714134353.16700-6-liangwenpeng@huawei.com


Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
parent 75e4e716
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -959,6 +959,7 @@ struct hns_roce_dev {
	const struct hns_roce_hw *hw;
	void			*priv;
	struct workqueue_struct *irq_workq;
	struct work_struct ecc_work;
	const struct hns_roce_dfx_hw *dfx;
	u32 func_num;
	u32 is_vf;
+180 −2
Original line number Diff line number Diff line
@@ -55,6 +55,42 @@ enum {
	CMD_RST_PRC_EBUSY,
};

/*
 * RAM resource types that can be named in a 1bit-ECC query result.
 *
 * The numeric value is compared against the res_type field returned by the
 * RAM ECC query and is used directly as an index into fmea_ram_res[], so the
 * values are spelled out explicitly. ECC_RESOURCE_COUNT must stay last: it is
 * the number of valid types and is checked against the table size.
 */
enum ecc_resource_type {
	ECC_RESOURCE_QPC	= 0,
	ECC_RESOURCE_CQC	= 1,
	ECC_RESOURCE_MPT	= 2,
	ECC_RESOURCE_SRQC	= 3,
	ECC_RESOURCE_GMV	= 4,
	ECC_RESOURCE_QPC_TIMER	= 5,
	ECC_RESOURCE_CQC_TIMER	= 6,
	ECC_RESOURCE_SCCC	= 7,
	ECC_RESOURCE_COUNT,
};

/*
 * Per-resource recovery description, indexed by the resource type reported
 * by the RAM ECC query. The entries must stay in the same order as the
 * resource type enumeration.
 *
 * @name:         label printed when recovery of the resource fails
 * @read_bt0_op:  mailbox opcode used to read the resource's BT0 entry
 * @write_bt0_op: mailbox opcode used to write the BT0 entry back
 */
static const struct {
	const char *name;
	u8 read_bt0_op;
	u8 write_bt0_op;
} fmea_ram_res[] = {
	{
		.name = "ECC_RESOURCE_QPC",
		.read_bt0_op = HNS_ROCE_CMD_READ_QPC_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_QPC_BT0,
	},
	{
		.name = "ECC_RESOURCE_CQC",
		.read_bt0_op = HNS_ROCE_CMD_READ_CQC_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_CQC_BT0,
	},
	{
		.name = "ECC_RESOURCE_MPT",
		.read_bt0_op = HNS_ROCE_CMD_READ_MPT_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_MPT_BT0,
	},
	{
		.name = "ECC_RESOURCE_SRQC",
		.read_bt0_op = HNS_ROCE_CMD_READ_SRQC_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_SRQC_BT0,
	},
	/* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
	{
		.name = "ECC_RESOURCE_GMV",
		.read_bt0_op = 0,
		.write_bt0_op = 0,
	},
	{
		.name = "ECC_RESOURCE_QPC_TIMER",
		.read_bt0_op = HNS_ROCE_CMD_READ_QPC_TIMER_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0,
	},
	{
		.name = "ECC_RESOURCE_CQC_TIMER",
		.read_bt0_op = HNS_ROCE_CMD_READ_CQC_TIMER_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0,
	},
	{
		.name = "ECC_RESOURCE_SCCC",
		.read_bt0_op = HNS_ROCE_CMD_READ_SCCC_BT0,
		.write_bt0_op = HNS_ROCE_CMD_WRITE_SCCC_BT0,
	},
};

static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
				   struct ib_sge *sg)
{
@@ -6017,6 +6053,142 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
	return IRQ_RETVAL(int_work);
}

static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
			       struct fmea_ram_ecc *ecc_info)
{
	struct hns_roce_cmq_desc desc;
	struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
	int ret;

	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
	if (ret)
		return ret;

	ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
	ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
	ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);

	return 0;
}

/*
 * Recover a GMV base-address-table entry through the command queue.
 *
 * The entry selected by @idx is read with HNS_ROCE_OPC_CFG_GMV_BT, and the
 * base address that came back is then written to the same index with a
 * second HNS_ROCE_OPC_CFG_GMV_BT command.
 *
 * Returns 0 on success, or the error from hns_roce_cmq_send() for whichever
 * of the two commands failed.
 */
static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
{
	struct hns_roce_cmq_desc cmd;
	struct hns_roce_cmq_req *body = (struct hns_roce_cmq_req *)cmd.data;
	u32 ba_lo;
	u32 ba_hi;
	int err;

	/* Step 1: read back the current base address of entry @idx. */
	hns_roce_cmq_setup_basic_desc(&cmd, HNS_ROCE_OPC_CFG_GMV_BT, true);
	hr_reg_write(body, CFG_GMV_BT_IDX, idx);

	err = hns_roce_cmq_send(hr_dev, &cmd, 1);
	if (err) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read gmv, ret = %d.\n", err);
		return err;
	}

	/* Save both halves before the descriptor is set up again. */
	ba_lo = hr_reg_read(body, CFG_GMV_BT_BA_L);
	ba_hi = hr_reg_read(body, CFG_GMV_BT_BA_H);

	/* Step 2: write the same base address to the same entry. */
	hns_roce_cmq_setup_basic_desc(&cmd, HNS_ROCE_OPC_CFG_GMV_BT, false);
	hr_reg_write(body, CFG_GMV_BT_BA_L, ba_lo);
	hr_reg_write(body, CFG_GMV_BT_BA_H, ba_hi);
	hr_reg_write(body, CFG_GMV_BT_IDX, idx);

	return hns_roce_cmq_send(hr_dev, &cmd, 1);
}

/*
 * Turn the first 64-bit word of a READ_*_BT0 mailbox reply into the base
 * address to hand to the matching WRITE_*_BT0 command.
 *
 * @res_type: resource type the reply belongs to (enum ecc_resource_type)
 * @data:     little-endian reply word
 *
 * For QPC_TIMER, CQC_TIMER and SCCC the word is returned as is. For every
 * other resource the hardware reports the address in fixed 4 KiB units
 * (bits 63:12), so it is shifted left by 12. The shift must not follow the
 * kernel's PAGE_SHIFT: on kernels with 16K or 64K pages that would produce a
 * wrong address and the write-back would target the wrong location.
 *
 * NOTE(review): if the driver already has a macro for the hardware page
 * shift, prefer it over the local constant below.
 */
static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
{
	/* Hardware base-address granularity: 4 KiB, independent of PAGE_SIZE. */
	const unsigned int hw_ba_shift = 12;
	u64 ba = le64_to_cpu(*data);

	switch (res_type) {
	case ECC_RESOURCE_QPC_TIMER:
	case ECC_RESOURCE_CQC_TIMER:
	case ECC_RESOURCE_SCCC:
		return ba;
	default:
		return ba << hw_ba_shift;
	}
}

/*
 * Recover a mailbox-managed resource (every type except GMV).
 *
 * The BT0 entry identified by @index is read with the resource's read opcode
 * into a freshly allocated mailbox, the base address is extracted from the
 * mailbox buffer, and that address is written back with the resource's write
 * opcode.
 *
 * @res_type must already be a valid index into fmea_ram_res[]; this function
 * does not range-check it.
 *
 * Returns 0 on success, the PTR_ERR() of a failed mailbox allocation, or the
 * error of whichever mailbox command failed. The mailbox is always freed.
 */
static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
			       u32 index)
{
	struct hns_roce_cmd_mailbox *mbox;
	u8 rd_op;
	u8 wr_op;
	u64 ba;
	int err;

	rd_op = fmea_ram_res[res_type].read_bt0_op;
	wr_op = fmea_ram_res[res_type].write_bt0_op;

	mbox = hns_roce_alloc_cmd_mailbox(hr_dev);
	if (IS_ERR(mbox))
		return PTR_ERR(mbox);

	err = hns_roce_cmd_mbox(hr_dev, 0, mbox->dma, rd_op, index);
	if (err) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read fmea ram, ret = %d.\n",
			err);
	} else {
		/* The read reply sits in the mailbox buffer. */
		ba = fmea_get_ram_res_addr(res_type, mbox->buf);

		err = hns_roce_cmd_mbox(hr_dev, ba, 0, wr_op, index);
		if (err)
			dev_err(hr_dev->dev,
				"failed to execute cmd to write fmea ram, ret = %d.\n",
				err);
	}

	hns_roce_free_cmd_mailbox(hr_dev, mbox);
	return err;
}

/*
 * Dispatch recovery for the resource described by @ecc_info.
 *
 * A resource type outside the known range is reported and ignored. GMV is
 * recovered through the command queue; every other type goes through the
 * mailbox path. A failed recovery is only logged, nothing is returned to the
 * caller.
 */
static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
				 struct fmea_ram_ecc *ecc_info)
{
	u32 type = ecc_info->res_type;
	u32 idx = ecc_info->index;
	int err;

	/* The table is indexed by type, so it must cover every valid type. */
	BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);

	if (type >= ECC_RESOURCE_COUNT) {
		dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
			type);
		return;
	}

	err = (type == ECC_RESOURCE_GMV) ?
	      fmea_recover_gmv(hr_dev, idx) :
	      fmea_recover_others(hr_dev, type, idx);
	if (!err)
		return;

	dev_err(hr_dev->dev,
		"failed to recover %s, index = %u, ret = %d.\n",
		fmea_ram_res[type].name, idx, err);
}

static void fmea_ram_ecc_work(struct work_struct *ecc_work)
{
	struct hns_roce_dev *hr_dev =
		container_of(ecc_work, struct hns_roce_dev, ecc_work);
	struct fmea_ram_ecc ecc_info = {};

	if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
		dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
		return;
	}

	if (!ecc_info.is_ecc_err) {
		dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
		return;
	}

	fmea_ram_ecc_recover(hr_dev, &ecc_info);
}

static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
{
	struct hns_roce_dev *hr_dev = dev_id;
@@ -6025,10 +6197,14 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)

	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);

	if (int_st)
	if (int_st) {
		int_work = abnormal_interrupt_basic(hr_dev, int_st);
	else
	} else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
		queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
		int_work = IRQ_HANDLED;
	} else {
		dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
	}

	return IRQ_RETVAL(int_work);
}
@@ -6344,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
		}
	}

	INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);

	hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
	if (!hr_dev->irq_workq) {
		dev_err(dev, "failed to create irq workqueue.\n");
+12 −0
Original line number Diff line number Diff line
@@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
	HNS_ROCE_OPC_CFG_GMV_TBL			= 0x850f,
	HNS_ROCE_OPC_CFG_GMV_BT				= 0x8510,
	HNS_ROCE_OPC_EXT_CFG				= 0x8512,
	HNS_ROCE_QUERY_RAM_ECC				= 0x8513,
	HNS_SWITCH_PARAMETER_CFG			= 0x1033,
};

@@ -1107,6 +1108,11 @@ enum {
#define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
#define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)

/*
 * Fields of HNS_ROCE_QUERY_RAM_ECC
 *
 * Three consecutive 32-bit fields (bits 31:0, 63:32 and 95:64) of the
 * command data, read back after the query command completes:
 *   1BIT_ERR - non-zero when a 1bit-ECC error is reported
 *   RES_TYPE - which RAM resource reported it
 *   TAG      - index of the affected entry, passed on to the recovery
 *              commands
 */
#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)

struct hns_roce_cfg_sgid_tb {
	__le32	table_idx_rsv;
	__le32	vf_sgid_l;
@@ -1343,6 +1349,12 @@ struct hns_roce_dip {
	struct list_head node; /* all dips are on a list */
};

/* Decoded result of one HNS_ROCE_QUERY_RAM_ECC command. */
struct fmea_ram_ecc {
	u32	is_ecc_err;	/* non-zero when a 1bit-ECC error was reported */
	u32	res_type;	/* reporting resource, an ecc_resource_type value */
	u32	index;		/* tag of the affected entry within that resource */
};

/* only for RNR timeout issue of HIP08 */
#define HNS_ROCE_CLOCK_ADJUST 1000
#define HNS_ROCE_MAX_CQ_PERIOD 65