Commit 209f4040 authored by wenglianfa's avatar wenglianfa Committed by Chengchang Tang
Browse files

RDMA/hns: Fix RoCEE hang when multiple QP banks use EXT_SGE

driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IB30V8



----------------------------------------------------------------------

When QPs of multiple banks are used, the RoCEE may hang. This is
because QPs of different banks may interfere with each other in
certain cases when processing extended SGEs.

To solve this problem, the QP-bank-limit mechanism is introduced.
When this mechanism is enabled, the number of QP banks must be limited
to ensure that extended SGEs can be used.

If this mechanism is not applied, the FW will limit the maximum number of
SGEs and make extended SGEs unavailable to avoid the HW hang.

Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Xinghai Cen <cenxinghai@h-partners.com>
parent 5b469415
Loading
Loading
Loading
Loading
+12 −3
Original line number Diff line number Diff line
@@ -52,9 +52,10 @@ void hns_roce_put_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx)

void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx)
{
#define INVALID_LOAD_CQNUM 0xFFFFFFFF
	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device);
	struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
	u32 least_load = cq_table->ctx_num[0];
	u32 least_load = INVALID_LOAD_CQNUM;
	u8 bankid = 0;
	u8 i;

@@ -62,7 +63,10 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx)
		return;

	mutex_lock(&cq_table->bank_mutex);
	for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) {
	for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) {
		if (!(cq_table->valid_cq_bank_mask & BIT(i)))
			continue;

		if (cq_table->ctx_num[i] < least_load) {
			least_load = cq_table->ctx_num[i];
			bankid = i;
@@ -98,7 +102,7 @@ static u8 select_cq_bankid(struct hns_roce_dev *hr_dev, struct hns_roce_bank *ba
	struct hns_roce_ucontext *uctx = udata ?
		rdma_udata_to_drv_context(udata, struct hns_roce_ucontext,
					  ibucontext) : NULL;
	/* only apply for HIP09 and HIP10 now, and use bank 0 for kernel */
	/* only HIP08 is not applied now, and use bank 0 for kernel */
	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
		return uctx ? uctx->cq_bank_id : 0;

@@ -600,6 +604,11 @@ void hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
		cq_table->bank[i].max = hr_dev->caps.num_cqs /
					HNS_ROCE_CQ_BANK_NUM - 1;
	}

	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK)
		cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_LIMIT;
	else
		cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_DEFAULT;
}

void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev)
+6 −0
Original line number Diff line number Diff line
@@ -105,6 +105,10 @@

#define CQ_BANKID_SHIFT 2
#define CQ_BANKID_MASK GENMASK(1, 0)
#define VALID_CQ_BANK_MASK_DEFAULT 0xF
#define VALID_CQ_BANK_MASK_LIMIT 0x9

#define VALID_EXT_SGE_QP_BANK_MASK_LIMIT 0x41

#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF
#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF
@@ -168,6 +172,7 @@ enum {
	HNS_ROCE_CAP_FLAG_CQE_INLINE		= BIT(19),
	HNS_ROCE_CAP_FLAG_BOND			= BIT(21),
	HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB         = BIT(22),
	HNS_ROCE_CAP_FLAG_LIMIT_BANK            = BIT(23),
};

#define HNS_ROCE_DB_TYPE_COUNT			2
@@ -594,6 +599,7 @@ struct hns_roce_cq_table {
	struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM];
	struct mutex			bank_mutex;
	u32 ctx_num[HNS_ROCE_CQ_BANK_NUM];
	u8 valid_cq_bank_mask;
};

struct hns_roce_srq_table {
+5 −0
Original line number Diff line number Diff line
@@ -262,6 +262,11 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
		props->max_srq_sge = hr_dev->caps.max_srq_sges;
	}

	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) {
		props->max_cq >>= 1;
		props->max_qp >>= 1;
	}

	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR &&
	    hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+39 −10
Original line number Diff line number Diff line
@@ -198,22 +198,16 @@ static u8 get_affinity_cq_bank(u8 qp_bank)
	return (qp_bank >> 1) & CQ_BANKID_MASK;
}

static u8 get_least_load_bankid_for_qp(struct ib_qp_init_attr *init_attr,
					struct hns_roce_bank *bank)
static u8 get_least_load_bankid_for_qp(struct hns_roce_bank *bank, u8 valid_qp_bank_mask)
{
#define INVALID_LOAD_QPNUM 0xFFFFFFFF
	struct ib_cq *scq = init_attr->send_cq;
	u32 least_load = INVALID_LOAD_QPNUM;
	unsigned long cqn = 0;
	u8 bankid = 0;
	u32 bankcnt;
	u8 i;

	if (scq)
		cqn = to_hr_cq(scq)->cqn;

	for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) {
		if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK)))
		if (!(valid_qp_bank_mask & BIT(i)))
			continue;

		bankcnt = bank[i].inuse;
@@ -247,6 +241,42 @@ static int alloc_qpn_with_bankid(struct hns_roce_bank *bank, u8 bankid,

	return 0;
}

/*
 * Report whether a QP created with @init_attr is treated as using
 * extended SGEs.
 *
 * Returns true for UD and GSI QPs regardless of their SGE count, and for
 * any other QP type whose requested max_send_sge exceeds
 * HNS_ROCE_SGE_IN_WQE; false otherwise.
 */
static bool use_ext_sge(struct ib_qp_init_attr *init_attr)
{
	switch (init_attr->qp_type) {
	case IB_QPT_UD:
	case IB_QPT_GSI:
		return true;
	default:
		return init_attr->cap.max_send_sge > HNS_ROCE_SGE_IN_WQE;
	}
}

/*
 * Choose the QP bank for a new QP.
 *
 * Builds a bitmask of candidate banks and passes it, together with the
 * device's QP bank table, to get_least_load_bankid_for_qp(), whose result
 * is returned. A bank is a candidate when it passes both filters:
 *  - if @init_attr has a send CQ, the bank's affinity CQ bank
 *    (get_affinity_cq_bank()) must equal the CQ_BANKID_MASK bits of that
 *    CQ's cqn;
 *  - if HNS_ROCE_CAP_FLAG_LIMIT_BANK is set and the QP uses extended SGEs
 *    (use_ext_sge()), the bank must be in VALID_EXT_SGE_QP_BANK_MASK_LIMIT.
 *
 * NOTE(review): the two filters combined can leave the candidate mask
 * empty; confirm that get_least_load_bankid_for_qp() handles a zero mask
 * as intended.
 */
static u8 select_qp_bankid(struct hns_roce_dev *hr_dev,
			   struct ib_qp_init_attr *init_attr)
{
	/* Both predicates are loop-invariant, so evaluate them once. */
	bool limit_ext_sge =
		(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) &&
		use_ext_sge(init_attr);
	struct ib_cq *send_cq = init_attr->send_cq;
	u8 candidates = 0;
	u8 scq_bank = 0;
	u8 bank_idx;

	if (send_cq)
		scq_bank = to_hr_cq(send_cq)->cqn & CQ_BANKID_MASK;

	for (bank_idx = 0; bank_idx < HNS_ROCE_QP_BANK_NUM; bank_idx++) {
		bool affinity_ok = !send_cq ||
				   get_affinity_cq_bank(bank_idx) == scq_bank;
		bool ext_sge_ok = !limit_ext_sge ||
				  (VALID_EXT_SGE_QP_BANK_MASK_LIMIT &
				   BIT(bank_idx));

		if (affinity_ok && ext_sge_ok)
			candidates |= BIT(bank_idx);
	}

	return get_least_load_bankid_for_qp(hr_dev->qp_table.bank, candidates);
}

static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
		     struct ib_qp_init_attr *init_attr)
{
@@ -259,8 +289,7 @@ static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
		num = 1;
	} else {
		mutex_lock(&qp_table->bank_mutex);
		bankid = get_least_load_bankid_for_qp(init_attr, qp_table->bank);

		bankid = select_qp_bankid(hr_dev, init_attr);
		ret = alloc_qpn_with_bankid(&qp_table->bank[bankid], bankid,
					    &num);
		if (ret) {