Commit abcc57d3 authored by wenglianfa, committed by Zhaojiahui
Browse files

RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci

maillist inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IB530Z
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=571e4ab8a45e530623ab129803f090a844dd3fe9



----------------------------------------------------------------------

eq_db_ci is updated only after all AEQEs are processed in the AEQ
interrupt handler, which is not timely enough and may result in
AEQ overflow. Two optimization methods are proposed:
1. Set an upper limit for AEQE processing.
2. Move time-consuming operations such as printing to the bottom
half of the interrupt.

cmd events and flush_cqe events are still fully processed in the top half
to ensure timely handling.

Fixes: a5073d60 ("RDMA/hns: Add eq support of hip08")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://patch.msgid.link/20241024124000.2931869-2-huangjunxian6@hisilicon.com


Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Zhaojiahui <zhaojiahui12@h-partners.com>
parent 95685dd6
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -1538,6 +1538,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
 void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp);
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
+void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
 void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
 u8 hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
 void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
+6 −1
Original line number Diff line number Diff line
@@ -6780,7 +6780,12 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
 	case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
 	case HNS_ROCE_EVENT_TYPE_COMM_EST:
 	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
 	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+	case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
+	case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
 		hns_roce_qp_event(hr_dev, queue_num, event_type);
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
@@ -6879,7 +6884,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
 		case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
 		case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
-			hns_roce_qp_event(hr_dev, queue_num, event_type);
+			hns_roce_flush_cqe(hr_dev, queue_num);
 			break;
 		case HNS_ROCE_EVENT_TYPE_MB:
 			hns_roce_cmd_event(hr_dev,
+35 −19
Original line number Diff line number Diff line
@@ -39,6 +39,25 @@
#include "hns_roce_hem.h"
#include "hns_roce_dca.h"

/* Look up a QP by number and take a reference on it; the caller must drop
 * the reference (refcount_dec_and_test + complete) when done.
 * Returns NULL if no QP with that number exists.
 */
static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev,
					      u32 qpn)
{
	struct hns_roce_qp *qp;
	unsigned long flags;

	/* Grab the reference while holding the table lock so the QP
	 * cannot disappear between lookup and refcount_inc.
	 */
	xa_lock_irqsave(&hr_dev->qp_table_xa, flags);
	qp = __hns_roce_qp_lookup(hr_dev, qpn);
	if (qp)
		refcount_inc(&qp->refcount);
	xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags);

	/* Report bogus lookups after the lock has been dropped. */
	if (!qp)
		dev_warn(hr_dev->dev, "async event for bogus QP %08x\n", qpn);

	return qp;
}

static void flush_work_handle(struct work_struct *work)
{
	struct hns_roce_work *flush_work = container_of(work,
@@ -95,31 +114,28 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
 
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_qp *qp;
 
-	xa_lock(&hr_dev->qp_table_xa);
-	qp = __hns_roce_qp_lookup(hr_dev, qpn);
-	if (qp)
-		refcount_inc(&qp->refcount);
-	xa_unlock(&hr_dev->qp_table_xa);
-
-	if (!qp) {
-		dev_warn(dev, "Async event for bogus QP %08x\n", qpn);
+	qp = hns_roce_qp_lookup(hr_dev, qpn);
+	if (!qp)
 		return;
-	}
 
-	if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR ||
-	    event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR ||
-	    event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR ||
-	    event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION ||
-	    event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH) {
-		qp->state = IB_QPS_ERR;
+	qp->event(qp, (enum hns_roce_event)event_type);
 
-		flush_cqe(hr_dev, qp);
-	}
+	if (refcount_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+}
 
-	qp->event(qp, (enum hns_roce_event)event_type);
+void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn)
+{
+	struct hns_roce_qp *qp;
+
+	qp = hns_roce_qp_lookup(hr_dev, qpn);
+	if (!qp)
+		return;
+
+	qp->state = IB_QPS_ERR;
+	flush_cqe(hr_dev, qp);
 
 	if (refcount_dec_and_test(&qp->refcount))
 		complete(&qp->free);