Commit 2b39c452 authored by wenglianfa, committed by huwentao
Browse files

RDMA/hns: Fix flush cqe error when racing with destroy qp

mainline inclusion
from mainline-v6.13-rc2
commit 377a2097705b915325a67e4d44f9f2844e567809
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IB7JSL

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=377a2097705b915325a67e4d44f9f2844e567809



----------------------------------------------------------------------

QP needs to be modified to IB_QPS_ERROR to trigger HW flush cqe. But
when this process races with destroy qp, the destroy-qp process may
modify the QP to IB_QPS_RESET first. In this case flush cqe will fail
since it is invalid to modify qp from IB_QPS_RESET to IB_QPS_ERROR.

Add lock and bit flag to make sure pending flush cqe work is completed
first and no more new works will be added.

Fixes: ffd541d4 ("RDMA/hns: Add the workqueue framework for flush cqe handler")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://patch.msgid.link/20241024124000.2931869-3-huangjunxian6@hisilicon.com


Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: huwentao <huwentao19@h-partners.com>
Conflicts:
        drivers/infiniband/hw/hns/hns_roce_device.h
        drivers/infiniband/hw/hns/hns_roce_hw_v2.c
parent 68b0fb4e
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -717,6 +717,7 @@ struct hns_roce_dev;

enum {
	HNS_ROCE_FLUSH_FLAG = 0,
	HNS_ROCE_STOP_FLUSH_FLAG = 1,
};

struct hns_roce_work {
@@ -777,6 +778,7 @@ struct hns_roce_qp {
	bool			delayed_destroy_flag;
	struct hns_roce_mtr_node *mtr_node;
	struct hns_roce_dip *dip;
	spinlock_t flush_lock;
};

struct hns_roce_ib_iboe {
+7 −0
Original line number Diff line number Diff line
@@ -6330,11 +6330,18 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
{
	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
	unsigned long flags;
	int ret;

	if (hr_qp->congest_type == HNS_ROCE_CONGEST_TYPE_DIP)
		put_dip_ctx_idx(hr_dev, hr_qp);

	/* Make sure flush_cqe() is completed */
	spin_lock_irqsave(&hr_qp->flush_lock, flags);
	set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag);
	spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
	flush_work(&hr_qp->flush_work.work);

	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
	if (ret)
		ibdev_err_ratelimited(&hr_dev->ib_dev,
+13 −2
Original line number Diff line number Diff line
@@ -71,11 +71,18 @@ static void flush_work_handle(struct work_struct *work)
void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
	struct hns_roce_work *flush_work = &hr_qp->flush_work;
	unsigned long flags;

	spin_lock_irqsave(&hr_qp->flush_lock, flags);
	/* Exit directly after destroy_qp() */
	if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) {
		spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
		return;
	}

	flush_work->hr_dev = hr_dev;
	INIT_WORK(&flush_work->work, flush_work_handle);
	refcount_inc(&hr_qp->refcount);
	queue_work(hr_dev->irq_workq, &flush_work->work);
	spin_unlock_irqrestore(&hr_qp->flush_lock, flags);
}

void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
@@ -1348,6 +1355,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
				     struct ib_udata *udata,
				     struct hns_roce_qp *hr_qp)
{
	struct hns_roce_work *flush_work = &hr_qp->flush_work;
	struct hns_roce_ib_create_qp_resp resp = {};
	struct ib_device *ibdev = &hr_dev->ib_dev;
	struct hns_roce_ib_create_qp ucmd = {};
@@ -1356,9 +1364,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
	mutex_init(&hr_qp->mutex);
	spin_lock_init(&hr_qp->sq.lock);
	spin_lock_init(&hr_qp->rq.lock);
	spin_lock_init(&hr_qp->flush_lock);

	hr_qp->state = IB_QPS_RESET;
	hr_qp->flush_flag = 0;
	flush_work->hr_dev = hr_dev;
	INIT_WORK(&flush_work->work, flush_work_handle);

	ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd);
	if (ret) {