Commit 6be22ed6 authored by Ruozhu Li, committed by Yang Yingliang
Browse files

nvme-rdma: destroy cm id before destroy qp to avoid use after free

mainline inclusion
from mainline-v5.15-rc2
commit 9817d763
category: bugfix
bugzilla: NA
CVE: NA
Link: https://gitee.com/openeuler/kernel/issues/I1WGZE



We got a panic when the host received a rej cm event soon after a connect
error cm event.
When the host gets a connect error cm event, it destroys the qp immediately,
but the cm_id is still valid at that point. If another cm event arrives then,
its handler tries to access the qp which was already destroyed. This results
in the kernel panic below:

[87816.777089] [20473] ib_cm:cm_rep_handler:2343: cm_rep_handler: Stale connection. local_comm_id -154357094, remote_comm_id -1133609861
[87816.777223] [20473] ib_cm:cm_init_qp_rtr_attr:4162: cm_init_qp_rtr_attr: local_id -1150387077, cm_id_priv->id.state: 13
[87816.777225] [20473] rdma_cm:cma_rep_recv:1871: RDMA CM: CONNECT_ERROR: failed to handle reply. status -22
[87816.777395] [20473] ib_cm:ib_send_cm_rej:2781: ib_send_cm_rej: local_id -1150387077, cm_id->state: 13
[87816.777398] [20473] nvme_rdma:nvme_rdma_cm_handler:1718: nvme nvme278: connect error (6): status -22 id 00000000c3809aff
[87816.801155] [20473] nvme_rdma:nvme_rdma_cm_handler:1742: nvme nvme278: CM error event 6
[87816.801160] [20473] rdma_cm:cma_ib_handler:1947: RDMA CM: REJECTED: consumer defined
[87816.801163] nvme nvme278: rdma connection establishment failed (-104)
[87816.801168] BUG: unable to handle kernel NULL pointer dereference at 0000000000000370
[87816.801201] RIP: 0010:_ib_modify_qp+0x6e/0x3a0 [ib_core]
[87816.801215] Call Trace:
[87816.801223]  cma_modify_qp_err+0x52/0x80 [rdma_cm]
[87816.801228]  ? __dynamic_pr_debug+0x8a/0xb0
[87816.801232]  cma_ib_handler+0x25a/0x2f0 [rdma_cm]
[87816.801235]  cm_process_work+0x60/0xe0 [ib_cm]
[87816.801238]  cm_work_handler+0x13b/0x1b97 [ib_cm]
[87816.801243]  ? __switch_to_asm+0x35/0x70
[87816.801244]  ? __switch_to_asm+0x41/0x70
[87816.801246]  ? __switch_to_asm+0x35/0x70
[87816.801248]  ? __switch_to_asm+0x41/0x70
[87816.801252]  ? __switch_to+0x8c/0x480
[87816.801254]  ? __switch_to_asm+0x41/0x70
[87816.801256]  ? __switch_to_asm+0x35/0x70
[87816.801259]  process_one_work+0x1a7/0x3b0
[87816.801263]  worker_thread+0x30/0x390
[87816.801266]  ? create_worker+0x1a0/0x1a0
[87816.801268]  kthread+0x112/0x130
[87816.801270]  ? kthread_flush_work_fn+0x10/0x10
[87816.801272]  ret_from_fork+0x35/0x40

-------------------------------------------------

We should always destroy the cm_id before destroying the qp, to avoid
receiving a cma event after the qp was destroyed, which may lead to a
use-after-free.
In the RDMA connection establishment error flow, don't destroy the qp in the
cm event handler. Just report cm_error to the upper level; the qp will be
destroyed in nvme_rdma_alloc_queue() after the cm id is destroyed.

Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Conflicts:
        drivers/nvme/host/rdma.c
    	[lrz: adjust context]
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
parent 9f5a4906
Loading
Loading
Loading
Loading
+3 −13
Original line number Diff line number Diff line
@@ -575,8 +575,8 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
		return;

	nvme_rdma_destroy_queue_ib(queue);
	rdma_destroy_id(queue->cm_id);
	nvme_rdma_destroy_queue_ib(queue);
	mutex_destroy(&queue->queue_lock);
}

@@ -1507,14 +1507,10 @@ static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
	for (i = 0; i < queue->queue_size; i++) {
		ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
		if (ret)
			goto out_destroy_queue_ib;
			return ret;
	}

	return 0;

out_destroy_queue_ib:
	nvme_rdma_destroy_queue_ib(queue);
	return ret;
}

static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
@@ -1606,14 +1602,10 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"rdma_connect failed (%d).\n", ret);
		goto out_destroy_queue_ib;
		return ret;
	}

	return 0;

out_destroy_queue_ib:
	nvme_rdma_destroy_queue_ib(queue);
	return ret;
}

static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
@@ -1644,8 +1636,6 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
		nvme_rdma_destroy_queue_ib(queue);
		/* fall through */
	case RDMA_CM_EVENT_ADDR_ERROR:
		dev_dbg(queue->ctrl->ctrl.device,
			"CM error event %d\n", ev->event);