Commit f605f26e authored by Bob Pearson's avatar Bob Pearson Committed by Jason Gunthorpe
Browse files

RDMA/rxe: Protect QP state with qp->state_lock

Currently the rxe driver makes little effort to make the changes to qp
state (which includes qp->attr.qp_state, qp->attr.sq_draining and
qp->valid) atomic between different client threads and IO threads. In
particular a common template is for an RDMA application to call
ib_modify_qp() to move a qp to ERR state and then wait until all the
packet and work queues have drained before calling ib_destroy_qp(). None
of these state changes are protected by locks to assure that the changes
are executed atomically and that memory barriers are included. This has
been observed to lead to incorrect behavior around qp cleanup.

This patch continues the work of the previous patches in this series and
adds locking code around qp state changes and lookups.

Link: https://lore.kernel.org/r/20230405042611.6467-5-rpearsonhpe@gmail.com


Signed-off-by: default avatarBob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@nvidia.com>
parent 7b560b89
Loading
Loading
Loading
Loading
+30 −18
Original line number Diff line number Diff line
@@ -118,10 +118,12 @@ void retransmit_timer(struct timer_list *t)

	rxe_dbg_qp(qp, "retransmit timer fired\n");

	spin_lock_bh(&qp->state_lock);
	if (qp->valid) {
		qp->comp.timeout = 1;
		rxe_sched_task(&qp->comp.task);
	}
	spin_unlock_bh(&qp->state_lock);
}

void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
@@ -479,9 +481,8 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)

static void comp_check_sq_drain_done(struct rxe_qp *qp)
{
	if (unlikely(qp_state(qp) == IB_QPS_SQD)) {
		/* state_lock used by requester & completer */
	spin_lock_bh(&qp->state_lock);
	if (unlikely(qp_state(qp) == IB_QPS_SQD)) {
		if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) {
			qp->attr.sq_draining = 0;
			spin_unlock_bh(&qp->state_lock);
@@ -497,8 +498,8 @@ static void comp_check_sq_drain_done(struct rxe_qp *qp)
			}
			return;
		}
		spin_unlock_bh(&qp->state_lock);
	}
	spin_unlock_bh(&qp->state_lock);
}

static inline enum comp_state complete_ack(struct rxe_qp *qp,
@@ -614,6 +615,26 @@ static void free_pkt(struct rxe_pkt_info *pkt)
	ib_device_put(dev);
}

/* reset the retry timer if
 * - QP is type RC
 * - there is a packet sent by the requester that
 *   might be acked (we still might get spurious
 *   timeouts but try to keep them as few as possible)
 * - the timeout parameter is set
 * - the QP is alive
 */
static void reset_retry_timer(struct rxe_qp *qp)
{
	if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) {
		spin_lock_bh(&qp->state_lock);
		if (qp_state(qp) >= IB_QPS_RTS &&
		    psn_compare(qp->req.psn, qp->comp.psn) > 0)
			mod_timer(&qp->retrans_timer,
				  jiffies + qp->qp_timeout_jiffies);
		spin_unlock_bh(&qp->state_lock);
	}
}

int rxe_completer(struct rxe_qp *qp)
{
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
@@ -623,14 +644,17 @@ int rxe_completer(struct rxe_qp *qp)
	enum comp_state state;
	int ret;

	spin_lock_bh(&qp->state_lock);
	if (!qp->valid || qp_state(qp) == IB_QPS_ERR ||
			  qp_state(qp) == IB_QPS_RESET) {
		bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR);

		drain_resp_pkts(qp);
		flush_send_queue(qp, notify);
		spin_unlock_bh(&qp->state_lock);
		goto exit;
	}
	spin_unlock_bh(&qp->state_lock);

	if (qp->comp.timeout) {
		qp->comp.timeout_retry = 1;
@@ -718,20 +742,7 @@ int rxe_completer(struct rxe_qp *qp)
				break;
			}

			/* re reset the timeout counter if
			 * (1) QP is type RC
			 * (2) the QP is alive
			 * (3) there is a packet sent by the requester that
			 *     might be acked (we still might get spurious
			 *     timeouts but try to keep them as few as possible)
			 * (4) the timeout parameter is set
			 */
			if ((qp_type(qp) == IB_QPT_RC) &&
			    (qp_state(qp) >= IB_QPS_RTS) &&
			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
			    qp->qp_timeout_jiffies)
				mod_timer(&qp->retrans_timer,
					  jiffies + qp->qp_timeout_jiffies);
			reset_retry_timer(qp);
			goto exit;

		case COMPST_ERROR_RETRY:
@@ -793,6 +804,7 @@ int rxe_completer(struct rxe_qp *qp)
				 */
				qp->req.wait_for_rnr_timer = 1;
				rxe_dbg_qp(qp, "set rnr nak timer\n");
				// TODO who protects from destroy_qp??
				mod_timer(&qp->rnr_nak_timer,
					  jiffies + rnrnak_jiffies(aeth_syn(pkt)
						& ~AETH_TYPE_MASK));
+3 −0
Original line number Diff line number Diff line
@@ -413,11 +413,14 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
	int is_request = pkt->mask & RXE_REQ_MASK;
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);

	spin_lock_bh(&qp->state_lock);
	if ((is_request && (qp_state(qp) < IB_QPS_RTS)) ||
	    (!is_request && (qp_state(qp) < IB_QPS_RTR))) {
		spin_unlock_bh(&qp->state_lock);
		rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n");
		goto drop;
	}
	spin_unlock_bh(&qp->state_lock);

	rxe_icrc_generate(skb, pkt);

+85 −68
Original line number Diff line number Diff line
@@ -325,8 +325,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
	if (err)
		goto err2;

	spin_lock_bh(&qp->state_lock);
	qp->attr.qp_state = IB_QPS_RESET;
	qp->valid = 1;
	spin_unlock_bh(&qp->state_lock);

	return 0;

@@ -377,27 +379,9 @@ int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init)
	return 0;
}

/* called by the modify qp verb, this routine checks all the parameters before
 * making any changes
 */
int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
		    struct ib_qp_attr *attr, int mask)
{
	enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ?
					attr->cur_qp_state : qp->attr.qp_state;
	enum ib_qp_state new_state = (mask & IB_QP_STATE) ?
					attr->qp_state : cur_state;

	if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) {
		rxe_dbg_qp(qp, "invalid mask or state\n");
		goto err1;
	}

	if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) {
		if (qp->attr.sq_draining && new_state != IB_QPS_ERR)
			goto err1;
	}

	if (mask & IB_QP_PORT) {
		if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) {
			rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num);
@@ -508,22 +492,96 @@ static void rxe_qp_reset(struct rxe_qp *qp)
/* move the qp to the error state */
void rxe_qp_error(struct rxe_qp *qp)
{
	spin_lock_bh(&qp->state_lock);
	qp->attr.qp_state = IB_QPS_ERR;

	/* drain work and packet queues */
	rxe_sched_task(&qp->resp.task);
	rxe_sched_task(&qp->comp.task);
	rxe_sched_task(&qp->req.task);
	spin_unlock_bh(&qp->state_lock);
}

static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr,
		       int mask)
{
	spin_lock_bh(&qp->state_lock);
	qp->attr.sq_draining = 1;
	rxe_sched_task(&qp->comp.task);
	rxe_sched_task(&qp->req.task);
	spin_unlock_bh(&qp->state_lock);
}

/* caller should hold qp->state_lock */
static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr,
			    int mask)
{
	enum ib_qp_state cur_state;
	enum ib_qp_state new_state;

	cur_state = (mask & IB_QP_CUR_STATE) ?
				attr->cur_qp_state : qp->attr.qp_state;
	new_state = (mask & IB_QP_STATE) ?
				attr->qp_state : cur_state;

	if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask))
		return -EINVAL;

	if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) {
		if (qp->attr.sq_draining && new_state != IB_QPS_ERR)
			return -EINVAL;
	}

	return 0;
}

static const char *const qps2str[] = {
	[IB_QPS_RESET]	= "RESET",
	[IB_QPS_INIT]	= "INIT",
	[IB_QPS_RTR]	= "RTR",
	[IB_QPS_RTS]	= "RTS",
	[IB_QPS_SQD]	= "SQD",
	[IB_QPS_SQE]	= "SQE",
	[IB_QPS_ERR]	= "ERR",
};

/* called by the modify qp verb */
int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
		     struct ib_udata *udata)
{
	enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ?
				attr->cur_qp_state : qp->attr.qp_state;
	int err;

	if (mask & IB_QP_CUR_STATE)
		qp->attr.cur_qp_state = attr->qp_state;

	if (mask & IB_QP_STATE) {
		spin_lock_bh(&qp->state_lock);
		err = __qp_chk_state(qp, attr, mask);
		if (!err) {
			qp->attr.qp_state = attr->qp_state;
			rxe_dbg_qp(qp, "state -> %s\n",
					qps2str[attr->qp_state]);
		}
		spin_unlock_bh(&qp->state_lock);

		if (err)
			return err;

		switch (attr->qp_state) {
		case IB_QPS_RESET:
			rxe_qp_reset(qp);
			break;
		case IB_QPS_SQD:
			rxe_qp_sqd(qp, attr, mask);
			break;
		case IB_QPS_ERR:
			rxe_qp_error(qp);
			break;
		default:
			break;
		}
	}

	if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
		int max_rd_atomic = attr->max_rd_atomic ?
			roundup_pow_of_two(attr->max_rd_atomic) : 0;
@@ -545,9 +603,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
			return err;
	}

	if (mask & IB_QP_CUR_STATE)
		qp->attr.cur_qp_state = attr->qp_state;

	if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
		qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify;

@@ -627,48 +682,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
	if (mask & IB_QP_DEST_QPN)
		qp->attr.dest_qp_num = attr->dest_qp_num;

	if (mask & IB_QP_STATE) {
		qp->attr.qp_state = attr->qp_state;

		switch (attr->qp_state) {
		case IB_QPS_RESET:
			rxe_dbg_qp(qp, "state -> RESET\n");
			rxe_qp_reset(qp);
			break;

		case IB_QPS_INIT:
			rxe_dbg_qp(qp, "state -> INIT\n");
			break;

		case IB_QPS_RTR:
			rxe_dbg_qp(qp, "state -> RTR\n");
			break;

		case IB_QPS_RTS:
			rxe_dbg_qp(qp, "state -> RTS\n");
			break;

		case IB_QPS_SQD:
			rxe_dbg_qp(qp, "state -> SQD\n");
			if (cur_state != IB_QPS_SQD) {
				qp->attr.sq_draining = 1;
				rxe_sched_task(&qp->comp.task);
				rxe_sched_task(&qp->req.task);
			}
			break;

		case IB_QPS_SQE:
			rxe_dbg_qp(qp, "state -> SQE !!?\n");
			/* Not possible from modify_qp. */
			break;

		case IB_QPS_ERR:
			rxe_dbg_qp(qp, "state -> ERR\n");
			rxe_qp_error(qp);
			break;
		}
	}

	return 0;
}

@@ -695,10 +708,12 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
	/* Applications that get this state typically spin on it.
	 * Yield the processor
	 */
	if (qp->attr.sq_draining)
	spin_lock_bh(&qp->state_lock);
	if (qp->attr.sq_draining) {
		spin_unlock_bh(&qp->state_lock);
		cond_resched();

	rxe_dbg_qp(qp, "attr->sq_draining = %d\n", attr->sq_draining);
	}
	spin_unlock_bh(&qp->state_lock);

	return 0;
}
@@ -722,7 +737,9 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
{
	struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work);

	spin_lock_bh(&qp->state_lock);
	qp->valid = 0;
	spin_unlock_bh(&qp->state_lock);
	qp->qp_timeout_jiffies = 0;

	if (qp_type(qp) == IB_QPT_RC) {
+8 −2
Original line number Diff line number Diff line
@@ -38,13 +38,19 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
		return -EINVAL;
	}

	spin_lock_bh(&qp->state_lock);
	if (pkt->mask & RXE_REQ_MASK) {
		if (unlikely(qp_state(qp) < IB_QPS_RTR))
		if (unlikely(qp_state(qp) < IB_QPS_RTR)) {
			spin_unlock_bh(&qp->state_lock);
			return -EINVAL;
		}
	} else {
		if (unlikely(qp_state(qp) < IB_QPS_RTS))
		if (unlikely(qp_state(qp) < IB_QPS_RTS)) {
			spin_unlock_bh(&qp->state_lock);
			return -EINVAL;
		}
	}
	spin_unlock_bh(&qp->state_lock);

	return 0;
}
+47 −24
Original line number Diff line number Diff line
@@ -102,24 +102,33 @@ void rnr_nak_timer(struct timer_list *t)

	rxe_dbg_qp(qp, "nak timer fired\n");

	spin_lock_bh(&qp->state_lock);
	if (qp->valid) {
		/* request a send queue retry */
		qp->req.need_retry = 1;
		qp->req.wait_for_rnr_timer = 0;
		rxe_sched_task(&qp->req.task);
	}
	spin_unlock_bh(&qp->state_lock);
}

static void req_check_sq_drain_done(struct rxe_qp *qp)
{
	struct rxe_queue *q = qp->sq.queue;
	unsigned int index = qp->req.wqe_index;
	unsigned int cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT);
	struct rxe_send_wqe *wqe = queue_addr_from_index(q, cons);
	struct rxe_queue *q;
	unsigned int index;
	unsigned int cons;
	struct rxe_send_wqe *wqe;

	spin_lock_bh(&qp->state_lock);
	if (qp_state(qp) == IB_QPS_SQD) {
		q = qp->sq.queue;
		index = qp->req.wqe_index;
		cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT);
		wqe = queue_addr_from_index(q, cons);

	if (unlikely(qp_state(qp) == IB_QPS_SQD)) {
		/* check to see if we are drained;
		 * state_lock used by requester and completer
		 */
		spin_lock_bh(&qp->state_lock);
		do {
			if (!qp->attr.sq_draining)
				/* comp just finished */
@@ -144,28 +153,40 @@ static void req_check_sq_drain_done(struct rxe_qp *qp)
			}
			return;
		} while (0);
		spin_unlock_bh(&qp->state_lock);
	}
	spin_unlock_bh(&qp->state_lock);
}

static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp)
{
	struct rxe_send_wqe *wqe;
	struct rxe_queue *q = qp->sq.queue;
	unsigned int index = qp->req.wqe_index;
	unsigned int prod;

	req_check_sq_drain_done(qp);

	prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT);
	if (index == prod)
		return NULL;
	else
		return queue_addr_from_index(q, index);
}

	wqe = queue_addr_from_index(q, index);
static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
{
	struct rxe_send_wqe *wqe;

	req_check_sq_drain_done(qp);

	wqe = __req_next_wqe(qp);
	if (wqe == NULL)
		return NULL;

	spin_lock(&qp->state_lock);
	if (unlikely((qp_state(qp) == IB_QPS_SQD) &&
		     (wqe->state != wqe_state_processing)))
		     (wqe->state != wqe_state_processing))) {
		spin_unlock(&qp->state_lock);
		return NULL;
	}
	spin_unlock(&qp->state_lock);

	wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
	return wqe;
@@ -656,15 +677,16 @@ int rxe_requester(struct rxe_qp *qp)
	struct rxe_ah *ah;
	struct rxe_av *av;

	if (unlikely(!qp->valid))
	spin_lock_bh(&qp->state_lock);
	if (unlikely(!qp->valid)) {
		spin_unlock_bh(&qp->state_lock);
		goto exit;
	}

	if (unlikely(qp_state(qp) == IB_QPS_ERR)) {
		wqe = req_next_wqe(qp);
		wqe = __req_next_wqe(qp);
		spin_unlock_bh(&qp->state_lock);
		if (wqe)
			/*
			 * Generate an error completion for error qp state
			 */
			goto err;
		else
			goto exit;
@@ -678,8 +700,10 @@ int rxe_requester(struct rxe_qp *qp)
		qp->req.wait_psn = 0;
		qp->req.need_retry = 0;
		qp->req.wait_for_rnr_timer = 0;
		spin_unlock_bh(&qp->state_lock);
		goto exit;
	}
	spin_unlock_bh(&qp->state_lock);

	/* we come here if the retransmit timer has fired
	 * or if the rnr timer has fired. If the retransmit
@@ -839,8 +863,7 @@ int rxe_requester(struct rxe_qp *qp)
	/* update wqe_index for each wqe completion */
	qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
	wqe->state = wqe_state_error;
	qp->attr.qp_state = IB_QPS_ERR;
	rxe_sched_task(&qp->comp.task);
	rxe_qp_error(qp);
exit:
	ret = -EAGAIN;
out:
Loading