Commit 3c6cb20a authored by Kaike Wan's avatar Kaike Wan Committed by Doug Ledford
Browse files

IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs



This patch integrates TID RDMA WRITE protocol into normal RDMA verbs
framework. The TID RDMA WRITE protocol is an end-to-end protocol
between the hfi1 drivers on two OPA nodes that converts a qualified
RDMA WRITE request into a TID RDMA WRITE request to avoid data copying
on the responder side.

Reviewed-by: default avatarMike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: default avatarMitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: default avatarKaike Wan <kaike.wan@intel.com>
Signed-off-by: default avatarDennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parent 572f0c33
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
	.flags = RVT_OPERATION_USE_RESERVE,
},

[IB_WR_TID_RDMA_WRITE] = {
	.length = sizeof(struct ib_rdma_wr),
	.qpt_support = BIT(IB_QPT_RC),
	.flags = RVT_OPERATION_IGN_RNR_CNT,
},

};

static void flush_list_head(struct list_head *l)
@@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp)
	struct hfi1_qp_priv *priv = qp->priv;

	hfi1_del_tid_reap_timer(qp);
	hfi1_del_tid_retry_timer(qp);
	iowait_sdma_drain(&priv->s_iowait);
	qp_pio_drain(qp);
	flush_tx_list(qp);
+440 −47
Original line number Diff line number Diff line
@@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords;
	u32 hwords, hdrlen;
	u32 len = 0;
	u32 bth0 = 0, bth2 = 0;
	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_qp_priv *qpriv = qp->priv;
	bool last_pkt;
	u32 delta;
	u8 next = qp->s_tail_ack_queue;
	struct tid_rdma_request *req;

	trace_hfi1_rsp_make_rc_ack(qp, 0);
	lockdep_assert_held(&qp->s_lock);
@@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	if (priv->hdr_type == HFI1_PKT_TYPE_9B)
	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
	else
@@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else if (e->opcode == TID_OP(WRITE_REQ)) {
			/*
			 * If a TID RDMA WRITE RESP is being resent, we have to
			 * wait for the actual request. All requests that are to
			 * be resent will have their state set to
			 * TID_REQUEST_RESEND. When the new request arrives, the
			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
			 */
			req = ack_to_tid_req(e);
			if (req->state == TID_REQUEST_RESEND ||
			    req->state == TID_REQUEST_INIT_RESEND)
				goto bail;
			qp->s_ack_state = TID_OP(WRITE_RESP);
			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
			goto write_resp;
		} else if (e->opcode == TID_OP(READ_REQ)) {
			/*
			 * If a TID RDMA read response is being resent and
@@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;

	case TID_OP(WRITE_RESP):
write_resp:
		/*
		 * 1. Check if RVT_S_ACK_PENDING is set. If yes,
		 *    goto normal.
		 * 2. Attempt to allocate TID resources.
		 * 3. Remove RVT_S_RESP_PENDING flags from s_flags
		 * 4. If resources not available:
		 *    4.1 Set RVT_S_WAIT_TID_SPACE
		 *    4.2 Queue QP on RCD TID queue
		 *    4.3 Put QP on iowait list.
		 *    4.4 Build IB RNR NAK with appropriate timeout value
		 *    4.5 Return indication progress made.
		 * 5. If resources are available:
		 *    5.1 Program HW flow CSRs
		 *    5.2 Build TID RDMA WRITE RESP packet
		 *    5.3 If more resources needed, do 2.1 - 2.3.
		 *    5.4 Wake up next QP on RCD TID queue.
		 *    5.5 Return indication progress made.
		 */

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		req = ack_to_tid_req(e);

		/*
		 * Send scheduled RNR NAK's. RNR NAK's need to be sent at
		 * segment boundaries, not at request boundaries. Don't change
		 * s_ack_state because we are still in the middle of a request
		 */
		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
		    qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
		    req->cur_seg == req->alloc_seg) {
			qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
			goto normal_no_state;
		}

		bth2 = mask_psn(qp->s_ack_rdma_psn);
		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
							bth2, &len,
							&ps->s_txreq->ss);
		if (!hdrlen)
			return 0;

		hwords += hdrlen;
		bth0 = qp->s_ack_state << 24;
		qp->s_ack_rdma_psn++;
		if (req->cur_seg != req->total_segs)
			break;

		e->sent = 1;
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case TID_OP(READ_RESP):
read_resp:
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
@@ -298,8 +367,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		ps->s_txreq->ss = NULL;
normal_no_state:
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -311,9 +379,11 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		ps->s_txreq->ss = NULL;
	}
	qp->s_rdma_ack_cnt++;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->sde = qpriv->s_sde;
	ps->s_txreq->s_cur_size = len;
	ps->s_txreq->hdr_dwords = hwords;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
@@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
	int middle = 0;
	int delta;
	struct tid_rdma_flow *flow = NULL;
	struct tid_rdma_params *remote;

	trace_hfi1_sender_make_rc_req(qp);
	lockdep_assert_held(&qp->s_lock);
@@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
		goto done_free_tx;
	}

	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
		goto bail;

	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -586,6 +657,108 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
				qp->s_cur = 0;
			break;

		case IB_WR_TID_RDMA_WRITE:
			if (newreq) {
				/*
				 * Limit the number of TID RDMA WRITE requests.
				 */
				if (atomic_read(&priv->n_tid_requests) >=
				    HFI1_TID_RDMA_WRITE_CNT)
					goto bail;

				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}

			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
								&bth1, &bth2,
								&len);
			ss = NULL;
			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
				priv->s_tid_cur = qp->s_cur;
				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
					priv->s_tid_tail = qp->s_cur;
					priv->s_state = TID_OP(WRITE_RESP);
				}
			} else if (priv->s_tid_cur == priv->s_tid_head) {
				struct rvt_swqe *__w;
				struct tid_rdma_request *__r;

				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
				__r = wqe_to_tid_req(__w);

				/*
				 * The s_tid_cur pointer is advanced to s_cur if
				 * any of the following conditions about the WQE
				 * to which s_ti_cur currently points to are
				 * satisfied:
				 *   1. The request is not a TID RDMA WRITE
				 *      request,
				 *   2. The request is in the INACTIVE or
				 *      COMPLETE states (TID RDMA READ requests
				 *      stay at INACTIVE and TID RDMA WRITE
				 *      transition to COMPLETE when done),
				 *   3. The request is in the ACTIVE or SYNC
				 *      state and the number of completed
				 *      segments is equal to the total segment
				 *      count.
				 *      (If ACTIVE, the request is waiting for
				 *       ACKs. If SYNC, the request has not
				 *       received any responses because it's
				 *       waiting on a sync point.)
				 */
				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
				    __r->state == TID_REQUEST_INACTIVE ||
				    __r->state == TID_REQUEST_COMPLETE ||
				    ((__r->state == TID_REQUEST_ACTIVE ||
				      __r->state == TID_REQUEST_SYNC) &&
				     __r->comp_seg == __r->total_segs)) {
					if (priv->s_tid_tail ==
					    priv->s_tid_cur &&
					    priv->s_state ==
					    TID_OP(WRITE_DATA_LAST)) {
						priv->s_tid_tail = qp->s_cur;
						priv->s_state =
							TID_OP(WRITE_RESP);
					}
					priv->s_tid_cur = qp->s_cur;
				}
				/*
				 * A corner case: when the last TID RDMA WRITE
				 * request was completed, s_tid_head,
				 * s_tid_cur, and s_tid_tail all point to the
				 * same location. Other requests are posted and
				 * s_cur wraps around to the same location,
				 * where a new TID RDMA WRITE is posted. In
				 * this case, none of the indices need to be
				 * updated. However, the priv->s_state should.
				 */
				if (priv->s_tid_tail == qp->s_cur &&
				    priv->s_state == TID_OP(WRITE_DATA_LAST))
					priv->s_state = TID_OP(WRITE_RESP);
			}
			req = wqe_to_tid_req(wqe);
			if (newreq) {
				priv->s_tid_head = qp->s_cur;
				priv->pending_tid_w_resp += req->total_segs;
				atomic_inc(&priv->n_tid_requests);
				atomic_dec(&priv->n_requests);
			} else {
				req->state = TID_REQUEST_RESEND;
				req->comp_seg = delta_psn(bth2, wqe->psn);
				/*
				 * Pull back any segments since we are going
				 * to re-receive them.
				 */
				req->setup_head = req->clear_tail;
				priv->pending_tid_w_resp +=
					delta_psn(wqe->lpsn, bth2) + 1;
			}

			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
@@ -745,7 +918,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			qp->s_psn = wqe->lpsn + 1;
		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
			qp->s_psn = req->s_next_psn;
@@ -865,6 +1039,33 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;

	case TID_OP(WRITE_RESP):
		/*
		 * This value for s_state is used for restarting a TID RDMA
		 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
		 * for more).
		 */
		req = wqe_to_tid_req(wqe);
		req->state = TID_REQUEST_RESEND;
		rcu_read_lock();
		remote = rcu_dereference(priv->tid_rdma.remote);
		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
		len = wqe->length - (req->comp_seg * remote->max_len);
		rcu_read_unlock();

		bth2 = mask_psn(qp->s_psn);
		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
							&bth2, &len);
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		qp->s_state = TID_OP(WRITE_REQ);
		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
		priv->s_tid_cur = qp->s_cur;
		if (++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;

	case TID_OP(READ_RESP):
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
			goto bail;
@@ -965,7 +1166,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
	}
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0)
	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -998,6 +1200,12 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
bail_no_tx:
	ps->s_txreq = NULL;
	qp->s_flags &= ~RVT_S_BUSY;
	/*
	 * If we didn't get a txreq, the QP will be woken up later to try
	 * again. Set the flags to indicate which work item to wake
	 * up.
	 */
	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
	return 0;
}

@@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
	lockdep_assert_held(&qp->s_lock);
	qp->s_cur = n;
	priv->pending_tid_r_segs = 0;
	priv->pending_tid_w_resp = 0;
	qp->s_num_rd_atomic = 0;

	/*
@@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_TID_RDMA_WRITE:
		qp->s_state = TID_OP(WRITE_RESP);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;
@@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)

	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
			 RVT_S_WAIT_ACK);
			 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
	if (wait)
		qp->s_flags |= RVT_S_SEND_ONE;
	reset_psn(qp, psn);
@@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 * This would be psn+1 except when RDMA reads or TID RDMA ops
 * are present.
 */
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
		wqe = rvt_get_swqe_ptr(qp, n);
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
			    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
			    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
			    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
@@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
	struct rvt_swqe *wqe;
	struct ib_header *hdr = NULL;
	struct hfi1_16b_header *hdr_16b = NULL;
	u32 opcode;
	u32 opcode, head, tail;
	u32 psn;
	struct tid_rdma_request *req;

	lockdep_assert_held(&qp->s_lock);
	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1507,20 +1723,56 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
	opcode = ib_bth_get_opcode(ohdr);
	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
	    opcode == TID_OP(READ_RESP)) {
	    opcode == TID_OP(READ_RESP) ||
	    opcode == TID_OP(WRITE_RESP)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = ib_bth_get_psn(ohdr);
	/*
	 * Don't attempt to reset the sending PSN for packets in the
	 * KDETH PSN space since the PSN does not match anything.
	 */
	if (opcode != TID_OP(WRITE_DATA) &&
	    opcode != TID_OP(WRITE_DATA_LAST) &&
	    opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
		reset_sending_psn(qp, psn);

	/* Handle TID RDMA WRITE packets differently */
	if (opcode >= TID_OP(WRITE_REQ) &&
	    opcode <= TID_OP(WRITE_DATA_LAST)) {
		head = priv->s_tid_head;
		tail = priv->s_tid_cur;
		/*
		 * s_tid_cur is set to s_tid_head in the case, where
		 * a new TID RDMA request is being started and all
		 * previous ones have been completed.
		 * Therefore, we need to do a secondary check in order
		 * to properly determine whether we should start the
		 * RC timer.
		 */
		wqe = rvt_get_swqe_ptr(qp, tail);
		req = wqe_to_tid_req(wqe);
		if (head == tail && req->comp_seg < req->total_segs) {
			if (tail == 0)
				tail = qp->s_size - 1;
			else
				tail -= 1;
		}
	} else {
		head = qp->s_tail;
		tail = qp->s_acked;
	}

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	if ((psn & IB_BTH_REQ_ACK) && tail != head &&
	    opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
	    opcode != TID_OP(RESYNC) &&
	    !(qp->s_flags &
	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
@@ -1530,6 +1782,25 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
			rvt_add_retry_timer(qp);
	}

	/* Start TID RDMA ACK timer */
	if ((opcode == TID_OP(WRITE_DATA) ||
	     opcode == TID_OP(WRITE_DATA_LAST) ||
	     opcode == TID_OP(RESYNC)) &&
	    (psn & IB_BTH_REQ_ACK) &&
	    !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
		/*
		 * The TID RDMA ACK packet could be received before this
		 * function is called. Therefore, add the timer only if TID
		 * RDMA ACK packets are actually pending.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		req = wqe_to_tid_req(wqe);
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		    req->ack_seg < req->cur_seg)
			hfi1_add_tid_retry_timer(qp);
	}

	while (qp->s_last != qp->s_acked) {
		u32 s_last;

@@ -1628,6 +1899,15 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
	}

	qp->s_retry = qp->s_retry_cnt;
	/*
	 * Don't update the last PSN if the request being completed is
	 * a TID RDMA WRITE request.
	 * Completion of the TID RDMA WRITE requests are done by the
	 * TID RDMA ACKs and as such could be for a request that has
	 * already been ACKed as far as the IB state machine is
	 * concerned.
	 */
	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		update_last_psn(qp, wqe->lpsn);

	/*
@@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
	return wqe;
}

static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
{
	/* Retry this request. */
	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
		qp->r_flags |= RVT_R_RDMAR_SEQ;
		hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
		if (list_empty(&qp->rspwait)) {
			qp->r_flags |= RVT_R_RSP_SEND;
			rvt_get_qp(qp);
			list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
		}
	}
}

/**
 * update_qp_retry_state - Update qp retry state.
 * @qp: the QP
 * @psn: the packet sequence number of the TID RDMA WRITE RESP.
 * @spsn:  The start psn for the given TID RDMA WRITE swqe.
 * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
 *
 * This function is called to update the qp retry state upon
 * receiving a TID WRITE RESP after the qp is scheduled to retry
 * a request.
 */
static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
				  u32 lpsn)
{
	struct hfi1_qp_priv *qpriv = qp->priv;

	qp->s_psn = psn + 1;
	/*
	 * If this is the first TID RDMA WRITE RESP packet for the current
	 * request, change the s_state so that the retry will be processed
	 * correctly. Similarly, if this is the last TID RDMA WRITE RESP
	 * packet, change the s_state and advance the s_cur.
	 */
	if (cmp_psn(psn, lpsn) >= 0) {
		qp->s_cur = qpriv->s_tid_cur + 1;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_state = TID_OP(WRITE_REQ);
	} else  if (!cmp_psn(psn, spsn)) {
		qp->s_cur = qpriv->s_tid_cur;
		qp->s_state = TID_OP(WRITE_RESP);
	}
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
@@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
	int ret = 0;
	u32 ack_psn;
	int diff;
	struct rvt_dev_info *rdi;

	lockdep_assert_held(&qp->s_lock);
	/*
@@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/* Retry this request. */
			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
				qp->r_flags |= RVT_R_RDMAR_SEQ;
				hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
				if (list_empty(&qp->rspwait)) {
					qp->r_flags |= RVT_R_RSP_SEND;
					rvt_get_qp(qp);
					list_add_tail(&qp->rspwait,
						      &rcd->qp_wait_list);
				}
			}
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
		    (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		     (delta_psn(psn, qp->s_last_psn) != 1))) {
			set_restart_qp(qp, rcd);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
@@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
				hfi1_schedule_send(qp);
			}
		}

		/*
		 * TID RDMA WRITE requests will be completed by the TID RDMA
		 * ACK packet handler (see tid_rdma.c).
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			break;

		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
@@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
			else
				rvt_stop_rc_timers(qp);
		} else if (qp->s_acked != qp->s_tail) {
			struct rvt_swqe *__w = NULL;

			if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
				__w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);

			/*
			 * Stop timers if we've received all of the TID RDMA
			 * WRITE * responses.
			 */
			if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
			    opcode == TID_OP(WRITE_RESP)) {
				/*
				 * Normally, the loop above would correctly
				 * process all WQEs from s_acked onward and
				 * either complete them or check for correct
				 * PSN sequencing.
				 * However, for TID RDMA, due to pipelining,
				 * the response may not be for the request at
				 * s_acked so the above look would just be
				 * skipped. This does not allow for checking
				 * the PSN sequencing. It has to be done
				 * separately.
				 */
				if (cmp_psn(psn, qp->s_last_psn + 1)) {
					set_restart_qp(qp, rcd);
					goto bail_stop;
				}
				/*
				 * If the psn is being resent, stop the
				 * resending.
				 */
				if (qp->s_cur != qp->s_tail &&
				    cmp_psn(qp->s_psn, psn) <= 0)
					update_qp_retry_state(qp, psn,
							      __w->psn,
							      __w->lpsn);
				else if (--qpriv->pending_tid_w_resp)
					rvt_mod_retry_timer(qp);
				else
					rvt_stop_rc_timers(qp);
			} else {
				/*
				 * We are expecting more ACKs so
				 * mod the retry timer.
				 */
				rvt_mod_retry_timer(qp);
				/*
			 * We can stop re-sending the earlier packets and
			 * continue with the next packet the receiver wants.
				 * We can stop re-sending the earlier packets
				 * and continue with the next packet the
				 * receiver wants.
				 */
				if (cmp_psn(qp->s_psn, psn) <= 0)
					reset_psn(qp, psn + 1);
			}
		} else {
			/* No more acks - kill all timers */
			rvt_stop_rc_timers(qp);
@@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
		rvt_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If the current request is a TID RDMA WRITE request and the
		 * response is not a TID RDMA WRITE RESP packet, s_last_psn
		 * can't be advanced.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		    opcode != TID_OP(WRITE_RESP) &&
		    cmp_psn(psn, wqe->psn) >= 0)
			return 1;
		update_last_psn(qp, psn);
		return 1;

@@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
			goto bail_stop;
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		if (qp->s_rnr_retry == 0) {
		rdi = ib_to_rvt(qp->ibqp.device);
		if (qp->s_rnr_retry == 0 &&
		    !((rdi->post_parms[wqe->wr.opcode].flags &
		      RVT_OPERATION_IGN_RNR_CNT) &&
		      qp->s_rnr_retry_cnt == 0)) {
			status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
		if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		/*
		 * The last valid PSN is the previous PSN. For TID RDMA WRITE
		 * request, s_last_psn should be incremented only when a TID
		 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
		 * WRITE RESP packets.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
			reset_psn(qp, qp->s_last_psn + 1);
		} else {
			update_last_psn(qp, psn - 1);

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

			reset_psn(qp, psn);
		}

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
		rvt_stop_rc_timers(qp);
		rvt_add_rnr_timer(qp, aeth);
@@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
	while (cmp_psn(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
+14 −0
Original line number Diff line number Diff line
@@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
		do {
			struct hfi1_swqe_priv *priv = wqe->priv;

			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
		} while (!ret);
	}
	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
		struct rvt_ack_entry *e = &qp->s_ack_queue[i];

		if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
			i = 0;
		/* Free only locally allocated TID entries */
		if (e->opcode != TID_OP(WRITE_REQ))
			continue;
		do {
			struct hfi1_ack_priv *priv = e->priv;

			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
		} while (!ret);
	}
+2 −1
Original line number Diff line number Diff line
@@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
+16 −1
Original line number Diff line number Diff line
@@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
 */
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
	[IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
	[IB_WR_SEND] = IB_WC_SEND,
	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
@@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = {
	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
	[IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
	[IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
	/* UC */
	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = {
	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,

	/* TID RDMA has separate handlers for different opcodes.*/
	[IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
	[IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
	[IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
	[IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
	[IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
	[IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
	[IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,

	/* UC */
	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
@@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
	rdi->dparms.props.max_mr_size = U64_MAX;
	rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
	rdi->dparms.props.max_qp = hfi1_max_qps;
	rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
	rdi->dparms.props.max_qp_wr =
		(hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
		 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
	rdi->dparms.props.max_send_sge = hfi1_max_sges;
	rdi->dparms.props.max_recv_sge = hfi1_max_sges;
	rdi->dparms.props.max_sge_rd = hfi1_max_sges;
Loading