Commit 3898f52c authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'net-smc-virt-contig-buffers'



Wen Gu says:

====================
net/smc: Introduce virtually contiguous buffers for SMC-R

On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.

When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which can cause unexpected hangs and
further stability risks.

So this patch set aims to allow SMC-R link groups to use virtually
contiguous sndbufs and RMBs to avoid the potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
via the sysctl smcr_buf_type.

Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:

1) regression in the data path, which is caused by the additional address
   translation of the sndbuf by the RNIC on Tx. In general, however,
   translating addresses through the MTT is fast. According to qperf tests,
   this part of the regression is generally less than 10% in latency and
   bandwidth. (see patch 5/6 for details)

2) regression in the buffer initialization and destruction path, which is
   caused by the additional MR operations on sndbufs. Thanks to the link
   group buffer reuse mechanism, however, the impact of this kind of
   regression decreases as the number of buffer reuses increases.

Patch set overview:
- Patches 1/6 and 2/6 are mainly about simplifying and optimizing the DMA
  sync operation, which reduces overhead on the data path, especially
  when using virtually contiguous buffers;
- Patches 3/6 and 4/6 introduce a sysctl smcr_buf_type to set the type
  of buffers in newly created link groups;
- Patch 5/6 allows SMC-R to use virtually contiguous sndbufs and RMBs,
  including buffer creation, destruction, MR operation and access;
- Patch 6/6 extends the netlink attributes with the buffer type of an
  SMC-R link group;

v1->v2:
- Patch 5/6 fixes build issue on 32bit;
- Patch 3/6 adds description of new sysctl in smc-sysctl.rst;
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 2acd1022 ddefb2d2
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -21,3 +21,16 @@ autocorking_size - INTEGER
	know how/when to uncork their sockets.

	Default: 64K

smcr_buf_type - INTEGER
        Controls which type of sndbufs and RMBs to use in SMC-R link groups
        created after the value is set. Only for SMC-R.

        Default: 0 (physically contiguous sndbufs and RMBs)

        Possible values:

        - 0 - Use physically contiguous buffers
        - 1 - Use virtually contiguous buffers
        - 2 - Mixed use of the two types. Try physically contiguous buffers first.
          If not available, use virtually contiguous buffers then.
+1 −0
Original line number Diff line number Diff line
@@ -18,5 +18,6 @@ struct netns_smc {
	struct ctl_table_header		*smc_hdr;
#endif
	unsigned int			sysctl_autocorking_size;
	unsigned int			sysctl_smcr_buf_type;
};
#endif
+1 −0
Original line number Diff line number Diff line
@@ -124,6 +124,7 @@ enum {
	SMC_NLA_LGR_R_V2,		/* nest */
	SMC_NLA_LGR_R_NET_COOKIE,	/* u64 */
	SMC_NLA_LGR_R_PAD,		/* flag */
	SMC_NLA_LGR_R_BUF_TYPE,		/* u8 */
	__SMC_NLA_LGR_R_MAX,
	SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1
};
+58 −10
Original line number Diff line number Diff line
@@ -487,6 +487,29 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* Register a new vzalloced sndbuf on every active link of the link group
 * that @link belongs to.
 *
 * Returns -EINVAL when @snd_desc is not flagged is_vm, otherwise 0 or the
 * first non-zero result reported by smcr_link_reg_buf(); no further links
 * are tried once a registration has failed.
 */
static int smcr_lgr_reg_sndbufs(struct smc_link *link,
				struct smc_buf_desc *snd_desc)
{
	struct smc_link_group *lgr = link->lgr;
	struct smc_link *lnk;
	int rc = 0;
	int idx;

	if (!snd_desc->is_vm)
		return -EINVAL;

	/* llc_conf_mutex protects against parallel smcr_link_reg_buf() */
	mutex_lock(&lgr->llc_conf_mutex);
	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX && !rc; idx++) {
		lnk = &lgr->lnk[idx];
		/* inactive links are skipped and leave rc untouched */
		if (smc_link_active(lnk))
			rc = smcr_link_reg_buf(lnk, snd_desc);
	}
	mutex_unlock(&lgr->llc_conf_mutex);

	return rc;
}

/* register the new rmb on all links */
static int smcr_lgr_reg_rmbs(struct smc_link *link,
			     struct smc_buf_desc *rmb_desc)
@@ -498,13 +521,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link,
	if (rc)
		return rc;
	/* protect against parallel smc_llc_cli_rkey_exchange() and
	 * parallel smcr_link_reg_rmb()
	 * parallel smcr_link_reg_buf()
	 */
	mutex_lock(&lgr->llc_conf_mutex);
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_active(&lgr->lnk[i]))
			continue;
		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
		rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
		if (rc)
			goto out;
	}
@@ -550,8 +573,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc)

	smc_wr_remember_qp_attr(link);

	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;
	/* reg the sndbuf if it was vzalloced */
	if (smc->conn.sndbuf_desc->is_vm) {
		if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
			return SMC_CLC_DECL_ERR_REGBUF;
	}

	/* reg the rmb */
	if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGBUF;

	/* confirm_rkey is implicit on 1st contact */
	smc->conn.rmb_desc->is_conf_rkey = true;
@@ -1221,12 +1251,18 @@ static int smc_connect_rdma(struct smc_sock *smc,
			goto connect_abort;
		}
	} else {
		/* reg sendbufs if they were vzalloced */
		if (smc->conn.sndbuf_desc->is_vm) {
			if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
				reason_code = SMC_CLC_DECL_ERR_REGBUF;
				goto connect_abort;
			}
		}
		if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
			reason_code = SMC_CLC_DECL_ERR_REGRMB;
			reason_code = SMC_CLC_DECL_ERR_REGBUF;
			goto connect_abort;
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	if (aclc->hdr.version > SMC_V1) {
		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
@@ -1750,8 +1786,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc)
	struct smc_llc_qentry *qentry;
	int rc;

	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;
	/* reg the sndbuf if it was vzalloced*/
	if (smc->conn.sndbuf_desc->is_vm) {
		if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
			return SMC_CLC_DECL_ERR_REGBUF;
	}

	/* reg the rmb */
	if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGBUF;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
@@ -2110,10 +2153,15 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
	struct smc_connection *conn = &new_smc->conn;

	if (!local_first) {
		/* reg sendbufs if they were vzalloced */
		if (conn->sndbuf_desc->is_vm) {
			if (smcr_lgr_reg_sndbufs(conn->lnk,
						 conn->sndbuf_desc))
				return SMC_CLC_DECL_ERR_REGBUF;
		}
		if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
			return SMC_CLC_DECL_ERR_REGRMB;
			return SMC_CLC_DECL_ERR_REGBUF;
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}
+5 −3
Original line number Diff line number Diff line
@@ -1034,7 +1034,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
		       ETH_ALEN);
		hton24(clc->r0.qpn, link->roce_qp->qp_num);
		clc->r0.rmb_rkey =
			htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
			htonl(conn->rmb_desc->mr[link->link_idx]->rkey);
		clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
		clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
		switch (clc->hdr.type) {
@@ -1046,7 +1046,9 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
			break;
		}
		clc->r0.rmbe_size = conn->rmbe_size_short;
		clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
		clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
			cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
			cpu_to_be64((u64)sg_dma_address
				    (conn->rmb_desc->sgt[link->link_idx].sgl));
		hton24(clc->r0.psn, link->psn_initial);
		if (version == SMC_V1) {
Loading