Commit 7282c126 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'smc-datapath-opts'



Dust Li says:

====================
net/smc: some datapath performance optimizations

This series tries to improve the performance of SMC in datapath.

- patch #1, add sysctl interface to support tuning the behaviour of
  SMC in container environment.

- patch #2/#3, add autocorking support which is very efficient for small
  messages without trade-off for latency.

- patch #4, send directly on setting TCP_NODELAY, without wake up the
  TX worker, this make it consistent with clearing TCP_CORK.

- patch #5, this correct the setting of RMB window update limit, so
  we don't send CDC messages to update peer's RMB window too frequently
  in some cases.

- patch #6, implemented something like NAPI in SMC, decrease the number
  of hardirq when busy.

- patch #7, this moves TX work doing in the BH to the user context when
  sock_lock is hold by user.

With this patchset applied, we can get a good performance gain:
- qperf tcp_bw test has shown a great improvement. Other benchmarks like
  'netperf TCP_STREAM' or 'sockperf throughput' has similar result.
- In my testing environment, running qperf tcp_bw and tcp_lat, SMC behaves
  better then TCP in most all message size.

Here are some test results with the following testing command:
client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \
		-t 30 -vu tcp_{bw|lat}
server: smc_run taskset -c 1 qperf

==== Bandwidth ====
 MsgSize        Origin SMC              TCP                SMC with patches
       1         0.578 MB/s      2.392 MB/s(313.57%)      2.561 MB/s(342.83%)
       2         1.159 MB/s      4.780 MB/s(312.53%)      5.162 MB/s(345.46%)
       4         2.283 MB/s     10.266 MB/s(349.77%)     10.122 MB/s(343.46%)
       8         4.668 MB/s     19.040 MB/s(307.86%)     20.521 MB/s(339.59%)
      16         9.147 MB/s     38.904 MB/s(325.31%)     40.823 MB/s(346.29%)
      32        18.369 MB/s     79.587 MB/s(333.25%)     80.535 MB/s(338.42%)
      64        36.562 MB/s    148.668 MB/s(306.61%)    158.170 MB/s(332.60%)
     128        72.961 MB/s    274.913 MB/s(276.80%)    316.217 MB/s(333.41%)
     256       144.705 MB/s    512.059 MB/s(253.86%)    626.019 MB/s(332.62%)
     512       288.873 MB/s    884.977 MB/s(206.35%)   1221.596 MB/s(322.88%)
    1024       574.180 MB/s   1337.736 MB/s(132.98%)   2203.156 MB/s(283.70%)
    2048      1095.192 MB/s   1865.952 MB/s( 70.38%)   3036.448 MB/s(177.25%)
    4096      2066.157 MB/s   2380.337 MB/s( 15.21%)   3834.271 MB/s( 85.58%)
    8192      3717.198 MB/s   2733.073 MB/s(-26.47%)   4904.910 MB/s( 31.95%)
   16384      4742.221 MB/s   2958.693 MB/s(-37.61%)   5220.272 MB/s( 10.08%)
   32768      5349.550 MB/s   3061.285 MB/s(-42.77%)   5321.865 MB/s( -0.52%)
   65536      5162.919 MB/s   3731.408 MB/s(-27.73%)   5245.021 MB/s(  1.59%)
==== Latency ====
 MsgSize        Origin SMC              TCP                SMC with patches
       1        10.540 us     11.938 us( 13.26%)         10.356 us( -1.75%)
       2        10.996 us     11.992 us(  9.06%)         10.073 us( -8.39%)
       4        10.229 us     11.687 us( 14.25%)          9.996 us( -2.28%)
       8        10.203 us     11.653 us( 14.21%)         10.063 us( -1.37%)
      16        10.530 us     11.313 us(  7.44%)         10.013 us( -4.91%)
      32        10.241 us     11.586 us( 13.13%)         10.081 us( -1.56%)
      64        10.693 us     11.652 us(  8.97%)          9.986 us( -6.61%)
     128        10.597 us     11.579 us(  9.27%)         10.262 us( -3.16%)
     256        10.409 us     11.957 us( 14.87%)         10.148 us( -2.51%)
     512        11.088 us     12.505 us( 12.78%)         10.206 us( -7.95%)
    1024        11.240 us     12.255 us(  9.03%)         10.631 us( -5.42%)
    2048        11.485 us     16.970 us( 47.76%)         10.981 us( -4.39%)
    4096        12.077 us     13.948 us( 15.49%)         11.847 us( -1.90%)
    8192        13.683 us     16.693 us( 22.00%)         13.336 us( -2.54%)
   16384        16.470 us     23.615 us( 43.38%)         16.519 us(  0.30%)
   32768        22.540 us     40.966 us( 81.75%)         22.452 us( -0.39%)
   65536        34.192 us     73.003 us(113.51%)         33.916 us( -0.81%)

------------
Test environment notes:
1. Testing is run on 2 VMs within the same physical host
2. The NIC is ConnectX-4Lx, using SRIOV, and passing through 2 VFs to the
   2 VMs respectively.
3. To decrease jitter, VM's vCPU are binded to each physical CPU, and those
   physical CPUs are all isolated using boot parameter `isolcpus=xxx`
4. The queue number are set to 1, and interrupt from the queue is binded to
   CPU0 in the guest
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1e385c08 6b88af83
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line
.. SPDX-License-Identifier: GPL-2.0

=========
SMC Sysctl
=========

/proc/sys/net/smc/* Variables
==============================

autocorking_size - INTEGER
	Setting SMC auto corking size:
	SMC auto corking is like TCP auto corking from the application's
	perspective of view. When applications do consecutive small
	write()/sendmsg() system calls, we try to coalesce these small writes
	as much as possible, to lower total amount of CDC and RDMA Write been
	sent.
	autocorking_size limits the maximum corked bytes that can be sent to
	the under device in 1 single sending. If set to 0, the SMC auto corking
	is disabled.
	Applications can still use TCP_CORK for optimal behavior when they
	know how/when to uncork their sockets.

	Default: 64K
+4 −0
Original line number Diff line number Diff line
@@ -14,5 +14,9 @@ struct netns_smc {
	struct smc_stats_rsn		*fback_rsn;

	bool				limit_smc_hs;	/* constraint on handshake */
#ifdef CONFIG_SYSCTL
	struct ctl_table_header		*smc_hdr;
#endif
	unsigned int			sysctl_autocorking_size;
};
#endif
+1 −1
Original line number Diff line number Diff line
@@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o
smc-y += smc_tracepoint.o smc_sysctl.o
+28 −2
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"

static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
						 * creation on server
@@ -192,12 +193,27 @@ void smc_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

/* This will be called before user really release sock_lock. So do the
 * work which we didn't do because of user hold the sock_lock in the
 * BH context
 */
static void smc_release_cb(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	if (smc->conn.tx_in_release_sock) {
		smc_tx_pending(&smc->conn);
		smc->conn.tx_in_release_sock = false;
	}
}

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.release_cb	= smc_release_cb,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
@@ -210,6 +226,7 @@ struct proto smc_proto6 = {
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.release_cb	= smc_release_cb,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
@@ -2795,8 +2812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
		    sk->sk_state != SMC_CLOSED) {
			if (val) {
				SMC_STAT_INC(smc, ndly_cnt);
				mod_delayed_work(smc->conn.lgr->tx_wq,
						 &smc->conn.tx_work, 0);
				smc_tx_pending(&smc->conn);
				cancel_delayed_work(&smc->conn.tx_work);
			}
		}
		break;
@@ -3273,9 +3290,17 @@ static int __init smc_init(void)
		goto out_sock;
	}

	rc = smc_sysctl_init();
	if (rc) {
		pr_err("%s: sysctl_init fails with %d\n", __func__, rc);
		goto out_ulp;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_ulp:
	tcp_unregister_ulp(&smc_ulp_ops);
out_sock:
	sock_unregister(PF_SMC);
out_proto6:
@@ -3303,6 +3328,7 @@ static int __init smc_init(void)
static void __exit smc_exit(void)
{
	static_branch_disable(&tcp_have_smc);
	smc_sysctl_exit();
	tcp_unregister_ulp(&smc_ulp_ops);
	sock_unregister(PF_SMC);
	smc_core_exit();
+6 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#define SMC_MAX_ISM_DEVS	8	/* max # of proposed non-native ISM
					 * devices
					 */
#define SMC_AUTOCORKING_DEFAULT_SIZE	0x10000	/* 64K by default */

extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -192,6 +193,7 @@ struct smc_connection {
						 * - dec on polled tx cqe
						 */
	wait_queue_head_t	cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
	atomic_t		tx_pushing;     /* nr_threads trying tx push */
	struct delayed_work	tx_work;	/* retry of smc_cdc_msg_send */
	u32			tx_off;		/* base offset in peer rmb */

@@ -211,6 +213,10 @@ struct smc_connection {
						 * data still pending
						 */
	char			urg_rx_byte;	/* urgent byte */
	bool			tx_in_release_sock;
						/* flush pending tx data in
						 * sock release_cb()
						 */
	atomic_t		bytes_to_rcv;	/* arrived data,
						 * not yet received
						 */
Loading