Commit bc9d3a9f authored by Thomas Gleixner, committed by Jakub Kicinski
Browse files

net: dst: Switch to rcuref_t reference counting



Under high contention dst_entry::__refcnt becomes a significant bottleneck.

atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
high retry rates on contention.

Switch the reference count to rcuref_t which results in a significant
performance gain. Rename the reference count member to __rcuref to reflect
the change.

The gain depends on the micro-architecture and the number of concurrent
operations and has been measured in the range of +25% to +130% with a
localhost memtier/memcached benchmark which amplifies the problem
massively.

Running the memtier/memcached benchmark over a real (1Gb) network
connection, the conversion on top of the false sharing fix for struct
dst_entry::__refcnt results in a total gain in the 2%-5% range over the
upstream baseline.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent d288a162
Loading
Loading
Loading
Loading
+10 −9
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>
@@ -61,11 +62,11 @@ struct dst_entry {
	unsigned short		trailer_len;	/* space to reserve at tail */

	/*
	 * __refcnt wants to be on a different cache line from
	 * __rcuref wants to be on a different cache line from
	 * input/output/ops or performance tanks badly
	 */
#ifdef CONFIG_64BIT
	atomic_t		__refcnt;	/* 64-bit offset 64 */
	rcuref_t		__rcuref;	/* 64-bit offset 64 */
#endif
	int			__use;
	unsigned long		lastuse;
@@ -75,16 +76,16 @@ struct dst_entry {
	__u32			tclassid;
#ifndef CONFIG_64BIT
	struct lwtunnel_state   *lwtstate;
	atomic_t		__refcnt;	/* 32-bit offset 64 */
	rcuref_t		__rcuref;	/* 32-bit offset 64 */
#endif
	netdevice_tracker	dev_tracker;

	/*
	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
	 * line on 64bit so that lwtstate does not cause false sharing with
	 * __refcnt under contention of __refcnt. This also puts the
	 * __rcuref under contention of __rcuref. This also puts the
	 * frequently accessed members of rtable and rt6_info out of the
	 * __refcnt cache line.
	 * __rcuref cache line.
	 */
	struct list_head	rt_uncached;
	struct uncached_list	*rt_uncached_list;
@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst)
{
	/*
	 * If your kernel compilation stops here, please check
	 * the placement of __refcnt in struct dst_entry
	 * the placement of __rcuref in struct dst_entry
	 */
	BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
	WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
	BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
	WARN_ON(!rcuref_get(&dst->__rcuref));
}

static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
	return atomic_inc_not_zero(&dst->__refcnt);
	return rcuref_get(&dst->__rcuref);
}

/**
+1 −1
Original line number Diff line number Diff line
@@ -2131,7 +2131,7 @@ sk_dst_get(struct sock *sk)

	rcu_read_lock();
	dst = rcu_dereference(sk->sk_dst_cache);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
	if (dst && !rcuref_get(&dst->__rcuref))
		dst = NULL;
	rcu_read_unlock();
	return dst;
+1 −1
Original line number Diff line number Diff line
@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
{
	struct rtable *rt = &br->fake_rtable;

	atomic_set(&rt->dst.__refcnt, 1);
	rcuref_init(&rt->dst.__rcuref, 1);
	rt->dst.dev = br->dev;
	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
+5 −21
Original line number Diff line number Diff line
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
	dst->tclassid = 0;
#endif
	dst->lwtstate = NULL;
	atomic_set(&dst->__refcnt, initial_ref);
	rcuref_init(&dst->__rcuref, initial_ref);
	dst->__use = 0;
	dst->lastuse = jiffies;
	dst->flags = flags;
@@ -162,32 +162,16 @@ EXPORT_SYMBOL(dst_dev_put);

void dst_release(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
	if (dst && rcuref_put(&dst->__rcuref))
		call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);

void dst_release_immediate(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
	if (dst && rcuref_put(&dst->__rcuref))
		dst_destroy(dst);
}
}
EXPORT_SYMBOL(dst_release_immediate);

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+1 −1
Original line number Diff line number Diff line
@@ -843,7 +843,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
	if (dst) {
		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
		ci.rta_used = dst->__use;
		ci.rta_clntref = atomic_read(&dst->__refcnt);
		ci.rta_clntref = rcuref_read(&dst->__rcuref);
	}
	if (expires) {
		unsigned long clock;
Loading