Commit 383d9f87 authored by Jakub Kicinski

Merge branch 'net-core-use-a-dedicated-kmem_cache-for-skb-head-allocs'

Eric Dumazet says:

====================
net: core: use a dedicated kmem_cache for skb head allocs

Our profile data show that using kmalloc(non_const_size)/kfree(ptr)
has a certain cost, because kfree(ptr) has to pull a 'struct page'
into cpu caches.

Using a dedicated kmem_cache for TCP skb->head allocations makes
a difference, both in cpu cycles and memory savings.

This kmem_cache could also be used for GRO skb allocations;
this is left as a future exercise.
====================

Link: https://lore.kernel.org/r/20230206173103.2617121-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 61d731e6 bf9f1baa
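
Before the diffs, a minimal sketch of the pattern the series applies, assuming a hypothetical cache name ("my_head_cache") and an illustrative 640-byte object size; it is not code from the patch. A dedicated kmem_cache names the cache explicitly on both the alloc and the free side, whereas kfree() must first map the pointer back to its slab through 'struct page':

/* Minimal sketch of the dedicated-cache pattern. The cache name and
 * the 640-byte object size are assumptions made for illustration.
 */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/slab.h>

static struct kmem_cache *my_head_cache __ro_after_init;

static int __init my_head_cache_init(void)
{
	my_head_cache = kmem_cache_create("my_head_cache", 640, 0,
					  SLAB_HWCACHE_ALIGN, NULL);
	return my_head_cache ? 0 : -ENOMEM;
}

/* Alloc and free both name the cache directly; kfree() would instead
 * have to derive the owning slab from the pointer via 'struct page'.
 */
static void *my_head_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(my_head_cache, gfp);
}

static void my_head_free(void *head)
{
	kmem_cache_free(my_head_cache, head);
}
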
include/linux/skbuff.h: +8 −0
@@ -255,6 +255,14 @@
#define SKB_DATA_ALIGN(X)	ALIGN(X, SMP_CACHE_BYTES)
#define SKB_WITH_OVERHEAD(X)	\
	((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+/* For X bytes available in skb->head, what is the minimal
+ * allocation needed, knowing struct skb_shared_info needs
+ * to be aligned.
+ */
+#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
+	SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
#define SKB_MAX_ORDER(X, ORDER) \
	SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
#define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
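
For a concrete feel of the new macro, the userspace sketch below re-implements the arithmetic with assumed numbers: 64-byte cache lines and a 320-byte struct skb_shared_info (plausible x86_64 values, not taken from this commit):

/* Userspace sketch of SKB_HEAD_ALIGN(); both constants below are
 * assumed values chosen for illustration only.
 */
#include <stdio.h>

#define SMP_CACHE_BYTES	64	/* assumed cache line size */
#define SHINFO_SIZE	320	/* assumed sizeof(struct skb_shared_info) */

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define SKB_DATA_ALIGN(X)	ALIGN(X, SMP_CACHE_BYTES)
#define SKB_HEAD_ALIGN(X)	(SKB_DATA_ALIGN(X) + SKB_DATA_ALIGN(SHINFO_SIZE))

int main(void)
{
	/* 200 payload bytes round up to 256, plus 320 for the aligned
	 * shared info: the minimal allocation is 576 bytes.
	 */
	printf("SKB_HEAD_ALIGN(200) = %d\n", SKB_HEAD_ALIGN(200));
	return 0;
}
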
net/core/skbuff.c: +82 −33
@@ -89,6 +89,34 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
+
+/* skb_small_head_cache and related code is only supported
+ * for CONFIG_SLAB and CONFIG_SLUB.
+ * As soon as SLOB is removed from the kernel, we can clean up this.
+ */
+#if !defined(CONFIG_SLOB)
+# define HAVE_SKB_SMALL_HEAD_CACHE 1
+#endif
+
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE					\
+	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
+		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
+		SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM						\
+	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
+
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

@@ -478,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb);
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
-static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
-	void *obj;
	bool ret_pfmemalloc = false;
+	unsigned int obj_size;
+	void *obj;

+	obj_size = SKB_HEAD_ALIGN(*size);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+
+		/* skb_small_head_cache has non power of two size,
+		 * likely forcing SLUB to use order-3 pages.
+		 * We deliberately attempt a NOMEMALLOC allocation only.
+		 */
+		obj = kmem_cache_alloc_node(skb_small_head_cache,
+				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+				node);
+		if (obj) {
+			*size = SKB_SMALL_HEAD_CACHE_SIZE;
+			goto out;
+		}
+	}
+#endif
+	*size = obj_size = kmalloc_size_roundup(obj_size);
	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
-	obj = kmalloc_node_track_caller(size,
+	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
@@ -496,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node,

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
-	obj = kmalloc_node_track_caller(size, flags, node);
+	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
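
Both stages of the rewritten kmalloc_reserve() are visible in the two hunks above: a first attempt that never dips into memory reserves (the dedicated-cache attempt stops there by design), then a retry with the caller's full flags once gfp_pfmemalloc_allowed() says reserves are permitted. A condensed sketch of the pattern; my_alloc() is a hypothetical stand-in, not code from this patch:

/* Two-stage allocation: try without reserves first, then retry with
 * the caller's flags only if pfmemalloc reserves are allowed.
 */
#include <linux/gfp.h>
#include <linux/slab.h>

static void *my_alloc(size_t size, gfp_t flags, bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	void *obj;

	/* Stage 1: never dip into reserves, and fail silently. */
	obj = kmalloc(size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (obj || !gfp_pfmemalloc_allowed(flags))
		goto out;

	/* Stage 2: the caller is entitled to pfmemalloc reserves. */
	ret_pfmemalloc = true;
	obj = kmalloc(size, flags);
out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;
	return obj;
}
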
@@ -533,7 +581,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
-	unsigned int osize;
	bool pfmemalloc;
	u8 *data;

@@ -558,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	osize = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
+	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
-	size = SKB_WITH_OVERHEAD(osize);
-	prefetchw(data + size);
+	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
@@ -577,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
-	__build_skb_around(skb, data, osize);
+	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
@@ -632,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
		goto skb_success;
	}

-	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	len = SKB_DATA_ALIGN(len);
+	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;
@@ -732,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
-		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-		len = SKB_DATA_ALIGN(len);
+		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
@@ -809,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
	return page_pool_return_skb_page(virt_to_page(data));
}

+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+		kmem_cache_free(skb_small_head_cache, head);
+	else
+#endif
+		kfree(head);
+}
+
static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;
@@ -818,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb)
			return;
		skb_free_frag(head);
	} else {
-		kfree(head);
+		skb_kfree_head(head, skb_end_offset(skb));
	}
}

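skb_free_head() can hand skb_end_offset(skb) to skb_kfree_head() only because of the size-uniqueness trick above: an end offset equal to SKB_SMALL_HEAD_HEADROOM can only come from skb_small_head_cache. A userspace analogue of dispatching a free on a recorded size, with an assumed sentinel value:

/* Userspace analogue: the usable size doubles as an ownership tag.
 * SMALL_HEAD_HEADROOM is an assumed stand-in for the real constant.
 */
#include <stdio.h>
#include <stdlib.h>

#define SMALL_HEAD_HEADROOM	768	/* assumed unique usable size */

static void head_free(void *head, unsigned int end_offset)
{
	if (end_offset == SMALL_HEAD_HEADROOM)
		puts("would go back to the dedicated cache");
	else
		puts("would go back to the general allocator");
	free(head);
}

int main(void)
{
	head_free(malloc(768), 768);	/* dedicated-cache head */
	head_free(malloc(1024), 1024);	/* kmalloc-style head */
	return 0;
}
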
@@ -1938,10 +1989,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;

-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(size);
@@ -2004,7 +2052,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
	return 0;

nofrags:
-	kfree(data);
+	skb_kfree_head(data, size);
nodata:
	return -ENOMEM;
}
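
pskb_expand_head() now follows the same convention as __alloc_skb(): the head size goes into kmalloc_reserve() by pointer, comes back as the size actually obtained, and error paths free through the size-aware helper. A hypothetical caller (my_resize_head(), written as if inside net/core/skbuff.c after this patch) illustrating that contract:

/* Hypothetical caller of the new kmalloc_reserve() convention; not
 * code from this patch.
 */
static int my_resize_head(unsigned int needed, gfp_t gfp_mask)
{
	unsigned int size = needed;
	u8 *data;

	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	/* 'size' now holds the real allocation; usable head room ends
	 * where the skb_shared_info begins.
	 */
	size = SKB_WITH_OVERHEAD(size);

	/* On failure, free with the size the head was built with. */
	skb_kfree_head(data, size);
	return 0;
}
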
@@ -4641,6 +4689,13 @@ void __init skb_init(void)
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	skb_small_head_cache = kmem_cache_create("skbuff_small_head",
+						SKB_SMALL_HEAD_CACHE_SIZE,
+						0,
+						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+						NULL);
+#endif
	skb_extensions_init();
}

@@ -6289,10 +6344,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;

-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;
	size = SKB_WITH_OVERHEAD(size);
@@ -6308,7 +6360,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
-			kfree(data);
+			skb_kfree_head(data, size);
			return -ENOMEM;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6408,10 +6460,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;

-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;
	size = SKB_WITH_OVERHEAD(size);
@@ -6419,7 +6468,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
	if (skb_orphan_frags(skb, gfp_mask)) {
-		kfree(data);
+		skb_kfree_head(data, size);
		return -ENOMEM;
	}
	shinfo = (struct skb_shared_info *)(data + size);
@@ -6455,7 +6504,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
		if (skb_has_frag_list(skb))
			kfree_skb_list(skb_shinfo(skb)->frag_list);
-		kfree(data);
+		skb_kfree_head(data, size);
		return -ENOMEM;
	}
	skb_release_data(skb, SKB_CONSUMED);