Commit 32e09298 authored by Jens Axboe
Browse files

Merge branch 'io_uring-zerocopy-send' of...

Merge branch 'io_uring-zerocopy-send' of git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux into for-5.20/io_uring-zerocopy-send

Merge the prep net series for io_uring tx zc from Jakub's tree.

* 'io_uring-zerocopy-send' of git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux:
  net: fix uninitialised msghdr->sg_from_iter
  tcp: support externally provided ubufs
  ipv6/udp: support externally provided ubufs
  ipv4/udp: support externally provided ubufs
  net: introduce __skb_fill_page_desc_noacc
  net: introduce managed frags infrastructure
  net: Allow custom iter handler in msghdr
  skbuff: carry external ubuf_info in msghdr
  skbuff: add SKBFL_DONT_ORPHAN flag
  skbuff: don't mix ubuf_info from different sources
  ipv6: avoid partial copy for zc
  ipv4: avoid partial copy for zc
parents 32346491 2829a267
Loading
Loading
Loading
Loading
+48 −18
Original line number Diff line number Diff line
@@ -686,10 +686,18 @@ enum {
	 * charged to the kernel memory.
	 */
	SKBFL_PURE_ZEROCOPY = BIT(2),

	SKBFL_DONT_ORPHAN = BIT(3),

	/* page references are managed by the ubuf_info, so it's safe to
	 * use frags only up until ubuf_info is released
	 */
	SKBFL_MANAGED_FRAG_REFS = BIT(4),
};

#define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
				 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)

/*
 * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
			   bool success);

int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			    struct iov_iter *from, size_t length);
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length);

static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
					  struct msghdr *msg, int len)
{
	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
}

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
@@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}

/* Report whether SKBFL_MANAGED_FRAG_REFS is set in the skb's shared info
 * flags, i.e. whether the frag page references are managed by the ubuf_info
 * rather than by the skb itself.
 */
static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
	return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
}

static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
				       const struct sk_buff *skb2)
{
@@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
	}
}

/* Slow path of skb_zcopy_downgrade_managed(); not inlined. */
void __skb_zcopy_downgrade_managed(struct sk_buff *skb);

/* Downgrade an skb whose frag references are managed (see
 * SKBFL_MANAGED_FRAG_REFS). Does nothing when the flag is not set; the
 * managed case is hinted as unlikely and delegated to
 * __skb_zcopy_downgrade_managed().
 */
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_managed(skb)))
		__skb_zcopy_downgrade_managed(skb);
}

static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
	skb->next = NULL;
@@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
	return skb_headlen(skb) + __skb_pagelen(skb);
}

/* Initialise paged fragment slot @i of @shinfo with @page, @off and @size.
 *
 * This helper only writes the fragment descriptor: it takes no skb, so it
 * neither inspects page_is_pfmemalloc() nor touches skb->pfmemalloc, and it
 * does not take a reference on @page. Callers that need pfmemalloc
 * propagation go through __skb_fill_page_desc(), which does it after
 * calling this helper.
 */
static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
					      int i, struct page *page,
					      int off, int size)
{
	skb_frag_t *slot = shinfo->frags + i;

	slot->bv_page = page;
	slot->bv_offset = off;
	skb_frag_size_set(slot, size);
}

/**
 * __skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
@@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
					struct page *page, int off, int size)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	/*
	 * Propagate page pfmemalloc to the skb if we can. The problem is
	 * that not all callers have unique ownership of the page but rely
	 * on page_is_pfmemalloc doing the right thing(tm).
	 */
	frag->bv_page		  = page;
	frag->bv_offset		  = off;
	skb_frag_size_set(frag, size);

	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
	page = compound_head(page);
	if (page_is_pfmemalloc(page))
		skb->pfmemalloc	= true;
@@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
	if (likely(!skb_zcopy(skb)))
		return 0;
	if (!skb_zcopy_is_nouarg(skb) &&
	    skb_uarg(skb)->callback == msg_zerocopy_callback)
	if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
		return 0;
	return skb_copy_ubufs(skb, gfp_mask);
}
@@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (!skb_zcopy_managed(skb))
		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

/**
+5 −0
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ struct file;
struct pid;
struct cred;
struct socket;
struct sock;
struct sk_buff;

#define __sockaddr_check_size(size)	\
	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -69,6 +71,9 @@ struct msghdr {
	unsigned int	msg_flags;	/* flags on received message */
	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
	struct ubuf_info *msg_ubuf;
	int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
			    struct iov_iter *from, size_t length);
};

struct user_msghdr {
+1 −0
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@ int __get_compat_msghdr(struct msghdr *kmsg,
		return -EMSGSIZE;

	kmsg->msg_iocb = NULL;
	kmsg->msg_ubuf = NULL;
	*ptr = msg.msg_iov;
	*len = msg.msg_iovlen;
	return 0;
+10 −4
Original line number Diff line number Diff line
@@ -613,10 +613,16 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			    struct iov_iter *from, size_t length)
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length)
{
	int frag = skb_shinfo(skb)->nr_frags;
	int frag;

	if (msg && msg->msg_ubuf && msg->sg_from_iter)
		return msg->sg_from_iter(sk, skb, from, length);

	frag = skb_shinfo(skb)->nr_frags;

	while (length && iov_iter_count(from)) {
		struct page *pages[MAX_SKB_FRAGS];
@@ -702,7 +708,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

+33 −4
Original line number Diff line number Diff line
@@ -666,11 +666,18 @@ static void skb_release_data(struct sk_buff *skb)
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

@@ -895,8 +902,11 @@ EXPORT_SYMBOL(skb_dump);
 */
void skb_tx_error(struct sk_buff *skb)
{
	/* A NULL skb is tolerated and simply ignored. */
	if (!skb)
		return;

	/* Downgrade managed frag references (if any) before the zerocopy
	 * state is cleared; true is passed as zerocopy_success.
	 */
	skb_zcopy_downgrade_managed(skb);
	skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
@@ -1193,7 +1203,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	uarg->flags = SKBFL_ZEROCOPY_FRAG;
	uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
	refcount_set(&uarg->refcnt, 1);
	sock_hold(sk);

@@ -1212,6 +1222,10 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
		u32 bytelen, next;

		/* there might be non MSG_ZEROCOPY users */
		if (uarg->callback != msg_zerocopy_callback)
			return NULL;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
@@ -1354,7 +1368,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
	if (orig_uarg && uarg != orig_uarg)
		return -EEXIST;

	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

@@ -1371,6 +1385,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

/* Turn an skb with managed frag references into a regular one: clear
 * SKBFL_MANAGED_FRAG_REFS and take a reference on each of its frags.
 *
 * Once the flag is cleared, skb_frag_unref() drops frag references for this
 * skb again, so each frag needs a reference of its own here.
 */
void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
	int idx = 0;

	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;

	while (idx < skb_shinfo(skb)->nr_frags) {
		skb_frag_ref(skb, idx);
		idx++;
	}
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
@@ -1688,6 +1712,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

	BUG_ON(skb_shared(skb));

	skb_zcopy_downgrade_managed(skb);

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
@@ -3484,6 +3510,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
	int pos = skb_headlen(skb);
	const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;

	skb_zcopy_downgrade_managed(skb);

	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
	skb_zerocopy_clone(skb1, skb, 0);
	if (len < pos)	/* Split line is inside header. */
@@ -3837,6 +3865,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
	if (skb_can_coalesce(skb, i, page, offset)) {
		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
	} else if (i < MAX_SKB_FRAGS) {
		skb_zcopy_downgrade_managed(skb);
		get_page(page);
		skb_fill_page_desc(skb, i, page, offset, size);
	} else {
Loading