Commit 6b921536 authored by Eric Dumazet's avatar Eric Dumazet Committed by Greg Kroah-Hartman
Browse files

net: sk_buff rbnode reorg



commit bffa72cf upstream

skb->rbnode shares space with skb->next, skb->prev and skb->tstamp

Current uses (TCP receive ofo queue and netem) need to save/restore
tstamp, while skb->dev is either NULL (TCP) or a constant for a given
queue (netem).

Since we plan using an RB tree for TCP retransmit queue to speedup SACK
processing with large BDP, this patch exchanges skb->dev and
skb->tstamp.

This saves some overhead in both TCP and netem.

v2: removes the swtstamp field from struct tcp_skb_cb

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Wei Wang <weiwan@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 37c7cc80
Loading
Loading
Loading
Loading
+14 −10
Original line number Diff line number Diff line
@@ -662,15 +662,6 @@ struct sk_buff {
			struct sk_buff		*next;
			struct sk_buff		*prev;

			union {
				ktime_t		tstamp;
				u64		skb_mstamp;
			};
		};
		struct rb_node	rbnode; /* used in netem & tcp stack */
	};
	struct sock		*sk;

			union {
				struct net_device	*dev;
				/* Some protocols might use this space to store information,
@@ -678,8 +669,21 @@ struct sk_buff {
				 * UDP receive path is one user.
				 */
				unsigned long		dev_scratch;
			};
		};
		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
		struct list_head	list;
	};

	union {
		struct sock		*sk;
		int			ip_defrag_offset;
	};

	union {
		ktime_t		tstamp;
		u64		skb_mstamp;
	};
	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
+2 −1
Original line number Diff line number Diff line
@@ -75,7 +75,8 @@ struct inet_frag_queue {
	struct timer_list	timer;
	spinlock_t		lock;
	refcount_t		refcnt;
	struct sk_buff		*fragments;
	struct sk_buff		*fragments;  /* Used in IPv6. */
	struct rb_root		rb_fragments; /* Used in IPv4. */
	struct sk_buff		*fragments_tail;
	ktime_t			stamp;
	int			len;
+10 −6
Original line number Diff line number Diff line
@@ -136,12 +136,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
	fp = q->fragments;
	nf = q->net;
	f = nf->f;
	while (fp) {
	if (fp) {
		do {
			struct sk_buff *xp = fp->next;

			sum_truesize += fp->truesize;
			kfree_skb(fp);
			fp = xp;
		} while (fp);
	} else {
		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
	}
	sum = sum_truesize + f->qsize;

+101 −81
Original line number Diff line number Diff line
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
{
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
	const struct iphdr *iph;
	struct sk_buff *head;
	struct sk_buff *head = NULL;
	struct net *net;
	struct ipq *qp;
	int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)

	ipq_kill(qp);
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);

	head = qp->q.fragments;

	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);

	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
	if (!qp->q.flags & INET_FRAG_FIRST_IN)
		goto out;

	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
	if (qp->q.fragments) {
		head = qp->q.fragments;
		qp->q.fragments = head->next;
	} else {
		head = skb_rb_first(&qp->q.rb_fragments);
		if (!head)
			goto out;
		rb_erase(&head->rbnode, &qp->q.rb_fragments);
		memset(&head->rbnode, 0, sizeof(head->rbnode));
		barrier();
	}
	if (head == qp->q.fragments_tail)
		qp->q.fragments_tail = NULL;

	sub_frag_mem_limit(qp->q.net, head->truesize);

	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

	skb_get(head);
	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	kfree_skb(head);
	goto out_rcu_unlock;

out:
	spin_unlock(&qp->q.lock);
out_rcu_unlock:
	rcu_read_unlock();
	if (head)
		kfree_skb(head);
	ipq_put(qp);
}

@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

	rc = qp->q.fragments && (end - start) > max;
	rc = qp->q.fragments_tail && (end - start) > max;

	if (rc) {
		struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;
	unsigned int sum_truesize = 0;

	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
		return -ETIMEDOUT;
	}

	fp = qp->q.fragments;
	do {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		kfree_skb(fp);
		fp = xp;
	} while (fp);
	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
	sub_frag_mem_limit(qp->q.net, sum_truesize);

	qp->q.flags = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
	qp->q.rb_fragments = RB_ROOT;
	qp->q.fragments_tail = NULL;
	qp->iif = 0;
	qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct sk_buff *prev, *next;
	struct rb_node **rbn, *parent;
	struct sk_buff *skb1;
	struct net_device *dev;
	unsigned int fragsize;
	int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = qp->q.fragments_tail;
	if (!prev || prev->ip_defrag_offset < offset) {
		next = NULL;
		goto found;
	}
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (next->ip_defrag_offset >= offset)
			break;	/* bingo! */
		prev = next;
	}
	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();

found:
	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *                          When reassembling an IPv6 datagram, if
	 *   one or more its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * We do the same here for IPv4.
	 * We do the same here for IPv4 (and increment an snmp counter).
	 */

	/* Is there an overlap with the previous fragment? */
	if (prev &&
	    (prev->ip_defrag_offset + prev->len) > offset)
	/* Find out where to put this fragment.  */
	skb1 = qp->q.fragments_tail;
	if (!skb1) {
		/* This is the first fragment we've received. */
		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
		qp->q.fragments_tail = skb;
	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
		/* This is the common/special case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < (skb1->ip_defrag_offset + skb1->len))
			goto discard_qp;

	/* Is there an overlap with the next fragment? */
	if (next && next->ip_defrag_offset < end)
		/* Insert after skb1. */
		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
		qp->q.fragments_tail = skb;
	} else {
		/* Binary search. Note that skb can become the first fragment, but
		 * not the last (covered above). */
		rbn = &qp->q.rb_fragments.rb_node;
		do {
			parent = *rbn;
			skb1 = rb_to_skb(parent);
			if (end <= skb1->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= skb1->ip_defrag_offset + skb1->len)
				rbn = &parent->rb_right;
			else /* Found an overlap with skb1. */
				goto discard_qp;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb. */
		rb_link_node(&skb->rbnode, parent, rbn);
	}
	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);

	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
	dev = skb->dev;
	if (dev)
		qp->iif = dev->ifindex;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
	skb->ip_defrag_offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
	qp->ecn |= ecn;
@@ -414,7 +425,7 @@ found:
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		err = ip_frag_reasm(qp, skb, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}
@@ -431,15 +442,15 @@ err:
	return err;
}


/* Build a new IP datagram from all its fragments. */

static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct net_device *dev)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct iphdr *iph;
	struct sk_buff *fp, *head = qp->q.fragments;
	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
	struct sk_buff **nextp; /* To build frag_list. */
	struct rb_node *rbn;
	int len;
	int ihlen;
	int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
		goto out_fail;
	}
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
		if (!fp->next)
		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
		if (qp->q.fragments_tail == skb)
			qp->q.fragments_tail = fp;
		prev->next = fp;

		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;

		consume_skb(qp->q.fragments);
		qp->q.fragments = head;
		skb_morph(skb, head);
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&qp->q.rb_fragments);
		consume_skb(head);
		head = skb;
	}

	WARN_ON(!head);
	WARN_ON(head->ip_defrag_offset != 0);

	/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		skb->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(qp->q.net, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	skb_shinfo(head)->frag_list = head->next;
	skb_push(head, head->data - skb_network_header(head));

	for (fp=head->next; fp; fp = fp->next) {
	/* Traverse the tree in order, to build frag_list. */
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &qp->q.rb_fragments);
	while (rbn) {
		struct rb_node *rbnext = rb_next(rbn);
		fp = rb_to_skb(rbn);
		rb_erase(rbn, &qp->q.rb_fragments);
		rbn = rbnext;
		*nextp = fp;
		nextp = &fp->next;
		fp->prev = NULL;
		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
	}
	sub_frag_mem_limit(qp->q.net, head->truesize);

	*nextp = NULL;
	head->next = NULL;
	head->prev = NULL;
	head->dev = dev;
	head->tstamp = qp->q.stamp;
	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,

	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
	qp->q.fragments = NULL;
	qp->q.rb_fragments = RB_ROOT;
	qp->q.fragments_tail = NULL;
	return 0;

+1 −0
Original line number Diff line number Diff line
@@ -471,6 +471,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
					  head->csum);

	fq->q.fragments = NULL;
	fq->q.rb_fragments = RB_ROOT;
	fq->q.fragments_tail = NULL;

	return true;
Loading