Commit 6f1a298b authored by Jakub Kicinski
Browse files

Merge branch 'inet-add-drop-monitor-support'

Eric Dumazet says:

====================
inet: add drop monitor support

I recently tried to analyse flakes in ip_defrag selftest.
This failed miserably.

IPv4 and IPv6 reassembly units are causing false kfree_skb()
notifications. It is time to deal with this issue.

First two patches are changing core networking to better
deal with eventual skb frag_list chains, in respect
of kfree_skb/consume_skb status.

Last three patches are adding three new drop reasons,
and make sure skbs that have been reassembled into
a large datagram are no longer viewed as dropped ones.

After this, understanding why ip_defrag selftest is flaky
is possible using standard drop monitoring tools.
====================

Link: https://lore.kernel.org/r/20221029154520.2747444-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b98deb2f 3bdfb04f
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -68,6 +68,9 @@
	FN(IP_INADDRERRORS)		\
	FN(IP_INNOROUTES)		\
	FN(PKT_TOO_BIG)			\
	FN(DUP_FRAG)			\
	FN(FRAG_REASM_TIMEOUT)		\
	FN(FRAG_TOO_FAR)		\
	FNe(MAX)

/**
@@ -80,6 +83,8 @@ enum skb_drop_reason {
	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
	 */
	SKB_NOT_DROPPED_YET = 0,
	/** @SKB_CONSUMED: packet has been consumed */
	SKB_CONSUMED,
	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
	SKB_DROP_REASON_NOT_SPECIFIED,
	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
@@ -298,6 +303,15 @@ enum skb_drop_reason {
	 * MTU)
	 */
	SKB_DROP_REASON_PKT_TOO_BIG,
	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
	SKB_DROP_REASON_DUP_FRAG,
	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
	/**
	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
	 * (/proc/sys/net/ipv4/ipfrag_max_dist)
	 */
	SKB_DROP_REASON_FRAG_TOO_FAR,
	/**
	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
	 * used as a real 'reason'
+5 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include <linux/in6.h>
#include <linux/rbtree_types.h>
#include <linux/refcount.h>
#include <net/dropreason.h>

/* Per netns frag queues directory */
struct fqdir {
@@ -34,12 +35,14 @@ struct fqdir {
 * @INET_FRAG_LAST_IN: final fragment has arrived
 * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
 * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
 * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed)
 */
enum {
	INET_FRAG_FIRST_IN	= BIT(0),
	INET_FRAG_LAST_IN	= BIT(1),
	/* tested by ip6frag_expire_frag_queue() to bail out early, and
	 * asserted (WARN_ON) by inet_frag_destroy() before it purges the queue
	 */
	INET_FRAG_COMPLETE	= BIT(2),
	INET_FRAG_HASH_DEAD	= BIT(3),
	/* set by ip6frag_expire_frag_queue() and inet_frags_free_cb();
	 * makes inet_frag_destroy() free the queued skbs with
	 * SKB_DROP_REASON_FRAG_REASM_TIMEOUT instead of SKB_CONSUMED
	 */
	INET_FRAG_DROP		= BIT(4),
};

struct frag_v4_compare_key {
@@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);

/*
 * Free all skbs in the queue; return the sum of their truesizes.
 * In the two-argument form, @reason is handed to kfree_skb_reason()
 * for every skb that is freed.
 *
 * NOTE(review): this hunk is rendered without +/- markers; the
 * one-argument prototype is presumably the removed pre-change line --
 * confirm against the upstream commit.
 */
unsigned int inet_frag_rbtree_purge(struct rb_root *root);
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
				    enum skb_drop_reason reason);

static inline void inet_frag_put(struct inet_frag_queue *q)
{
+2 −1
Original line number Diff line number Diff line
@@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
	if (fq->q.flags & INET_FRAG_COMPLETE)
		goto out;

	fq->q.flags |= INET_FRAG_DROP;
	inet_frag_kill(&fq->q);

	dev = dev_get_by_index_rcu(net, fq->iif);
@@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
	spin_unlock(&fq->q.lock);

	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
	kfree_skb(head);
	kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
	goto out_rcu_unlock;

out:
+17 −13
Original line number Diff line number Diff line
@@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
/* Redefine FN() so that each entry expands to a designated initializer:
 * index SKB_DROP_REASON_<reason>, value the stringified <reason>.
 */
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
/* Name strings indexed by drop reason value. */
const char * const drop_reasons[] = {
	/* SKB_CONSUMED carries no SKB_DROP_REASON_ prefix, so FN() cannot
	 * produce its index; it is therefore listed by hand.
	 */
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);
@@ -768,7 +769,7 @@ static void skb_free_head(struct sk_buff *skb)
	}
}

static void skb_release_data(struct sk_buff *skb)
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;
@@ -791,7 +792,7 @@ static void skb_release_data(struct sk_buff *skb)

free_head:
	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
@@ -854,11 +855,11 @@ void skb_release_head_state(struct sk_buff *skb)
}

/* Free everything but the sk_buff shell.
 *
 * Releases the head state first, then, when skb->head is set, the data.
 * @reason is forwarded to skb_release_data(), which passes it on to
 * kfree_skb_list_reason() for the frag_list.
 *
 * NOTE(review): this hunk is rendered without +/- markers; the
 * one-argument signature and call are presumably the removed pre-change
 * lines -- confirm against the upstream commit.
 */
static void skb_release_all(struct sk_buff *skb)
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
		skb_release_data(skb, reason);
}

/**
@@ -872,7 +873,7 @@ static void skb_release_all(struct sk_buff *skb)

void __kfree_skb(struct sk_buff *skb)
{
	/* No reason is supplied by the caller on this path, so anything
	 * released through skb_release_all() is tagged
	 * SKB_DROP_REASON_NOT_SPECIFIED.
	 *
	 * NOTE(review): rendered diff without +/- markers; the one-argument
	 * call is presumably the removed pre-change line -- confirm.
	 */
	skb_release_all(skb);
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	/* the sk_buff shell itself is freed last */
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -894,6 +895,9 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)

	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb);
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	__kfree_skb(skb);
}
@@ -1052,7 +1056,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
	/* Consume path: fire the consume tracepoint, then release only the
	 * data (skb_release_head_state() is not called here), tagging the
	 * release as SKB_CONSUMED.
	 *
	 * NOTE(review): rendered diff without +/- markers; the one-argument
	 * skb_release_data() call is presumably the removed pre-change line
	 * -- confirm.
	 */
	trace_consume_skb(skb);
	skb_release_data(skb);
	skb_release_data(skb, SKB_CONSUMED);
	kfree_skbmem(skb);
}

@@ -1077,7 +1081,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)

void __kfree_skb_defer(struct sk_buff *skb)
{
	/* Same release as __kfree_skb() (reason NOT_SPECIFIED), but the
	 * sk_buff shell goes to napi_skb_cache_put() instead of
	 * kfree_skbmem().
	 *
	 * NOTE(review): rendered diff without +/- markers; the one-argument
	 * call is presumably the removed pre-change line -- confirm.
	 */
	skb_release_all(skb);
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	napi_skb_cache_put(skb);
}

@@ -1115,7 +1119,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
		return;
	}

	skb_release_all(skb);
	skb_release_all(skb, SKB_CONSUMED);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1246,7 +1250,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	/* Release what @dst currently holds before it is overwritten by
	 * __skb_clone(); the release is tagged SKB_CONSUMED rather than
	 * a drop reason.
	 *
	 * NOTE(review): rendered diff without +/- markers; the one-argument
	 * call is presumably the removed pre-change line -- confirm.
	 */
	skb_release_all(dst);
	skb_release_all(dst, SKB_CONSUMED);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1869,7 +1873,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
		skb_release_data(skb, SKB_CONSUMED);
	} else {
		skb_free_head(skb);
	}
@@ -6209,7 +6213,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb);
		skb_release_data(skb, SKB_CONSUMED);
	} else {
		/* we can reuse existing refcount- all we did was
		 * relocate values
@@ -6352,7 +6356,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
		kfree(data);
		return -ENOMEM;
	}
	skb_release_data(skb);
	skb_release_data(skb, SKB_CONSUMED);

	skb->head = data;
	skb->head_frag = 0;
+10 −4
Original line number Diff line number Diff line
@@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
	count = del_timer_sync(&fq->timer) ? 1 : 0;

	spin_lock_bh(&fq->lock);
	fq->flags |= INET_FRAG_DROP;
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		count++;
@@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
	kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root)
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
				    enum skb_drop_reason reason)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;
@@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root)
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			kfree_skb_reason(skb, reason);
			skb = next;
		}
	}
@@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct fqdir *fqdir;
	unsigned int sum, sum_truesize = 0;
	enum skb_drop_reason reason;
	struct inet_frags *f;
	struct fqdir *fqdir;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	reason = (q->flags & INET_FRAG_DROP) ?
			SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
			SKB_CONSUMED;
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fqdir = q->fqdir;
	f = fqdir->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);
Loading