!2713 Fix netfilter conntrack (48d03614) · Commits · EulixOS / Software / Kernel

include/linux/rculist_nulls.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -137,6 +137,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
		}
		}

		/*
		 * Make @n self-contained without putting it on any list: ->next is set
		 * to NULLS_MARKER(NULL) and ->pprev refers to the node's own ->next
		 * field.  After that hlist_nulls_del will work.
		 */
		static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
		{
			struct hlist_nulls_node *end_marker;

			end_marker = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
			n->next = end_marker;
			n->pprev = &n->next;
		}

		/**
		* hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
		* @tpos: the type * to use as a loop cursor.

include/uapi/linux/netfilter/nf_conntrack_common.h

+11 −1

Original line number	Diff line number	Diff line
		@@ -97,6 +97,15 @@ enum ip_conntrack_status {
		IPS_UNTRACKED_BIT = 12,
		IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),

		#ifdef __KERNEL__
		/* Re-purposed for in-kernel use:
		* Tags a conntrack entry that clashed with an existing entry
		* on insert.
		*/
		IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT,
		IPS_NAT_CLASH = IPS_UNTRACKED,
		#endif

		/* Conntrack got a helper explicitly attached via CT target. */
		IPS_HELPER_BIT = 13,
		IPS_HELPER = (1 << IPS_HELPER_BIT),
		@@ -110,7 +119,8 @@ enum ip_conntrack_status {
		*/
		IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK \| IPS_NAT_MASK \|
		IPS_EXPECTED \| IPS_CONFIRMED \| IPS_DYING \|
		IPS_SEQ_ADJUST \| IPS_TEMPLATE \| IPS_OFFLOAD),
		IPS_SEQ_ADJUST \| IPS_TEMPLATE \| IPS_UNTRACKED \|
		IPS_OFFLOAD),

		__IPS_MAX_BIT = 15,
		};

net/netfilter/nf_conntrack_core.c

+181 −34

Original line number	Diff line number	Diff line
		@@ -869,31 +869,179 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
		}
		}

		/* Resolve race on insertion if this protocol allows this. */
		static int nf_ct_resolve_clash(struct net net, struct sk_buff skb,
		enum ip_conntrack_info ctinfo,
		static void __nf_conntrack_insert_prepare(struct nf_conn ct, struct sk_buff skb)
		{
		struct nf_conn_tstamp *tstamp;

		atomic_inc(&ct->ct_general.use);
		ct->status \|= IPS_CONFIRMED;

		/* set conntrack timestamp, if enabled. */
		tstamp = nf_conn_tstamp_find(ct);
		if (tstamp) {
		if (skb->tstamp == 0)
		__net_timestamp(skb);

		tstamp->start = ktime_to_ns(skb->tstamp);
		}
		}

		static int __nf_ct_resolve_clash(struct sk_buff *skb,
		struct nf_conntrack_tuple_hash *h)
		{
		/* This is the conntrack entry already in hashes that won race. */
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		const struct nf_conntrack_l4proto *l4proto;
		enum ip_conntrack_info oldinfo;
		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
		enum ip_conntrack_info ctinfo;
		struct nf_conn *loser_ct;

		loser_ct = nf_ct_get(skb, &ctinfo);

		if (nf_ct_is_dying(ct))
		return NF_DROP;

		if (!atomic_inc_not_zero(&ct->ct_general.use))
		return NF_DROP;

		l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
		if (l4proto->allow_clash &&
		!nf_ct_is_dying(ct) &&
		atomic_inc_not_zero(&ct->ct_general.use)) {
		if (((ct->status & IPS_NAT_DONE_MASK) == 0) \|\|
		nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_add_to_dying_list(loser_ct);
		nf_conntrack_put(&loser_ct->ct_general);
		nf_ct_set(skb, ct, oldinfo);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, insert_failed);
		return NF_ACCEPT;
		}

		nf_ct_put(ct);
		return NF_DROP;
		}

		/**
		 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
		 *
		 * @skb: skb that causes the collision
		 * @repl_idx: hash slot for reply direction
		 *
		 * Called when origin or reply direction had a clash.
		 * The skb can be handled without packet drop provided the reply direction
		 * is unique or the existing entry has the identical tuple in both
		 * directions.
		 *
		 * Caller must hold conntrack table locks to prevent concurrent updates.
		 *
		 * Returns NF_ACCEPT once the entry has been inserted.  If an entry with
		 * the same reply tuple is already in the table, the verdict of
		 * __nf_ct_resolve_clash() is returned instead (NF_DROP if the clash could
		 * not be handled).
		 */
		static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
		{
			struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
			const struct nf_conntrack_zone *zone;
			struct nf_conntrack_tuple_hash *h;
			struct hlist_nulls_node *n;
			struct net *net;

			zone = nf_ct_zone(loser_ct);
			net = nf_ct_net(loser_ct);

			/* Reply direction must never result in a clash, unless both origin
			 * and reply tuples are identical.
			 */
			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
				if (nf_ct_key_equal(h,
						    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
						    zone, net))
					return __nf_ct_resolve_clash(skb, h);
			}

			/* We want the clashing entry to go away real soon: 1 second timeout. */
			loser_ct->timeout = nfct_time_stamp + HZ;

			/* IPS_NAT_CLASH removes the entry automatically on the first
			 * reply. Also prevents UDP tracker from moving the entry to
			 * ASSURED state, i.e. the entry can always be evicted under
			 * pressure.
			 */
			loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

			__nf_conntrack_insert_prepare(loser_ct, skb);

			/* fake add for ORIGINAL dir: we want lookups to only find the entry
			 * already in the table. This also hides the clashing entry from
			 * ctnetlink iteration, i.e. conntrack -L won't show them.
			 */
			hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

			hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
						 &nf_conntrack_hash[repl_idx]);
			return NF_ACCEPT;
		}

		/**
		 * nf_ct_resolve_clash - attempt to handle clash without packet drop
		 *
		 * @skb: skb that causes the clash
		 * @h: tuplehash of the clashing entry already in table
		 * @reply_hash: hash slot for reply direction
		 *
		 * A conntrack entry can be inserted to the connection tracking table
		 * if there is no existing entry with an identical tuple.
		 *
		 * If there is one, @skb (and the associated, unconfirmed conntrack) has
		 * to be dropped. In case @skb is retransmitted, next conntrack lookup
		 * will find the already-existing entry.
		 *
		 * The major problem with such packet drop is the extra delay added by
		 * the packet loss -- it will take some time for a retransmit to occur
		 * (or the sender to time out when waiting for a reply).
		 *
		 * This function attempts to handle the situation without packet drop.
		 *
		 * If @skb has no NAT transformation or if the colliding entries are
		 * exactly the same, only the to-be-confirmed conntrack entry is discarded
		 * and @skb is associated with the conntrack entry already in the table.
		 *
		 * Failing that, the new, unconfirmed conntrack is still added to the table
		 * provided that the collision only occurs in the ORIGINAL direction.
		 * The new entry will be added after the existing one in the hash list,
		 * so packets in the ORIGINAL direction will continue to match the existing
		 * entry. The new entry will also have a fixed timeout so it expires --
		 * due to the collision, it will not see bidirectional traffic.
		 *
		 * Returns NF_ACCEPT if one of the two strategies succeeded, NF_DROP if
		 * the clash could not be resolved.
		 */
		static __cold noinline int
		nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
				    u32 reply_hash)
		{
			/* This is the conntrack entry already in hashes that won race. */
			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
			const struct nf_conntrack_l4proto *l4proto;
			enum ip_conntrack_info ctinfo;
			struct nf_conn *loser_ct;
			struct net *net;
			int ret;

			loser_ct = nf_ct_get(skb, &ctinfo);
			net = nf_ct_net(loser_ct);

			/* Protocols that do not set allow_clash always take the drop path. */
			l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
			if (!l4proto->allow_clash)
				goto drop;

			/* First try: hand @skb over to the entry already in the table. */
			ret = __nf_ct_resolve_clash(skb, h);
			if (ret == NF_ACCEPT)
				return ret;

			/* Second try: insert the losing entry for the reply direction only. */
			ret = nf_ct_resolve_clash_harder(skb, reply_hash);
			if (ret == NF_ACCEPT)
				return ret;

		drop:
			nf_ct_add_to_dying_list(loser_ct);
			NF_CT_STAT_INC(net, drop);
			NF_CT_STAT_INC(net, insert_failed);
			return NF_DROP;
		}

		@@ -906,7 +1054,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		struct nf_conntrack_tuple_hash *h;
		struct nf_conn *ct;
		struct nf_conn_help *help;
		struct nf_conn_tstamp *tstamp;
		struct hlist_nulls_node *n;
		enum ip_conntrack_info ctinfo;
		struct net *net;
		@@ -963,6 +1110,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)

		if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
		}

		@@ -983,17 +1131,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		setting time, otherwise we'd get timer wrap in
		weird delay cases. */
		ct->timeout += nfct_time_stamp;
		atomic_inc(&ct->ct_general.use);
		ct->status \|= IPS_CONFIRMED;

		/* set conntrack timestamp, if enabled. */
		tstamp = nf_conn_tstamp_find(ct);
		if (tstamp) {
		if (skb->tstamp == 0)
		__net_timestamp(skb);

		tstamp->start = ktime_to_ns(skb->tstamp);
		}
		__nf_conntrack_insert_prepare(ct, skb);
		/* Since the lookup is lockless, hash insertion must be done after
		* starting the timer and setting the CONFIRMED bit. The RCU barriers
		* guarantee that no other CPU can find the conntrack before the above
		@@ -1012,11 +1150,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		return NF_ACCEPT;

		out:
		nf_ct_add_to_dying_list(ct);
		ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
		ret = nf_ct_resolve_clash(skb, h, reply_hash);
		dying:
		nf_conntrack_double_unlock(hash, reply_hash);
		NF_CT_STAT_INC(net, insert_failed);
		local_bh_enable();
		return ret;
		}
		@@ -1923,8 +2059,19 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		nf_conntrack_lock(lockp);
		if (*bucket < nf_conntrack_htable_size) {
		hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
		if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
		if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
		continue;
		/* All nf_conn objects are added to hash table twice, once
		* for original direction tuple, once for the reply tuple.
		*
		* Exception: In the IPS_NAT_CLASH case, only the reply
		* tuple is added (the original tuple already existed for
		* a different object).
		*
		* We only need to call the iterator once for each
		* conntrack, so we just use the 'reply' direction
		* tuple while iterating.
		*/
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (iter(ct, data))
		goto found;

net/netfilter/nf_conntrack_proto_udp.c

+6 −2

Original line number	Diff line number	Diff line
		@@ -54,12 +54,16 @@ static int udp_packet(struct nf_conn *ct,
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
		nf_ct_refresh_acct(ct, ctinfo, skb,
		timeouts[UDP_CT_REPLIED]);

		/* never set ASSURED for IPS_NAT_CLASH, they time out soon */
		if (unlikely((ct->status & IPS_NAT_CLASH)))
		return NF_ACCEPT;

		/* Also, more likely to be important, and not a probe */
		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_ASSURED, ct);
		} else {
		nf_ct_refresh_acct(ct, ctinfo, skb,
		timeouts[UDP_CT_UNREPLIED]);
		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
		}
		return NF_ACCEPT;
		}

net/netfilter/nft_flow_offload.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -100,7 +100,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
		}

		if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) \|\|
		ct->status & IPS_SEQ_ADJUST)
		ct->status & (IPS_SEQ_ADJUST \| IPS_NAT_CLASH))
		goto out;

		if (!nf_ct_is_confirmed(ct))