Unverified Commit 48d03614 authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!2713 Fix netfilter conntrack

Merge Pull Request from: @ci-robot 
 
PR sync from: Lu Wei <luwei32@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/6GSEK3YM6DDWX2QQXZ7ROS4QPFIJFZDM/ 
Florian Westphal (7):
  netfilter: conntrack: tell compiler to not inline nf_ct_resolve_clash
  netfilter: conntrack: remove two args from resolve_clash
  netfilter: conntrack: place confirm-bit setting in a helper
  netfilter: conntrack: split resolve_clash function
  netfilter: conntrack: allow insertion of clashing entries
  netfilter: conntrack: do not auto-delete clash entries on reply
  netfilter: conntrack: fix infinite loop on rmmod


-- 
2.34.1
 
https://gitee.com/openeuler/kernel/issues/I63OS1 
 
Link:https://gitee.com/openeuler/kernel/pulls/2713

 

Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu YongQiang <liuyongqiang13@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
parents 3d1841e0 3098db4c
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -137,6 +137,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
	}
}

/**
 * hlist_nulls_add_fake - set up a node that is not linked into any list
 * @n: node to initialise
 *
 * Points @n->pprev at @n's own ->next field and stores a nulls marker
 * (built from NULL) in @n->next, so the node's link fields refer only
 * to the node itself.
 *
 * After that hlist_nulls_del will work.
 */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
	/* back-pointer refers to our own forward link, not to a list head */
	n->pprev = &n->next;
	/* forward link is an end-of-list nulls marker */
	n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
}

/**
 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:	the type * to use as a loop cursor.
+11 −1
Original line number Diff line number Diff line
@@ -97,6 +97,15 @@ enum ip_conntrack_status {
	IPS_UNTRACKED_BIT = 12,
	IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),

#ifdef __KERNEL__
	/* Re-purposed for in-kernel use:
	 * Tags a conntrack entry that clashed with an existing entry
	 * on insert.
	 */
	IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT,
	IPS_NAT_CLASH = IPS_UNTRACKED,
#endif

	/* Conntrack got a helper explicitly attached via CT target. */
	IPS_HELPER_BIT = 13,
	IPS_HELPER = (1 << IPS_HELPER_BIT),
@@ -110,7 +119,8 @@ enum ip_conntrack_status {
	 */
	IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED |
				 IPS_OFFLOAD),

	__IPS_MAX_BIT = 15,
};
+181 −34
Original line number Diff line number Diff line
@@ -869,31 +869,179 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
	}
}

/* Resolve race on insertion if this protocol allows this. */
static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
			       enum ip_conntrack_info ctinfo,
/* Common preparation done on a conntrack right before it is inserted:
 * bump its use count, set the CONFIRMED status bit and, if
 * nf_conn_tstamp_find() returns a timestamp extension, record the start
 * time taken from @skb (stamping @skb first when it has no timestamp).
 */
static void __nf_conntrack_insert_prepare(struct nf_conn *ct, struct sk_buff *skb)
{
	struct nf_conn_tstamp *stamp;

	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* conntrack timestamp is optional: nothing more to do without it */
	stamp = nf_conn_tstamp_find(ct);
	if (!stamp)
		return;

	/* skb has not been timestamped yet, do it now */
	if (skb->tstamp == 0)
		__net_timestamp(skb);

	stamp->start = ktime_to_ns(skb->tstamp);
}

static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info oldinfo;
	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return NF_DROP;

	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->allow_clash &&
	    !nf_ct_is_dying(ct) &&
	    atomic_inc_not_zero(&ct->ct_general.use)) {
	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_add_to_dying_list(loser_ct);
		nf_conntrack_put(&loser_ct->ct_general);
			nf_ct_set(skb, ct, oldinfo);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, insert_failed);
		return NF_ACCEPT;
	}

	nf_ct_put(ct);
	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_ACCEPT once the entry has been inserted.  If the reply tuple
 * is already present in the table, the result of __nf_ct_resolve_clash()
 * for that existing entry is returned instead, i.e. NF_DROP if the clash
 * could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	loser_ct->timeout = nfct_time_stamp + HZ;

	/* IPS_NAT_CLASH tags the entry as one that clashed on insert.
	 * It also prevents the UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 * NOTE(review): this comment used to say the entry is removed
	 * automatically on the first reply; the UDP tracker only skips
	 * ASSURED for such entries -- confirm nothing else deletes them
	 * on reply.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	/* takes a reference and sets CONFIRMED before the entry is hashed */
	__nf_conntrack_insert_prepare(loser_ct, skb);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* only the REPLY tuple is really linked into the hash table */
	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop,
 * provided the l4 protocol of the existing entry sets ->allow_clash.
 *
 * First, __nf_ct_resolve_clash() is tried: if the existing entry has no
 * NAT transformation done or if the colliding entries match, only the
 * to-be-confirmed conntrack entry is discarded and @skb is associated
 * with the conntrack entry already in the table.
 *
 * Failing that, nf_ct_resolve_clash_harder() still adds the new,
 * unconfirmed conntrack to the table provided that its reply tuple is
 * not in the table yet.  Only the REPLY direction is hashed; the ORIGINAL
 * direction gets a fake add, so lookups in the ORIGINAL direction will
 * continue to find the existing entry.  The new entry is also given a
 * fixed, short timeout so it expires.
 *
 * Returns NF_ACCEPT if the clash was resolved.  Otherwise the unconfirmed
 * conntrack is put on the dying list, the drop and insert_failed counters
 * are incremented and NF_DROP is returned.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* Winner of the race: the conntrack entry already in the hashes. */
	struct nf_conn *winner = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	proto = __nf_ct_l4proto_find(nf_ct_l3num(winner),
				     nf_ct_protonum(winner));
	if (proto->allow_clash) {
		/* cheap path: reuse the entry that is already in the table */
		if (__nf_ct_resolve_clash(skb, h) == NF_ACCEPT)
			return NF_ACCEPT;

		/* fall back to inserting the clashing entry itself */
		if (nf_ct_resolve_clash_harder(skb, reply_hash) == NF_ACCEPT)
			return NF_ACCEPT;
	}

	/* clash could not be resolved: the new conntrack loses */
	nf_ct_add_to_dying_list(loser_ct);
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

@@ -906,7 +1054,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conn_tstamp *tstamp;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
@@ -963,6 +1110,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

@@ -983,17 +1131,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;
	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		if (skb->tstamp == 0)
			__net_timestamp(skb);

		tstamp->start = ktime_to_ns(skb->tstamp);
	}
	__nf_conntrack_insert_prepare(ct, skb);
	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
@@ -1012,11 +1150,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	return NF_ACCEPT;

out:
	nf_ct_add_to_dying_list(ct);
	ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return ret;
}
@@ -1923,8 +2059,19 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		nf_conntrack_lock(lockp);
		if (*bucket < nf_conntrack_htable_size) {
			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
				if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
					continue;
				/* All nf_conn objects are added to hash table twice, once
				 * for original direction tuple, once for the reply tuple.
				 *
				 * Exception: In the IPS_NAT_CLASH case, only the reply
				 * tuple is added (the original tuple already existed for
				 * a different object).
				 *
				 * We only need to call the iterator once for each
				 * conntrack, so we just use the 'reply' direction
				 * tuple while iterating.
				 */
				ct = nf_ct_tuplehash_to_ctrack(h);
				if (iter(ct, data))
					goto found;
+6 −2
Original line number Diff line number Diff line
@@ -54,12 +54,16 @@ static int udp_packet(struct nf_conn *ct,
	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
		nf_ct_refresh_acct(ct, ctinfo, skb,
				   timeouts[UDP_CT_REPLIED]);

		/* never set ASSURED for IPS_NAT_CLASH, they time out soon */
		if (unlikely((ct->status & IPS_NAT_CLASH)))
			return NF_ACCEPT;

		/* Also, more likely to be important, and not a probe */
		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
			nf_conntrack_event_cache(IPCT_ASSURED, ct);
	} else {
		nf_ct_refresh_acct(ct, ctinfo, skb,
				   timeouts[UDP_CT_UNREPLIED]);
		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
	}
	return NF_ACCEPT;
}
+1 −1
Original line number Diff line number Diff line
@@ -100,7 +100,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & IPS_SEQ_ADJUST)
	    ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
		goto out;

	if (!nf_ct_is_confirmed(ct))