Commit 45897255 authored by Florian Westphal, committed by Pablo Neira Ayuso

netfilter: snat: evict closing tcp entries on reply tuple collision

When all tried source tuples are in use, the connection request (skb)
and the new conntrack will be dropped in nf_confirm() due to the
non-recoverable clash.

Make it so that the last 32 attempts are allowed to evict a colliding
entry, provided the colliding connection is already closing and the new
connection's sequence number has advanced past the old one.

Such "all tuples taken" secenario can happen with tcp-rpc workloads where
same dst:dport gets queried repeatedly.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
parent 96b2ef9b
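
To make the "last 32 attempts" budget concrete, here is a minimal
standalone userspace sketch (not part of the patch) of how the attempt
budget splits between the plain availability check and the destructive
one. The two macros mirror the diff below; the printf bodies merely
stand in for the real conntrack lookups.

	#include <stdio.h>

	#define NF_NAT_MAX_ATTEMPTS	128
	#define NF_NAT_HARDER_THRESH	(NF_NAT_MAX_ATTEMPTS / 4)	/* 32 */

	int main(void)
	{
		unsigned int attempts = NF_NAT_MAX_ATTEMPTS, i;

		for (i = 0; i < attempts; i++) {
			unsigned int attempts_left = attempts - i;

			/* same threshold test as nf_nat_used_tuple_harder() */
			if (attempts_left > NF_NAT_HARDER_THRESH)
				printf("attempt %3u: plain availability check\n", i + 1);
			else
				printf("attempt %3u: may evict a closing entry\n", i + 1);
		}

		return 0;
	}

With a full-sized range, attempts 1-96 stay non-destructive and only
attempts 97-128 may evict, so the extra lookup cost is paid only when
the port range is nearly exhausted.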
net/netfilter/nf_nat_core.c +88 −4
@@ -27,6 +27,9 @@

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS	128
#define NF_NAT_HARDER_THRESH	(NF_NAT_MAX_ATTEMPTS / 4)

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
						  IPS_DYING;
	static const unsigned long flags_needed = IPS_SRC_NAT;
	enum tcp_conntrack old_state;

	old_state = READ_ONCE(ct->proto.tcp.state);
	if (old_state < TCP_CONNTRACK_TIME_WAIT)
		return false;

	if (flags & flags_refuse)
		return false;

	return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
	return (__s32)(new->proto.tcp.seen[0].td_end -
		       old->proto.tcp.seen[0].td_end) > 0;
}

static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(&reply, tuple);

	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict the colliding entry if it is in timewait state
	 * and the tcp sequence number has advanced past the one used
	 * by the old entry.
	 */
	net = nf_ct_net(ignored_conntrack);
	zone = nf_ct_zone(ignored_conntrack);

	thash = nf_conntrack_find_get(net, zone, &reply);
	if (!thash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(thash);

	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(ct, ignored_conntrack))
		goto out;

	/* Even if we can evict, do not reuse if the entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
@@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;
-	static const unsigned int max_attempts = 128;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
@@ -471,8 +555,8 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
		off = get_random_u16();

	attempts = range_size;
-	if (attempts > max_attempts)
-		attempts = max_attempts;
+	if (attempts > NF_NAT_MAX_ATTEMPTS)
+		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
@@ -483,7 +567,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
-		if (!nf_nat_used_tuple(tuple, ct))
+		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}
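
The eviction gate relies on nf_seq_has_advanced() above, which compares
32-bit sequence numbers with signed serial arithmetic. A minimal
userspace sketch of the same idiom (seq_after is an illustrative name,
not a kernel function), showing the comparison holds across the 2^32
wrap:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Interpreting the unsigned difference as signed makes "newer"
	 * well-defined even when the counter has wrapped around.
	 */
	static bool seq_after(uint32_t newer, uint32_t older)
	{
		return (int32_t)(newer - older) > 0;
	}

	int main(void)
	{
		assert(seq_after(1000, 500));            /* plainly later */
		assert(!seq_after(500, 1000));           /* plainly earlier */
		assert(seq_after(10, UINT32_MAX - 10));  /* later, across the wrap */
		assert(!seq_after(42, 42));              /* equal is not "advanced" */
		return 0;
	}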