Commit 10905b4a authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files
Pablo Neira Ayuso says:

====================
Netfilter fixes for net

1) Protect nft_ct template with global mutex, from Pavel Skripkin.

2) Two recent commits switched inet rt and nexthop exception hashes
   from jhash to siphash. If those two spots are problematic then
   conntrack is affected as well, so switch voer to siphash too.
   While at it, add a hard upper limit on chain lengths and reject
   insertion if this is hit. Patches from Florian Westphal.

3) Fix use-after-scope in nf_socket_ipv6 reported by KASAN,
   from Benjamin Hesmans.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
  netfilter: socket: icmp6: fix use-after-scope
  netfilter: refuse insertion if chain has grown too large
  netfilter: conntrack: switch to siphash
  netfilter: conntrack: sanitize table size default settings
  netfilter: nft_ct: protect nft_ct_pcpu_template_refcnt with mutex
====================

Link: https://lore.kernel.org/r/20210903163020.13741-1-pablo@netfilter.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 52a67fbf 730affed
Loading
Loading
Loading
Loading
+8 −5
Original line number Diff line number Diff line
@@ -17,9 +17,8 @@ nf_conntrack_acct - BOOLEAN
nf_conntrack_buckets - INTEGER
	Size of hash table. If not specified as parameter during module
	loading, the default size is calculated by dividing total memory
	by 16384 to determine the number of buckets but the hash table will
	never have fewer than 32 and limited to 16384 buckets. For systems
	with more than 4GB of memory it will be 65536 buckets.
	by 16384 to determine the number of buckets. The hash table will
	never have fewer than 1024 and never more than 262144 buckets.
	This sysctl is only writeable in the initial net namespace.

nf_conntrack_checksum - BOOLEAN
@@ -100,8 +99,12 @@ nf_conntrack_log_invalid - INTEGER
	Log invalid packets of a type specified by value.

nf_conntrack_max - INTEGER
	Size of connection tracking table.  Default value is
	nf_conntrack_buckets value * 4.
        Maximum number of allowed connection tracking entries. This value is set
        to nf_conntrack_buckets by default.
        Note that connection tracking entries are added to the table twice -- once
        for the original direction and once for the reply direction (i.e., with
        the reversed address). This means that with default settings a maxed-out
        table will have a average hash chain length of 2, not 1.

nf_conntrack_tcp_be_liberal - BOOLEAN
	- 0 - disabled (default)
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ struct ip_conntrack_stat {
	unsigned int expect_create;
	unsigned int expect_delete;
	unsigned int search_restart;
	unsigned int chaintoolong;
};

#define NFCT_INFOMASK	7UL
+1 −0
Original line number Diff line number Diff line
@@ -258,6 +258,7 @@ enum ctattr_stats_cpu {
	CTA_STATS_ERROR,
	CTA_STATS_SEARCH_RESTART,
	CTA_STATS_CLASH_RESOLVE,
	CTA_STATS_CHAIN_TOOLONG,
	__CTA_STATS_MAX,
};
#define CTA_STATS_MAX (__CTA_STATS_MAX - 1)
+1 −3
Original line number Diff line number Diff line
@@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
{
	__be16 dport, sport;
	const struct in6_addr *daddr = NULL, *saddr = NULL;
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
	struct sk_buff *data_skb = NULL;
	int doff = 0;
	int thoff = 0, tproto;
@@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
			thoff + sizeof(*hp);

	} else if (tproto == IPPROTO_ICMPV6) {
		struct ipv6hdr ipv6_var;

		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
					 &sport, &dport, &ipv6_var))
			return NULL;
+67 −36
Original line number Diff line number Diff line
@@ -21,7 +21,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
@@ -78,6 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all;
#define GC_SCAN_INTERVAL	(120u * HZ)
#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)

#define MAX_CHAINLEN	64u

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
@@ -184,25 +185,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;
static siphash_key_t nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;
	struct {
		struct nf_conntrack_man src;
		union nf_inet_addr dst_addr;
		u32 net_mix;
		u16 dport;
		u16 proto;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
	memset(&combined, 0, sizeof(combined));

	/* The direction must be ignored, so handle usable members manually. */
	combined.src = tuple->src;
	combined.dst_addr = tuple->dst.u3;
	combined.net_mix = net_hash_mix(net);
	combined.dport = (__force __u16)tuple->dst.u.all;
	combined.proto = tuple->dst.protonum;

	return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
}

static u32 scale_hash(u32 hash)
@@ -835,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

@@ -849,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (chainlen++ > MAX_CHAINLEN)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN)
			goto chaintoolong;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
@@ -867,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;

chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return -EEXIST;
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

@@ -1084,6 +1104,7 @@ int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	const struct nf_conntrack_zone *zone;
	unsigned int chainlen = 0, sequence;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
@@ -1091,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	unsigned int sequence;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
@@ -1151,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb)
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN)
			goto chaintoolong;
	}

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN) {
chaintoolong:
			nf_ct_add_to_dying_list(ct);
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
@@ -2594,26 +2627,24 @@ int nf_conntrack_init_start(void)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Idea from tcp.c: use 1/16384 of memory.
		 * On i386: 32MB machine has 512 buckets.
		 * >= 1GB machines have 16384 buckets.
		 * >= 4GB machines have 65536 buckets.
		 */
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 65536;
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;
		/* Use a max. factor of one by default to keep the average
		 * hash chain length at 2 entries.  Each entry has to be added
		 * twice (once for original direction, once for reply).
		 * When a table size is given we use the old value of 8 to
		 * avoid implicit reduction of the max entries setting.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
Loading