Commit 7fe7f318 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files
Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

1) ipset limits the max allocatable memory via kvmalloc() to MAX_INT,
   from Jozsef Kadlecsik.

2) Check ip_vs_conn_tab_bits value to be in the range specified
   in Kconfig, from Andrea Claudi.

3) Initialize fragment offset in ip6tables, from Jeremy Sowden.

4) Make conntrack hash chain length random, from Florian Westphal.

5) Add zone ID to conntrack and NAT hashtuple again, also from Florian.

6) Add selftests for bidirectional zone support and colliding tuples,
   from Florian Westphal.

7) Unlink table before synchronize_rcu when cleaning tables with
   owner, from Florian.

8) ipset limits the max allocatable memory via kvmalloc() to MAX_INT.

9) Release conntrack entries via workqueue in masquerade, from Florian.

10) Fix bogus net_init in iptables raw table definition, also from Florian.

11) Work around missing softdep in log extensions, from Florian Westphal.

12) Serialize hash resizes and cleanups with mutex, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
  netfilter: conntrack: serialize hash resizes and cleanups
  netfilter: log: work around missing softdep backend module
  netfilter: iptable_raw: drop bogus net_init annotation
  netfilter: nf_nat_masquerade: defer conntrack walk to work queue
  netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic
  netfilter: nf_tables: Fix oversized kvmalloc() calls
  netfilter: nf_tables: unlink table before deleting it
  selftests: netfilter: add zone stress test with colliding tuples
  selftests: netfilter: add selftest for directional zone support
  netfilter: nat: include zone id in nat table hash again
  netfilter: conntrack: include zone id in tuple hash again
  netfilter: conntrack: make max chain length random
  netfilter: ip6_tables: zero-initialize fragment offset
  ipvs: check that ip_vs_conn_tab_bits is between 8 and 20
  netfilter: ipset: Fix oversized kvmalloc() calls
====================

Link: https://lore.kernel.org/r/20210924221113.348767-1-pablo@netfilter.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 4526fe74 e9edc188
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -42,7 +42,7 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,

static struct nf_hook_ops *rawtable_ops __read_mostly;

static int __net_init iptable_raw_table_init(struct net *net)
static int iptable_raw_table_init(struct net *net)
{
	struct ipt_replace *repl;
	const struct xt_table *table = &packet_raw;
+1 −0
Original line number Diff line number Diff line
@@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb,
	 * things we don't know, ie. tcp syn flag or ports).  If the
	 * rule is also a fragment-specific rule, non-fragments won't
	 * match it. */
	acpar.fragoff = 0;
	acpar.hotdrop = false;
	acpar.state   = state;

+2 −2
Original line number Diff line number Diff line
@@ -130,11 +130,11 @@ htable_size(u8 hbits)
{
	size_t hsize;

	/* We must fit both into u32 in jhash and size_t */
	/* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
	if (hbits > 31)
		return 0;
	hsize = jhash_size(hbits);
	if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
	if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
	    < hsize)
		return 0;

+4 −0
Original line number Diff line number Diff line
@@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void)
	int idx;

	/* Compute size and mask */
	if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
		pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
		ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
	}
	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

+100 −54
Original line number Diff line number Diff line
@@ -74,10 +74,14 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL	(120u * HZ)
#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)

#define MAX_CHAINLEN	64u
#define MIN_CHAINLEN	8u
#define MAX_CHAINLEN	(32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

@@ -188,11 +192,13 @@ seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_key_t nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	struct {
		struct nf_conntrack_man src;
		union nf_inet_addr dst_addr;
		unsigned int zone;
		u32 net_mix;
		u16 dport;
		u16 proto;
@@ -205,6 +211,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
	/* The direction must be ignored, so handle usable members manually. */
	combined.src = tuple->src;
	combined.dst_addr = tuple->dst.u3;
	combined.zone = zoneid;
	combined.net_mix = net_hash_mix(net);
	combined.dport = (__force __u16)tuple->dst.u.all;
	combined.proto = tuple->dst.protonum;
@@ -219,15 +226,17 @@ static u32 scale_hash(u32 hash)

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
@@ -650,9 +659,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
@@ -819,8 +830,20 @@ struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		return thash;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		return __nf_conntrack_find_get(net, zone, tuple,
				       hash_conntrack_raw(tuple, net));
					       hash_conntrack_raw(tuple, rid, net));
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

@@ -842,6 +865,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;
@@ -852,18 +876,22 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > MAX_CHAINLEN)
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

@@ -873,7 +901,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN)
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

@@ -1103,8 +1131,8 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int chainlen = 0, sequence;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
@@ -1133,8 +1161,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
@@ -1168,6 +1196,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
@@ -1175,7 +1204,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN)
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

@@ -1184,7 +1213,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > MAX_CHAINLEN) {
		if (chainlen++ > max_chainlen) {
chaintoolong:
			nf_ct_add_to_dying_list(ct);
			NF_CT_STAT_INC(net, chaintoolong);
@@ -1246,7 +1275,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1687,8 +1716,8 @@ resolve_normal_ct(struct nf_conn *tmpl,
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;
	u32 hash;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
@@ -1699,8 +1728,20 @@ resolve_normal_ct(struct nf_conn *tmpl,

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	hash = hash_conntrack_raw(&tuple, state->net);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
@@ -2225,11 +2266,15 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		if (*bucket < nf_conntrack_htable_size) {
			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to hash table twice, one
@@ -2247,7 +2292,6 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
			if (iter(ct, data))
				goto found;
		}
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
@@ -2264,14 +2308,12 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  void *data, u32 portid, int report)
{
	unsigned int bucket = 0, sequence;
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	for (;;) {
		sequence = read_seqcount_begin(&nf_conntrack_generation);

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
		/* Time to push up daises... */

@@ -2279,11 +2321,7 @@ static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
		nf_ct_put(ct);
		cond_resched();
	}

		if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
			break;
		bucket = 0;
	}
	mutex_unlock(&nf_conntrack_mutex);
}

struct iter_data {
@@ -2519,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
	if (!hash)
		return -ENOMEM;

	mutex_lock(&nf_conntrack_mutex);
	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		mutex_unlock(&nf_conntrack_mutex);
		kvfree(hash);
		return 0;
	}
@@ -2537,12 +2577,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize)

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			unsigned int zone_id;

			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);

			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, hashsize);
						  &h->tuple, zone_id, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
@@ -2556,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
	nf_conntrack_all_unlock();
	local_bh_enable();

	mutex_unlock(&nf_conntrack_mutex);

	synchronize_net();
	kvfree(old_hash);
	return 0;
Loading