Commit 61dc651c authored by Jakub Kicinski
Browse files
Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

1) Allow slightly larger IPVS connection table size from Kconfig for
   64-bit arch, from Abhijeet Rastogi.

2) Since the IPVS connection table might be larger than 2^20 after the
   previous patch, limit it dynamically depending on the available memory.
   Moreover, use kvmalloc. From Julian Anastasov.

3) Do not rebuild VLAN header in nft_payload when matching source and
   destination MAC address.

4) Remove the nested RCU read-side lock/unlock pair in ip_set_test(), from Florian Westphal.

5) Allow to update set size, also from Florian.

6) Improve NAT tuple selection when connection is closing,
   from Florian Westphal.

7) Support for resetting set element stateful expression, from Phil Sutter.

8) Use NLA_POLICY_MAX to narrow down maximum attribute value in nf_tables,
   from Florian Westphal.

* tag 'nf-next-23-06-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: nf_tables: limit allowed range via nla_policy
  netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET
  netfilter: snat: evict closing tcp entries on reply tuple collision
  netfilter: nf_tables: permit update of set size
  netfilter: ipset: remove rcu_read_lock_bh pair from ip_set_test
  netfilter: nft_payload: rebuild vlan header when needed
  ipvs: dynamically limit the connection hash table
  ipvs: increase ip_vs_conn_tab_bits range for 64BIT
====================

Link: https://lore.kernel.org/r/20230626064749.75525-1-pablo@netfilter.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 771ca3de a412dbf4
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -1611,6 +1611,7 @@ struct nft_trans_set {
	u64				timeout;
	bool				update;
	bool				bound;
	u32				size;
};

#define nft_trans_set(trans)	\
@@ -1625,6 +1626,8 @@ struct nft_trans_set {
	(((struct nft_trans_set *)trans->data)->timeout)
/* Access the gc_int field of the struct nft_trans_set stored in trans->data. */
#define nft_trans_set_gc_int(trans)	\
	(((struct nft_trans_set *)trans->data)->gc_int)
/* Access the size field of the struct nft_trans_set stored in trans->data. */
#define nft_trans_set_size(trans)	\
	(((struct nft_trans_set *)trans->data)->size)

struct nft_trans_chain {
	struct nft_chain		*chain;
+2 −0
Original line number Diff line number Diff line
@@ -105,6 +105,7 @@ enum nft_verdicts {
 * @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes)
 * @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes)
 * @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes)
 * @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes)
 */
enum nf_tables_msg_types {
	NFT_MSG_NEWTABLE,
@@ -140,6 +141,7 @@ enum nf_tables_msg_types {
	NFT_MSG_DESTROYSETELEM,
	NFT_MSG_DESTROYOBJ,
	NFT_MSG_DESTROYFLOWTABLE,
	NFT_MSG_GETSETELEM_RESET,
	NFT_MSG_MAX,
};

+0 −2
Original line number Diff line number Diff line
@@ -739,9 +739,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
		return 0;

	rcu_read_lock_bh();
	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
	rcu_read_unlock_bh();

	if (ret == -EAGAIN) {
		/* Type requests element to be completed */
+14 −13
Original line number Diff line number Diff line
@@ -44,7 +44,8 @@ config IP_VS_DEBUG

config	IP_VS_TAB_BITS
	int "IPVS connection table size (the Nth power of 2)"
	range 8 20
	range 8 20 if !64BIT
	range 8 27 if 64BIT
	default 12
	help
	  The IPVS connection hash table uses the chaining scheme to handle
@@ -54,24 +55,24 @@ config IP_VS_TAB_BITS

	  Note the table size must be a power of 2. The table size will be 2
	  raised to the power of the number you enter. The number to choose is
	  from 8 to 20, the default number is 12, which means the table size
	  is 4096. Don't input the number too small, otherwise you will lose
	  performance on it. You can adapt the table size yourself, according
	  to your virtual server application. It is good to set the table size
	  not far less than the number of connections per second multiplying
	  average lasting time of connection in the table.  For example, your
	  virtual server gets 200 connections per second, the connection lasts
	  for 200 seconds in average in the connection table, the table size
	  should be not far less than 200x200, it is good to set the table
	  size 32768 (2**15).
	  from 8 to 27 for 64BIT(20 otherwise), the default number is 12,
	  which means the table size is 4096. Don't input the number too
	  small, otherwise you will lose performance on it. You can adapt the
	  table size yourself, according to your virtual server application.
	  It is good to set the table size not far less than the number of
	  connections per second multiplying average lasting time of
	  connection in the table.  For example, your virtual server gets 200
	  connections per second, the connection lasts for 200 seconds in
	  average in the connection table, the table size should be not far
	  less than 200x200, it is good to set the table size 32768 (2**15).

	  Another note that each connection occupies 128 bytes effectively and
	  each hash entry uses 8 bytes, so you can estimate how much memory is
	  needed for your box.

	  You can overwrite this number setting conn_tab_bits module parameter
	  or by appending ip_vs.conn_tab_bits=? to the kernel command line
	  if IP VS was compiled built-in.
	  or by appending ip_vs.conn_tab_bits=? to the kernel command line if
	  IP VS was compiled built-in.

comment "IPVS transport protocol load balancing support"

+17 −9
Original line number Diff line number Diff line
@@ -26,7 +26,6 @@
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>		/* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
@@ -1482,13 +1481,21 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_conn_init(void)
{
	size_t tab_array_size;
	int max_avail;
#if BITS_PER_LONG > 32
	int max = 27;
#else
	int max = 20;
#endif
	int min = 8;
	int idx;

	/* Compute size and mask */
	if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
		pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
		ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
	}
	max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
	max_avail -= 2;		/* ~4 in hash row */
	max_avail -= 1;		/* IPVS up to 1/2 of mem */
	max_avail -= order_base_2(sizeof(struct ip_vs_conn));
	max = clamp(max, min, max_avail);
	ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max);
	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

@@ -1497,7 +1504,8 @@ int __init ip_vs_conn_init(void)
	 */
	tab_array_size = array_size(ip_vs_conn_tab_size,
				    sizeof(*ip_vs_conn_tab));
	ip_vs_conn_tab = vmalloc(tab_array_size);
	ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
					sizeof(*ip_vs_conn_tab), GFP_KERNEL);
	if (!ip_vs_conn_tab)
		return -ENOMEM;

@@ -1506,7 +1514,7 @@ int __init ip_vs_conn_init(void)
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL);
	if (!ip_vs_conn_cachep) {
		vfree(ip_vs_conn_tab);
		kvfree(ip_vs_conn_tab);
		return -ENOMEM;
	}

@@ -1534,5 +1542,5 @@ void ip_vs_conn_cleanup(void)
	rcu_barrier();
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	vfree(ip_vs_conn_tab);
	kvfree(ip_vs_conn_tab);
}
Loading