Commit fd258f2a authored by David S. Miller
Browse files

Merge branch 'udp-pernetns-hash'

Kuniyuki Iwashima says:

====================
udp: Introduce optional per-netns hash table.

This series is the UDP version of the per-netns ehash series [0]; the
two were initially part of the same patch set. [1]

The notable difference from TCP is that the max table size is 64K and the
min size is 128.  This is because the possible hash range of udp_hashfn()
always fits in 64K within the same netns and because we want to keep a
bitmap in udp_lib_get_port() on the stack.  Also, the UDP per-netns table
isolates both the 1-tuple and 2-tuple tables.

For details, please see the last patch.

  patch 1 - 4: prep for per-netns hash table
  patch     5: add per-netns hash table

[0]: https://lore.kernel.org/netdev/20220908011022.45342-1-kuniyu@amazon.com/
[1]: https://lore.kernel.org/netdev/20220826000445.46552-1-kuniyu@amazon.com/


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents e8822565 9804985b
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
@@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER
udp_wmem_min - INTEGER
	UDP does not have tx memory accounting and this tunable has no effect.

udp_hash_entries - INTEGER
	Show the number of hash buckets for UDP sockets in the current
	networking namespace.

	A negative value means the networking namespace does not own its
	hash buckets and shares those of the initial networking namespace.

udp_child_hash_entries - INTEGER
	Control the number of hash buckets for UDP sockets in the child
	networking namespace, which must be set before clone() or unshare().

	If the value is not 0, the kernel uses a value rounded up to 2^n
	as the actual hash bucket size.  0 is a special value, meaning
	the child networking namespace will share the initial networking
	namespace's hash buckets.

	Note that the child will use the global one in case the kernel
	fails to allocate enough memory.  In addition, the global hash
	buckets are spread over available NUMA nodes, but the allocation
	of the child hash table depends on the current process's NUMA
	policy, which could result in performance differences.

	Possible values: 0, 2^n (n: 7 (128) - 16 (64K))

	Default: 0


RAW variables
=============

+2 −0
Original line number Diff line number Diff line
@@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
	return (struct udphdr *)skb_transport_header(skb);
}

/* Minimum size of a per-netns UDP hash table. */
#define UDP_HTABLE_SIZE_MIN_PERNET	128
/* Minimum UDP hash table size: 128 when CONFIG_BASE_SMALL is non-zero,
 * 256 otherwise.
 */
#define UDP_HTABLE_SIZE_MIN		(CONFIG_BASE_SMALL ? 128 : 256)
/* Maximum UDP hash table size (64K); also the upper bound accepted by
 * the udp_child_hash_entries sysctl.
 */
#define UDP_HTABLE_SIZE_MAX		65536

static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{
+3 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ struct tcp_fastopen_context;

struct netns_ipv4 {
	struct inet_timewait_death_row tcp_death_row;
	struct udp_table *udp_table;

#ifdef CONFIG_SYSCTL
	struct ctl_table_header	*forw_hdr;
@@ -207,6 +208,8 @@ struct netns_ipv4 {

	atomic_t dev_addr_genid;

	unsigned int sysctl_udp_child_hash_entries;

#ifdef CONFIG_SYSCTL
	unsigned long *sysctl_local_reserved_ports;
	int sysctl_ip_prot_sock;
+2 −2
Original line number Diff line number Diff line
@@ -6432,7 +6432,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
		else
			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
					       dst4, tuple->ipv4.dport,
					       dif, sdif, &udp_table, NULL);
					       dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
@@ -6448,7 +6448,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
							    src6, tuple->ipv6.sport,
							    dst6, tuple->ipv6.dport,
							    dif, sdif,
							    &udp_table, NULL);
							    net->ipv4.udp_table, NULL);
#endif
	}

+40 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;

@@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
	if (!net_eq(net, &init_net) && !hinfo->pernet)
		tcp_ehash_entries *= -1;

	memset(&tbl, 0, sizeof(tbl));
	tbl.data = &tcp_ehash_entries;
	tbl.maxlen = sizeof(int);

	return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}

/* Handler for the read-only "udp_hash_entries" sysctl.
 *
 * Recovers the owning netns from table->data, computes the bucket count
 * of that netns's UDP hash table (mask + 1) and hands the value to
 * proc_dointvec() through a temporary on-stack ctl_table.
 */
static int proc_udp_hash_entries(struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = container_of(table->data, struct net,
				       ipv4.sysctl_udp_child_hash_entries);
	struct ctl_table tmp;
	int nr_buckets;

	nr_buckets = net->ipv4.udp_table->mask + 1;

	/* Flip the sign when a non-initial netns is using the global
	 * udp_table rather than a table of its own.
	 */
	if (net->ipv4.udp_table == &udp_table && !net_eq(net, &init_net))
		nr_buckets = -nr_buckets;

	/* proc_dointvec() operates on a scratch table so that the computed
	 * value, not table->data, is what gets formatted.
	 */
	memset(&tmp, 0, sizeof(tmp));
	tmp.maxlen = sizeof(nr_buckets);
	tmp.data = &nr_buckets;

	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
					  void *buffer, size_t *lenp,
@@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = {
		.extra1		= SYSCTL_ZERO,
		.extra2		= &tcp_child_ehash_entries_max,
	},
	{
		.procname	= "udp_hash_entries",
		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
		.mode		= 0444,
		.proc_handler	= proc_udp_hash_entries,
	},
	{
		.procname	= "udp_child_hash_entries",
		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= &udp_child_hash_entries_max,
	},
	{
		.procname	= "udp_rmem_min",
		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
Loading