Commit 3f17e16f authored by Jakub Kicinski
Browse files

Merge branch 'add-ip_local_port_range-socket-option'

Jakub Sitnicki says:

====================
Add IP_LOCAL_PORT_RANGE socket option

This patch set is a follow up to the "How to share IPv4 addresses by
partitioning the port space" talk given at LPC 2022 [1].

Please see patch #1 for the motivation & the use case description.
Patch #2 adds tests exercising the new option in various scenarios.

Documentation
-------------

Proposed update to the ip(7) man-page:

       IP_LOCAL_PORT_RANGE (since Linux X.Y)
              Set or get the per-socket default local  port  range.  This
              option  can  be  used  to  clamp down the global local port
              range, defined by the ip_local_port_range  /proc  interface
              described below, for a given socket.

              The  option  takes  an uint32_t value with the high 16 bits
              set to the upper range bound, and the low 16  bits  set  to
              the  lower  range  bound.  Range  bounds are inclusive. The
              16-bit values should be in host byte order.

              The lower bound has to be less than the  upper  bound  when
              both  bounds  are  not  zero. Otherwise, setting the option
              fails with EINVAL.

              If either bound is outside of the global local port  range,
              or is zero, then that bound has no effect.

              To  reset  the setting, pass zero as both the upper and the
              lower bound.

Interaction with SELinux bind() hook
------------------------------------

SELinux bind() hook - selinux_socket_bind() - performs a permission check
if the requested local port number lies outside of the netns ephemeral port
range.

The proposed socket option cannot be used to change the ephemeral port range
so that it extends beyond the per-netns port range, as set by
net.ipv4.ip_local_port_range.

Hence, there is no interaction with SELinux, AFAICT.

RFC -> v1
RFC: https://lore.kernel.org/netdev/20220912225308.93659-1-jakub@cloudflare.com/

 * Allow either the high bound or the low bound, or both, to be zero
 * Add getsockopt support
 * Add selftests

Links:
------

[1]: https://lpc.events/event/16/contributions/1349/
====================

Link: https://lore.kernel.org/r/20221221-sockopt-port-range-v6-0-be255cc0e51f@cloudflare.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 6a7a2c18 ae543965
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -249,6 +249,10 @@ struct inet_sock {
	__be32			mc_addr;
	struct ip_mc_socklist __rcu	*mc_list;
	struct inet_cork_full	cork;
	struct {
		__u16 lo;
		__u16 hi;
	}			local_port_range;
};

#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
+2 −1
Original line number Diff line number Diff line
@@ -340,7 +340,8 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
	} \
}

void inet_get_local_port_range(struct net *net, int *low, int *high);
void inet_get_local_port_range(const struct net *net, int *low, int *high);
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);

#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
+1 −0
Original line number Diff line number Diff line
@@ -162,6 +162,7 @@ struct in_addr {
#define MCAST_MSFILTER			48
#define IP_MULTICAST_ALL		49
#define IP_UNICAST_IF			50
#define IP_LOCAL_PORT_RANGE		51

#define MCAST_EXCLUDE	0
#define MCAST_INCLUDE	1
+23 −2
Original line number Diff line number Diff line
@@ -117,7 +117,7 @@ bool inet_rcv_saddr_any(const struct sock *sk)
	return !sk->sk_rcv_saddr;
}

void inet_get_local_port_range(struct net *net, int *low, int *high)
void inet_get_local_port_range(const struct net *net, int *low, int *high)
{
	unsigned int seq;

@@ -130,6 +130,27 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);

/*
 * inet_sk_get_local_port_range - local port range to use for a socket
 * @sk:   socket whose per-socket range (inet->local_port_range) is consulted
 * @low:  out: lower bound of the resulting range
 * @high: out: upper bound of the resulting range
 *
 * Starts from the range reported by inet_get_local_port_range() for the
 * socket's netns and narrows it with the per-socket bounds. A per-socket
 * bound is applied only when it lies inside the range computed so far;
 * otherwise the netns bound is kept, so the result never extends past the
 * netns range.
 */
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
	const struct inet_sock *inet = inet_sk(sk);
	int range_lo, range_hi;
	int want_lo, want_hi;

	inet_get_local_port_range(sock_net(sk), &range_lo, &range_hi);

	want_lo = inet->local_port_range.lo;
	want_hi = inet->local_port_range.hi;

	/* Raise the lower bound only if the request is within the netns range. */
	if (unlikely(want_lo >= range_lo && want_lo <= range_hi))
		range_lo = want_lo;

	/*
	 * The upper bound is checked against the (possibly just raised) lower
	 * bound, so it can never end up below it.
	 */
	if (unlikely(want_hi >= range_lo && want_hi <= range_hi))
		range_hi = want_hi;

	*low = range_lo;
	*high = range_hi;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);

static bool inet_use_bhash2_on_bind(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
@@ -316,7 +337,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
ports_exhausted:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	inet_sk_get_local_port_range(sk, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
+1 −1
Original line number Diff line number Diff line
@@ -1016,7 +1016,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_get_local_port_range(net, &low, &high);
	inet_sk_get_local_port_range(sk, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
Loading