Commit 569dce3f authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'inet-data-races'



Eric Dumazet says:

====================
inet: socket lock and data-races avoidance

In this series, I converted 20 bits in "struct inet_sock" and made
them truly atomic.

This allows to implement many IP_ socket options in a lockless
fashion (no need to acquire socket lock), and fixes data-races
that were showing up in various KCSAN reports.

I also took care of IP_TTL/IP_MINTTL, but left few other options
for another series.

v4: Rebased after recent mptcp changes.
  Added Reviewed-by: tags from Simon (thanks !)

v3: fixed patch 7, feedback from build bot about ipvs set_mcast_loop()

v2: addressed a feedback from a build bot in patch 9 by removing
 unused issk variable in mptcp_setsockopt_sol_ip_set_transparent()
 Added Acked-by: tags from Soheil (thanks !)
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 936db833 12af7326
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -342,9 +342,9 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}

static inline bool inet_csk_has_ulp(struct sock *sk)
static inline bool inet_csk_has_ulp(const struct sock *sk)
{
	return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops;
	return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops;
}

#endif /* _INET_CONNECTION_SOCK_H */
+62 −30
Original line number Diff line number Diff line
@@ -194,13 +194,13 @@ struct rtable;
 * @inet_rcv_saddr - Bound local IPv4 addr
 * @inet_dport - Destination port
 * @inet_num - Local port
 * @inet_flags - various atomic flags
 * @inet_saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @inet_sport - Source port
 * @inet_id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @uc_index - Unicast outgoing device index
 * @mc_index - Multicast device index
 * @mc_list - Group array
@@ -218,57 +218,88 @@ struct inet_sock {
#define inet_dport		sk.__sk_common.skc_dport
#define inet_num		sk.__sk_common.skc_num

	unsigned long		inet_flags;
	__be32			inet_saddr;
	__s16			uc_ttl;
	__u16			cmsg_flags;
	struct ip_options_rcu __rcu	*inet_opt;
	__be16			inet_sport;
	struct ip_options_rcu __rcu	*inet_opt;
	__u16			inet_id;

	__u8			tos;
	__u8			min_ttl;
	__u8			mc_ttl;
	__u8			pmtudisc;
	__u8			recverr:1,
				is_icsk:1,
				freebind:1,
				hdrincl:1,
				mc_loop:1,
				transparent:1,
				mc_all:1,
				nodefrag:1;
	__u8			bind_address_no_port:1,
				recverr_rfc4884:1,
				defer_connect:1; /* Indicates that fastopen_connect is set
						  * and cookie exists so we defer connect
						  * until first data frame is written
						  */
	__u8			rcv_tos;
	__u8			convert_csum;
	int			uc_index;
	int			mc_index;
	__be32			mc_addr;
	struct ip_mc_socklist __rcu	*mc_list;
	struct inet_cork_full	cork;
	struct {
		__u16 lo;
		__u16 hi;
	}			local_port_range;

	struct ip_mc_socklist __rcu	*mc_list;
	struct inet_cork_full	cork;
};

#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */

enum {
	INET_FLAGS_PKTINFO	= 0,
	INET_FLAGS_TTL		= 1,
	INET_FLAGS_TOS		= 2,
	INET_FLAGS_RECVOPTS	= 3,
	INET_FLAGS_RETOPTS	= 4,
	INET_FLAGS_PASSSEC	= 5,
	INET_FLAGS_ORIGDSTADDR	= 6,
	INET_FLAGS_CHECKSUM	= 7,
	INET_FLAGS_RECVFRAGSIZE	= 8,

	INET_FLAGS_RECVERR	= 9,
	INET_FLAGS_RECVERR_RFC4884 = 10,
	INET_FLAGS_FREEBIND	= 11,
	INET_FLAGS_HDRINCL	= 12,
	INET_FLAGS_MC_LOOP	= 13,
	INET_FLAGS_MC_ALL	= 14,
	INET_FLAGS_TRANSPARENT	= 15,
	INET_FLAGS_IS_ICSK	= 16,
	INET_FLAGS_NODEFRAG	= 17,
	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
	INET_FLAGS_DEFER_CONNECT = 19,
};

/* cmsg flags for inet */
#define IP_CMSG_PKTINFO		BIT(0)
#define IP_CMSG_TTL		BIT(1)
#define IP_CMSG_TOS		BIT(2)
#define IP_CMSG_RECVOPTS	BIT(3)
#define IP_CMSG_RETOPTS		BIT(4)
#define IP_CMSG_PASSSEC		BIT(5)
#define IP_CMSG_ORIGDSTADDR	BIT(6)
#define IP_CMSG_CHECKSUM	BIT(7)
#define IP_CMSG_RECVFRAGSIZE	BIT(8)
#define IP_CMSG_PKTINFO		BIT(INET_FLAGS_PKTINFO)
#define IP_CMSG_TTL		BIT(INET_FLAGS_TTL)
#define IP_CMSG_TOS		BIT(INET_FLAGS_TOS)
#define IP_CMSG_RECVOPTS	BIT(INET_FLAGS_RECVOPTS)
#define IP_CMSG_RETOPTS		BIT(INET_FLAGS_RETOPTS)
#define IP_CMSG_PASSSEC		BIT(INET_FLAGS_PASSSEC)
#define IP_CMSG_ORIGDSTADDR	BIT(INET_FLAGS_ORIGDSTADDR)
#define IP_CMSG_CHECKSUM	BIT(INET_FLAGS_CHECKSUM)
#define IP_CMSG_RECVFRAGSIZE	BIT(INET_FLAGS_RECVFRAGSIZE)

#define IP_CMSG_ALL	(IP_CMSG_PKTINFO | IP_CMSG_TTL |		\
			 IP_CMSG_TOS | IP_CMSG_RECVOPTS |		\
			 IP_CMSG_RETOPTS | IP_CMSG_PASSSEC |		\
			 IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM |	\
			 IP_CMSG_RECVFRAGSIZE)

static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
{
	return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL;
}

#define inet_test_bit(nr, sk)			\
	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_set_bit(nr, sk)			\
	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_clear_bit(nr, sk)			\
	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_assign_bit(nr, sk, val)		\
	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)

static inline bool sk_is_inet(struct sock *sk)
{
@@ -363,7 +394,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
	__u8 flags = 0;

	if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
	if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk))
		flags |= FLOWI_FLAG_ANYSRC;
	return flags;
}
@@ -389,7 +420,8 @@ static inline bool inet_can_nonlocal_bind(struct net *net,
					  struct inet_sock *inet)
{
	return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
		inet->freebind || inet->transparent;
		test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
		test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

static inline bool inet_addr_valid_or_nonlocal(struct net *net,
+2 −1
Original line number Diff line number Diff line
@@ -937,7 +937,8 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net,
					  struct inet_sock *inet)
{
	return net->ipv6.sysctl.ip_nonlocal_bind ||
		inet->freebind || inet->transparent;
		test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
		test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

/* Sysctl settings for net ipv6.auto_flowlabels */
+1 −1
Original line number Diff line number Diff line
@@ -298,7 +298,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
{
	__u8 flow_flags = 0;

	if (inet_sk(sk)->transparent)
	if (inet_test_bit(TRANSPARENT, sk))
		flow_flags |= FLOWI_FLAG_ANYSRC;

	flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
+1 −1
Original line number Diff line number Diff line
@@ -2031,7 +2031,7 @@ static inline bool inet_sk_transparent(const struct sock *sk)
	case TCP_NEW_SYN_RECV:
		return inet_rsk(inet_reqsk(sk))->no_srccheck;
	}
	return inet_sk(sk)->transparent;
	return inet_test_bit(TRANSPARENT, sk);
}

/* Determines whether this is a thin stream (which may suffer from
Loading