Commit b67fd3d9 authored by Jakub Kicinski

Merge branch 'net-inet-retire-port-only-listening_hash'

Martin KaFai Lau says:

====================
net: inet: Retire port only listening_hash

This series retires the port-only listening_hash.

The listen sk is currently stored in two hash tables,
listening_hash (hashed by port) and lhash2 (hashed by port and address).
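
For illustration, the difference between the two keys can be sketched
as below.  This is a simplified sketch modeled on the kernel's
inet_lhashfn() and ipv4_portaddr_hash() helpers (the function names
here are illustrative), not the exact code:

	/* listening_hash: keyed by local port only, so every socket
	 * listening on e.g. port 443 lands in the same bucket.
	 */
	static u32 port_only_hash(const struct net *net, u16 port)
	{
		return (port + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
	}

	/* lhash2: keyed by local port and local address, so listeners
	 * bound to different addresses spread across buckets.
	 */
	static u32 portaddr_hash(const struct net *net, __be32 addr, u16 port)
	{
		return jhash_1word((__force u32)addr, net_hash_mix(net)) ^ port;
	}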

After commit 0ee58dad ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6 ("net: tcp: prefer listeners bound to an address"),
the TCP-SYN lookup fast path does not use listening_hash.

Commit 05c0b357 ("tcp: seq_file: Replace listening_hash with lhash2")
also moved the seq_file (/proc/net/tcp) iteration from
listening_hash to lhash2.

There are still a few listening_hash usages left.
One of them is inet_reuseport_add_sock(), which uses the listening_hash
to search for an existing listen sk during the listen() system call.  This
turns out to be very slow in use cases that listen on many different
VIPs at a popular port (e.g. 443), on top of the slowness of adding
to the tail of the bucket in the IPv6 case.  A later patch has a
selftest to demonstrate this case.
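
The scan in question looks roughly like the sketch below, condensed
from inet_reuseport_add_sock(); matches_reuseport_group() is a
hypothetical stand-in for the real per-socket checks.  With N
listeners sharing the port in a single port-only bucket, each
listen() walks all of them, so bringing up N VIPs is O(N^2) overall;
hashing by port and address keeps each bucket down to the sockets
bound to the same address:

	/* Walk every listener hashed to this bucket.  With a port-only
	 * hash, that is every listener on the port.
	 */
	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (matches_reuseport_group(sk, sk2)) /* hypothetical */
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}
	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));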

This series takes this chance to move all remaining listening_hash
usages to lhash2 and then retire listening_hash.
====================

Link: https://lore.kernel.org/r/20220512000546.188616-1-kafai@fb.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 0c1822d9 ec8cb4f6
include/net/inet_connection_sock.h  +0 −2
@@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops {
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
  * @icsk_clean_acked	   Clean acked data hook
- * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -96,7 +95,6 @@ struct inet_connection_sock {
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void __rcu		  *icsk_ulp_data;
 	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
-	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:5,
 				  icsk_ca_initialized:1,
include/net/inet_hashtables.h  +1 −41
@@ -111,12 +111,8 @@ struct inet_bind_hashbucket {
 #define LISTENING_NULLS_BASE (1U << 29)
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
-	unsigned int		count;
-	union {
-		struct hlist_head	head;
-		struct hlist_nulls_head	nulls_head;
-	};
+	struct hlist_nulls_head	nulls_head;
 };
 
 /* This is for listening sockets, thus all sockets which possess wildcards. */
 #define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
@@ -143,32 +139,8 @@ struct inet_hashinfo {
 	/* The 2nd listener table hashed by local port and address */
 	unsigned int			lhash2_mask;
 	struct inet_listen_hashbucket	*lhash2;
-
-	/* All the above members are written once at bootup and
-	 * never written again _or_ are predominantly read-access.
-	 *
-	 * Now align to a new cache line as all the following members
-	 * might be often dirty.
-	 */
-	/* All sockets in TCP_LISTEN state will be in listening_hash.
-	 * This is the only table where wildcard'd TCP sockets can
-	 * exist.  listening_hash is only hashed by local port number.
-	 * If lhash2 is initialized, the same socket will also be hashed
-	 * to lhash2 by port and address.
-	 */
-	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
-					____cacheline_aligned_in_smp;
 };
-
-#define inet_lhash2_for_each_icsk_continue(__icsk) \
-	hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
-
-#define inet_lhash2_for_each_icsk(__icsk, list) \
-	hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
-
-#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
-	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
 
 static inline struct inet_listen_hashbucket *
 inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
 {
@@ -230,23 +202,11 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    const unsigned short snum);
 
-/* These can have wildcards, don't try too hard. */
-static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
-{
-	return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
-}
-
-static inline int inet_sk_listen_hashfn(const struct sock *sk)
-{
-	return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
-}
-
 /* Caller must disable local BH processing. */
 int __inet_inherit_port(const struct sock *sk, struct sock *child);
 
 void inet_put_port(struct sock *sk);
 
-void inet_hashinfo_init(struct inet_hashinfo *h);
 void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
 			 unsigned long numentries, int scale,
 			 unsigned long low_limit,
net/dccp/proto.c  +0 −1
@@ -1110,7 +1110,6 @@ static int __init dccp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
 		     sizeof_field(struct sk_buff, cb));
-	inet_hashinfo_init(&dccp_hashinfo);
 	rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
 	if (rc)
 		goto out_fail;
net/ipv4/inet_diag.c  +3 −2
@@ -1028,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 		if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
 			goto skip_listen_ht;
 
-		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+		for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
 			struct inet_listen_hashbucket *ilb;
 			struct hlist_nulls_node *node;
 
 			num = 0;
-			ilb = &hashinfo->listening_hash[i];
+			ilb = &hashinfo->lhash2[i];
+
 			spin_lock(&ilb->lock);
 			sk_nulls_for_each(sk, node, &ilb->nulls_head) {
 				struct inet_sock *inet = inet_sk(sk);
net/ipv4/inet_hashtables.c  +31 −88
@@ -193,42 +193,6 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
 	return inet_lhash2_bucket(h, hash);
 }
 
-static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
-{
-	struct inet_listen_hashbucket *ilb2;
-
-	if (!h->lhash2)
-		return;
-
-	ilb2 = inet_lhash2_bucket_sk(h, sk);
-
-	spin_lock(&ilb2->lock);
-	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
-		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
-				   &ilb2->head);
-	else
-		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
-				   &ilb2->head);
-	ilb2->count++;
-	spin_unlock(&ilb2->lock);
-}
-
-static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
-{
-	struct inet_listen_hashbucket *ilb2;
-
-	if (!h->lhash2 ||
-	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
-		return;
-
-	ilb2 = inet_lhash2_bucket_sk(h, sk);
-
-	spin_lock(&ilb2->lock);
-	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
-	ilb2->count--;
-	spin_unlock(&ilb2->lock);
-}
-
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum, const __be32 daddr,
 				const int dif, const int sdif)
@@ -282,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net,
 				const __be32 daddr, const unsigned short hnum,
 				const int dif, const int sdif)
 {
-	struct inet_connection_sock *icsk;
 	struct sock *sk, *result = NULL;
+	struct hlist_nulls_node *node;
 	int score, hiscore = 0;
 
-	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
-		sk = (struct sock *)icsk;
+	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
 		score = compute_score(sk, net, hnum, daddr, dif, sdif);
 		if (score > hiscore) {
 			result = lookup_reuseport(net, sk, skb, doff,
@@ -633,7 +596,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct inet_listen_hashbucket *ilb;
+	struct inet_listen_hashbucket *ilb2;
 	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
@@ -643,25 +606,23 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		return 0;
 	}
 	WARN_ON(!sk_unhashed(sk));
-	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
 
-	spin_lock(&ilb->lock);
+	spin_lock(&ilb2->lock);
 	if (sk->sk_reuseport) {
-		err = inet_reuseport_add_sock(sk, ilb);
+		err = inet_reuseport_add_sock(sk, ilb2);
 		if (err)
 			goto unlock;
 	}
 	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
 		sk->sk_family == AF_INET6)
-		__sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
+		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
-		__sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
-	inet_hash2(hashinfo, sk);
-	ilb->count++;
+		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
-	spin_unlock(&ilb->lock);
+	spin_unlock(&ilb2->lock);
 
 	return err;
 }
@@ -678,23 +639,6 @@ int inet_hash(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
-static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
-{
-	if (sk_unhashed(sk))
-		return;
-
-	if (rcu_access_pointer(sk->sk_reuseport_cb))
-		reuseport_stop_listen_sock(sk);
-	if (ilb) {
-		struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
-		inet_unhash2(hashinfo, sk);
-		ilb->count--;
-	}
-	__sk_nulls_del_node_init_rcu(sk);
-	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-}
-
 void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
@@ -703,20 +647,34 @@ void inet_unhash(struct sock *sk)
 		return;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		struct inet_listen_hashbucket *ilb;
+		struct inet_listen_hashbucket *ilb2;
 
-		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
 		/* Don't disable bottom halves while acquiring the lock to
 		 * avoid circular locking dependency on PREEMPT_RT.
 		 */
-		spin_lock(&ilb->lock);
-		__inet_unhash(sk, ilb);
-		spin_unlock(&ilb->lock);
+		spin_lock(&ilb2->lock);
+		if (sk_unhashed(sk)) {
+			spin_unlock(&ilb2->lock);
+			return;
+		}
+
+		if (rcu_access_pointer(sk->sk_reuseport_cb))
+			reuseport_stop_listen_sock(sk);
+
+		__sk_nulls_del_node_init_rcu(sk);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		spin_unlock(&ilb2->lock);
 	} else {
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 		spin_lock_bh(lock);
-		__inet_unhash(sk, NULL);
+		if (sk_unhashed(sk)) {
+			spin_unlock_bh(lock);
+			return;
+		}
+		__sk_nulls_del_node_init_rcu(sk);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 		spin_unlock_bh(lock);
 	}
 }
@@ -874,29 +832,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 }
 EXPORT_SYMBOL_GPL(inet_hash_connect);
 
-void inet_hashinfo_init(struct inet_hashinfo *h)
-{
-	int i;
-
-	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
-		spin_lock_init(&h->listening_hash[i].lock);
-		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
-				      i + LISTENING_NULLS_BASE);
-		h->listening_hash[i].count = 0;
-	}
-
-	h->lhash2 = NULL;
-}
-EXPORT_SYMBOL_GPL(inet_hashinfo_init);
-
 static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
 	int i;
 
 	for (i = 0; i <= h->lhash2_mask; i++) {
 		spin_lock_init(&h->lhash2[i].lock);
-		INIT_HLIST_HEAD(&h->lhash2[i].head);
-		h->lhash2[i].count = 0;
+		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
+				      i + LISTENING_NULLS_BASE);
 	}
 }
