Commit 1cd62c21 authored by Kuniyuki Iwashima's avatar Kuniyuki Iwashima Committed by Daniel Borkmann
Browse files

tcp: Add reuseport_migrate_sock() to select a new listener.



reuseport_migrate_sock() does the same check done in
reuseport_listen_stop_sock(). If the reuseport group is capable of
migration, reuseport_migrate_sock() selects a new listener by the child
socket hash and increments the listener's sk_refcnt beforehand. Thus, if we
fail in the migration, we have to decrement it later.

We will support migration by eBPF in later commits.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-5-kuniyu@amazon.co.jp
parent 333bb73f
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
					  u32 hash,
					  struct sk_buff *skb,
					  int hdr_len);
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb);
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
extern int reuseport_detach_prog(struct sock *sk);

+64 −14
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_select_sock() */
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
}
@@ -434,6 +434,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
	return reuse->socks[index];
}

/* Pick a non-ESTABLISHED socket from the group by hash: start at the
 * slot derived from @hash and scan forward with wrap-around.  Returns
 * NULL when every socket in the group is ESTABLISHED.
 */
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
						  u32 hash, u16 num_socks)
{
	int start, idx;

	start = reciprocal_scale(hash, num_socks);
	idx = start;
	do {
		if (reuse->socks[idx]->sk_state != TCP_ESTABLISHED)
			return reuse->socks[idx];
		if (++idx >= num_socks)
			idx = 0;
	} while (idx != start);

	/* wrapped all the way around: no eligible socket */
	return NULL;
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
@@ -477,19 +494,8 @@ struct sock *reuseport_select_sock(struct sock *sk,

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2) {
			int i, j;

			i = j = reciprocal_scale(hash, socks);
			while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
				i++;
				if (i >= socks)
					i = 0;
				if (i == j)
					goto out;
			}
			sk2 = reuse->socks[i];
		}
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
@@ -498,6 +504,50 @@ struct sock *reuseport_select_sock(struct sock *sk,
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	/* The group may already have been detached from @sk; nothing to
	 * migrate to in that case.
	 */
	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto out;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	/* Select by the child socket's hash so migration is keyed to the
	 * migrating connection, and only when the sysctl enables request
	 * migration for this netns.
	 */
	hash = migrating_sk->sk_hash;
	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	/* The chosen listener may be concurrently going away; only hand it
	 * back if we can still take a reference (sk_refcnt not yet zero).
	 */
	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
		nsk = NULL;

out:
	rcu_read_unlock();
	return nsk;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;