Commit d5e4ddae authored by Kuniyuki Iwashima's avatar Kuniyuki Iwashima Committed by Daniel Borkmann
Browse files

bpf: Support socket migration by eBPF.



This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT
to check if the attached eBPF program is capable of migrating sockets. When
the eBPF program is attached, we run it for socket migration if the
expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or
net.ipv4.tcp_migrate_req is enabled.

Currently, the expected_attach_type is not enforced for the
BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the
earlier idea in the commit aac3fc32 ("bpf: Post-hooks for sys_bind") to
fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type().

Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to
select a new listener based on the child socket. The value of migrating_sk
varies depending on whether it is migrating a request in the accept queue
or during the 3WHS.

  - accept_queue : sock (ESTABLISHED/SYN_RECV)
  - 3WHS         : request_sock (NEW_SYN_RECV)

In the eBPF program, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

  - SK_PASS with selected_sk, select it as a new listener
  - SK_PASS with selected_sk NULL, falls back to the random selection
  - SK_DROP, cancel the migration.

There is a noteworthy point. We select a listening socket in three places,
but we do not have a struct skb when closing a listener or retransmitting a
SYN+ACK. On the other hand, some helper functions do not expect skb to be
NULL (e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(),
skb_tail_pointer() in BPF_FUNC_skb_load_bytes_relative()). So we allocate
an empty skb temporarily before running the eBPF program.

Suggested-by: default avatarMartin KaFai Lau <kafai@fb.com>
Signed-off-by: default avatarKuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarMartin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp
parent e0610476
Loading
Loading
Loading
Loading
+1 −0
Original line number Original line Diff line number Diff line
@@ -2048,6 +2048,7 @@ struct sk_reuseport_kern {
	struct sk_buff *skb;
	struct sk_buff *skb;
	struct sock *sk;
	struct sock *sk;
	struct sock *selected_sk;
	struct sock *selected_sk;
	struct sock *migrating_sk;
	void *data_end;
	void *data_end;
	u32 hash;
	u32 hash;
	u32 reuseport_id;
	u32 reuseport_id;
+2 −0
Original line number Original line Diff line number Diff line
@@ -996,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
#ifdef CONFIG_INET
#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
				  struct bpf_prog *prog, struct sk_buff *skb,
				  struct bpf_prog *prog, struct sk_buff *skb,
				  struct sock *migrating_sk,
				  u32 hash);
				  u32 hash);
#else
#else
static inline struct sock *
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
		     struct bpf_prog *prog, struct sk_buff *skb,
		     struct bpf_prog *prog, struct sk_buff *skb,
		     struct sock *migrating_sk,
		     u32 hash)
		     u32 hash)
{
{
	return NULL;
	return NULL;
+15 −0
Original line number Original line Diff line number Diff line
@@ -994,6 +994,8 @@ enum bpf_attach_type {
	BPF_SK_LOOKUP,
	BPF_SK_LOOKUP,
	BPF_XDP,
	BPF_XDP,
	BPF_SK_SKB_VERDICT,
	BPF_SK_SKB_VERDICT,
	BPF_SK_REUSEPORT_SELECT,
	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
	__MAX_BPF_ATTACH_TYPE
	__MAX_BPF_ATTACH_TYPE
};
};


@@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
	__u32 bind_inany;	/* Is sock bound to an INANY address? */
	__u32 bind_inany;	/* Is sock bound to an INANY address? */
	__u32 hash;		/* A hash of the packet 4 tuples */
	__u32 hash;		/* A hash of the packet 4 tuples */
	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
	 * new incoming connection request (e.g. selecting a listen sk for
	 * the received SYN in the TCP case).  reuse->sk is one of the sk
	 * in the reuseport group. The bpf prog can use reuse->sk to learn
	 * the local listening ip/port without looking into the skb.
	 *
	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
	 * reuse->migrating_sk is the socket that needs to be migrated
	 * to another listening socket.  migrating_sk could be a fullsock
	 * sk that is fully established or a reqsk that is in-the-middle
	 * of 3-way handshake.
	 */
	__bpf_md_ptr(struct bpf_sock *, sk);
	__bpf_md_ptr(struct bpf_sock *, sk);
	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
};
};


#define BPF_TAG_SIZE	8
#define BPF_TAG_SIZE	8
+13 −0
Original line number Original line Diff line number Diff line
@@ -1972,6 +1972,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
			attr->expected_attach_type =
			attr->expected_attach_type =
				BPF_CGROUP_INET_SOCK_CREATE;
				BPF_CGROUP_INET_SOCK_CREATE;
		break;
		break;
	case BPF_PROG_TYPE_SK_REUSEPORT:
		if (!attr->expected_attach_type)
			attr->expected_attach_type =
				BPF_SK_REUSEPORT_SELECT;
		break;
	}
	}
}
}


@@ -2055,6 +2060,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
		if (expected_attach_type == BPF_SK_LOOKUP)
		if (expected_attach_type == BPF_SK_LOOKUP)
			return 0;
			return 0;
		return -EINVAL;
		return -EINVAL;
	case BPF_PROG_TYPE_SK_REUSEPORT:
		switch (expected_attach_type) {
		case BPF_SK_REUSEPORT_SELECT:
		case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
			return 0;
		default:
			return -EINVAL;
		}
	case BPF_PROG_TYPE_SYSCALL:
	case BPF_PROG_TYPE_SYSCALL:
	case BPF_PROG_TYPE_EXT:
	case BPF_PROG_TYPE_EXT:
		if (expected_attach_type)
		if (expected_attach_type)
+12 −1
Original line number Original line Diff line number Diff line
@@ -10044,11 +10044,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
				    struct sock_reuseport *reuse,
				    struct sock_reuseport *reuse,
				    struct sock *sk, struct sk_buff *skb,
				    struct sock *sk, struct sk_buff *skb,
				    struct sock *migrating_sk,
				    u32 hash)
				    u32 hash)
{
{
	reuse_kern->skb = skb;
	reuse_kern->skb = skb;
	reuse_kern->sk = sk;
	reuse_kern->sk = sk;
	reuse_kern->selected_sk = NULL;
	reuse_kern->selected_sk = NULL;
	reuse_kern->migrating_sk = migrating_sk;
	reuse_kern->data_end = skb->data + skb_headlen(skb);
	reuse_kern->data_end = skb->data + skb_headlen(skb);
	reuse_kern->hash = hash;
	reuse_kern->hash = hash;
	reuse_kern->reuseport_id = reuse->reuseport_id;
	reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10057,12 +10059,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,


struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
				  struct bpf_prog *prog, struct sk_buff *skb,
				  struct bpf_prog *prog, struct sk_buff *skb,
				  struct sock *migrating_sk,
				  u32 hash)
				  u32 hash)
{
{
	struct sk_reuseport_kern reuse_kern;
	struct sk_reuseport_kern reuse_kern;
	enum sk_action action;
	enum sk_action action;


	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
	action = BPF_PROG_RUN(prog, &reuse_kern);
	action = BPF_PROG_RUN(prog, &reuse_kern);


	if (action == SK_PASS)
	if (action == SK_PASS)
@@ -10207,6 +10210,10 @@ sk_reuseport_is_valid_access(int off, int size,
		info->reg_type = PTR_TO_SOCKET;
		info->reg_type = PTR_TO_SOCKET;
		return size == sizeof(__u64);
		return size == sizeof(__u64);


	case offsetof(struct sk_reuseport_md, migrating_sk):
		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
		return size == sizeof(__u64);

	/* Fields that allow narrowing */
	/* Fields that allow narrowing */
	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
		if (size < sizeof_field(struct sk_buff, protocol))
		if (size < sizeof_field(struct sk_buff, protocol))
@@ -10283,6 +10290,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
	case offsetof(struct sk_reuseport_md, sk):
	case offsetof(struct sk_reuseport_md, sk):
		SK_REUSEPORT_LOAD_FIELD(sk);
		SK_REUSEPORT_LOAD_FIELD(sk);
		break;
		break;

	case offsetof(struct sk_reuseport_md, migrating_sk):
		SK_REUSEPORT_LOAD_FIELD(migrating_sk);
		break;
	}
	}


	return insn - insn_buf;
	return insn - insn_buf;
Loading