Commit e9ddbb77 authored by Jakub Sitnicki's avatar Jakub Sitnicki Committed by Alexei Starovoitov
Browse files

bpf: Introduce SK_LOOKUP program type with a dedicated attach point



Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type
BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer
when looking up a listening socket for a new connection request for
connection oriented protocols, or when looking up an unconnected socket for
a packet for connection-less protocols.

When called, SK_LOOKUP BPF program can select a socket that will receive
the packet. This serves as a mechanism to overcome the limits of what
bind() API allows to express. Two use-cases driving this work are:

 (1) steer packets destined to an IP range, on fixed port to a socket

     192.0.2.0/24, port 80 -> NGINX socket

 (2) steer packets destined to an IP address, on any port to a socket

     198.51.100.1, any port -> L7 proxy socket

In its run-time context program receives information about the packet that
triggered the socket lookup. Namely IP version, L4 protocol identifier, and
address 4-tuple. Context can be further extended to include ingress
interface identifier.

To select a socket BPF program fetches it from a map holding socket
references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...)
helper to record the selection. Transport layer then uses the selected
socket as a result of socket lookup.

In its basic form, SK_LOOKUP acts as a filter and hence must return either
SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should
look for a socket to receive the packet, or use the one selected by the
program if available, while SK_DROP informs the transport layer that the
lookup should fail.

This patch only enables the user to attach an SK_LOOKUP program to a
network namespace. Subsequent patches hook it up to run on local delivery
path in ipv4 and ipv6 stacks.

Suggested-by: default avatarMarek Majkowski <marek@cloudflare.com>
Signed-off-by: default avatarJakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
parent ce3aa9cc
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
enum netns_bpf_attach_type {
	NETNS_BPF_INVALID = -1,
	NETNS_BPF_FLOW_DISSECTOR = 0,
	NETNS_BPF_SK_LOOKUP,
	MAX_NETNS_BPF_ATTACH_TYPE
};

@@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
	switch (attach_type) {
	case BPF_FLOW_DISSECTOR:
		return NETNS_BPF_FLOW_DISSECTOR;
	case BPF_SK_LOOKUP:
		return NETNS_BPF_SK_LOOKUP;
	default:
		return NETNS_BPF_INVALID;
	}
+1 −0
Original line number Diff line number Diff line
@@ -249,6 +249,7 @@ enum bpf_arg_type {
	ARG_PTR_TO_INT,		/* pointer to int */
	ARG_PTR_TO_LONG,	/* pointer to long */
	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
	ARG_PTR_TO_SOCKET_OR_NULL,	/* pointer to bpf_sock (fullsock) or NULL */
	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
	ARG_PTR_TO_ALLOC_MEM,	/* pointer to dynamically allocated memory */
	ARG_PTR_TO_ALLOC_MEM_OR_NULL,	/* pointer to dynamically allocated memory or NULL */
+2 −0
Original line number Diff line number Diff line
@@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
	      struct sk_reuseport_md, struct sk_reuseport_kern)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
	      struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
#endif
#if defined(CONFIG_BPF_JIT)
BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
+17 −0
Original line number Diff line number Diff line
@@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern {
	s32		retval;
};

struct bpf_sk_lookup_kern {
	u16		family;
	u16		protocol;
	struct {
		__be32 saddr;
		__be32 daddr;
	} v4;
	struct {
		const struct in6_addr *saddr;
		const struct in6_addr *daddr;
	} v6;
	__be16		sport;
	u16		dport;
	struct sock	*selected_sk;
	bool		no_reuseport;
};

#endif /* __LINUX_FILTER_H__ */
+77 −0
Original line number Diff line number Diff line
@@ -189,6 +189,7 @@ enum bpf_prog_type {
	BPF_PROG_TYPE_STRUCT_OPS,
	BPF_PROG_TYPE_EXT,
	BPF_PROG_TYPE_LSM,
	BPF_PROG_TYPE_SK_LOOKUP,
};

enum bpf_attach_type {
@@ -228,6 +229,7 @@ enum bpf_attach_type {
	BPF_XDP_DEVMAP,
	BPF_CGROUP_INET_SOCK_RELEASE,
	BPF_XDP_CPUMAP,
	BPF_SK_LOOKUP,
	__MAX_BPF_ATTACH_TYPE
};

@@ -3069,6 +3071,10 @@ union bpf_attr {
 *
 * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
 *	Description
 *		Helper is overloaded depending on BPF program type. This
 *		description applies to **BPF_PROG_TYPE_SCHED_CLS** and
 *		**BPF_PROG_TYPE_SCHED_ACT** programs.
 *
 *		Assign the *sk* to the *skb*. When combined with appropriate
 *		routing configuration to receive the packet towards the socket,
 *		will cause *skb* to be delivered to the specified socket.
@@ -3094,6 +3100,56 @@ union bpf_attr {
 *		**-ESOCKTNOSUPPORT** if the socket type is not supported
 *		(reuseport).
 *
 * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
 *	Description
 *		Helper is overloaded depending on BPF program type. This
 *		description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
 *
 *		Select the *sk* as a result of a socket lookup.
 *
 *		For the operation to succeed passed socket must be compatible
 *		with the packet description provided by the *ctx* object.
 *
 *		L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
 *		be an exact match. While IP family (**AF_INET** or
 *		**AF_INET6**) must be compatible, that is IPv6 sockets
 *		that are not v6-only can be selected for IPv4 packets.
 *
 *		Only TCP listeners and UDP unconnected sockets can be
 *		selected. *sk* can also be NULL to reset any previous
 *		selection.
 *
 *		*flags* argument can combination of following values:
 *
 *		* **BPF_SK_LOOKUP_F_REPLACE** to override the previous
 *		  socket selection, potentially done by a BPF program
 *		  that ran before us.
 *
 *		* **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
 *		  load-balancing within reuseport group for the socket
 *		  being selected.
 *
 *		On success *ctx->sk* will point to the selected socket.
 *
 *	Return
 *		0 on success, or a negative errno in case of failure.
 *
 *		* **-EAFNOSUPPORT** if socket family (*sk->family*) is
 *		  not compatible with packet family (*ctx->family*).
 *
 *		* **-EEXIST** if socket has been already selected,
 *		  potentially by another program, and
 *		  **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
 *
 *		* **-EINVAL** if unsupported flags were specified.
 *
 *		* **-EPROTOTYPE** if socket L4 protocol
 *		  (*sk->protocol*) doesn't match packet protocol
 *		  (*ctx->protocol*).
 *
 *		* **-ESOCKTNOSUPPORT** if socket is not in allowed
 *		  state (TCP listening or UDP unconnected).
 *
 * u64 bpf_ktime_get_boot_ns(void)
 * 	Description
 * 		Return the time elapsed since system boot, in nanoseconds.
@@ -3607,6 +3663,12 @@ enum {
	BPF_RINGBUF_HDR_SZ		= 8,
};

/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
enum {
	BPF_SK_LOOKUP_F_REPLACE		= (1ULL << 0),
	BPF_SK_LOOKUP_F_NO_REUSEPORT	= (1ULL << 1),
};

/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
	BPF_ADJ_ROOM_NET,
@@ -4349,4 +4411,19 @@ struct bpf_pidns_info {
	__u32 pid;
	__u32 tgid;
};

/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
struct bpf_sk_lookup {
	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */

	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
	__u32 remote_ip4;	/* Network byte order */
	__u32 remote_ip6[4];	/* Network byte order */
	__u32 remote_port;	/* Network byte order */
	__u32 local_ip4;	/* Network byte order */
	__u32 local_ip6[4];	/* Network byte order */
	__u32 local_port;	/* Host byte order */
};

#endif /* _UAPI__LINUX_BPF_H__ */
Loading