Commit aa7f1f03 authored by Daniel Borkmann
Browse files

Merge branch 'bpf-xdp-bcast'

Hangbin Liu says:

====================
This patchset is a new implementation of XDP multicast support, based
on my previous two-map implementation [1]. The reason is that Daniel thinks
the exclude map implementation is missing proper bond support in XDP
context, and there is a plan to add native XDP bonding support. Adding
an exclude map in the helper also increases the complexity of the verifier
and has performance drawbacks.

The new implementation just adds two new flags, BPF_F_BROADCAST and
BPF_F_EXCLUDE_INGRESS, to extend xdp_redirect_map for broadcast support.

With BPF_F_BROADCAST the packet will be broadcast to all the interfaces
in the map. With BPF_F_EXCLUDE_INGRESS the ingress interface will be
excluded when broadcasting.

The patchv11 link is here [2].

  [1] https://lore.kernel.org/bpf/20210223125809.1376577-1-liuhangbin@gmail.com
  [2] https://lore.kernel.org/bpf/20210513070447.1878448-1-liuhangbin@gmail.com



v12: As Daniel pointed out:
  a) define flag_mask and action_mask as const u64 in
     __bpf_xdp_redirect_map()
  b) remove BPF_F_ACTION_MASK in uapi header
  c) remove EXPORT_SYMBOL_GPL for xdpf_clone()

v11:
  a) Use unlikely() when checking if this is for broadcast redirecting.
  b) Fix a tracepoint NULL pointer issue Jesper found
  c) Remove BPF_F_REDIR_MASK and just OR the flags together, to make it
     clearer to the reader which flags we are using
  d) Add the performance numbers with multiple veth interfaces to the
     patch 01 description.
  e) Remove some sleeps to reduce the testing time in patch 04. Restructure
     the test and make clear which flags we are testing.

v10: use READ/WRITE_ONCE when reading/writing the map instead of xchg()
v9: Update patch 01 commit description
v8: use hlist_for_each_entry_rcu() when looping over the devmap hash objs
v7: No need to free xdpf in dev_map_enqueue_clone() if xdpf_clone failed.
v6: Fix a skb leak in the error path for generic XDP
v5: Just walk the map directly to get the interfaces, as get_next_key() of
    devmap hash may restart looping from the first key if a device gets
    removed. After this update the performance has improved 10% compared
    with v4.
v4: Fix flags never cleared issue in patch 02. Update selftest to cover this.
v3: Rebase the code based on latest bpf-next
v2: fix flag renaming issue in patch 02
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 21703cf7 d2329247
Loading
Loading
Loading
Loading
+20 −0
Original line number Diff line number Diff line
@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
		    struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
		    struct net_device *dev_rx);
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog);
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   struct bpf_prog *xdp_prog, struct bpf_map *map,
			   bool exclude_ingress);
bool dev_map_can_have_prog(struct bpf_map *map);

void __cpu_map_flush(void);
@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
	return 0;
}

/* No-op variant of dev_map_enqueue_multi(): every argument is ignored and
 * success (0) is reported unconditionally.
 * NOTE(review): presumably the fallback used when the real devmap
 * implementation is compiled out -- confirm against the enclosing #ifdef,
 * which is not visible from here.
 */
static inline int dev_map_enqueue_multi(struct xdp_buff *xdp,
					struct net_device *dev_rx,
					struct bpf_map *map,
					bool exclude_ingress)
{
	return 0;
}

struct sk_buff;

static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
	return 0;
}

/* No-op variant of dev_map_redirect_multi(): every argument is ignored and
 * success (0) is reported unconditionally.
 * NOTE(review): presumably the fallback used when the real devmap
 * implementation is compiled out -- confirm against the enclosing #ifdef,
 * which is not visible from here.
 */
static inline int dev_map_redirect_multi(struct net_device *dev,
					 struct sk_buff *skb,
					 struct bpf_prog *xdp_prog,
					 struct bpf_map *map,
					 bool exclude_ingress)
{
	return 0;
}

/* Empty variant of __cpu_map_flush(): takes no arguments and does nothing. */
static inline void __cpu_map_flush(void)
{
	/* Intentionally empty. */
}
+15 −4
Original line number Diff line number Diff line
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
	u32 flags;
	u32 tgt_index;
	void *tgt_value;
	struct bpf_map *map;
	u32 map_id;
	enum bpf_map_type map_type;
	u32 kern_flags;
@@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
						  u64 flags, const u64 flag_mask,
						  void *lookup_elem(struct bpf_map *map, u32 key))
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;

	/* Lower bits of the flags are used as return code on lookup failure */
	if (unlikely(flags > XDP_TX))
	if (unlikely(flags & ~(action_mask | flag_mask)))
		return XDP_ABORTED;

	ri->tgt_value = lookup_elem(map, ifindex);
	if (unlikely(!ri->tgt_value)) {
	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
		/* If the lookup fails we want to clear out the state in the
		 * redirect_info struct completely, so that if an eBPF program
		 * performs multiple lookups, the last one always takes
@@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
		 */
		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
		ri->map_type = BPF_MAP_TYPE_UNSPEC;
		return flags;
		return flags & action_mask;
	}

	ri->tgt_index = ifindex;
	ri->map_id = map->id;
	ri->map_type = map->map_type;

	if (flags & BPF_F_BROADCAST) {
		WRITE_ONCE(ri->map, map);
		ri->flags = flags;
	} else {
		WRITE_ONCE(ri->map, NULL);
		ri->flags = 0;
	}

	return XDP_REDIRECT;
}

+1 −0
Original line number Diff line number Diff line
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
					 struct net_device *dev);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);

static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
+5 −1
Original line number Diff line number Diff line
@@ -110,6 +110,10 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
		u32 ifindex = 0, map_index = index;

		if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
			/* Leave to_ifindex at 0 for a broadcast redirect,
			 * as tgt will be NULL.
			 */
			if (tgt)
				ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
		} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
			ifindex = index;
+12 −2
Original line number Diff line number Diff line
@@ -2555,8 +2555,12 @@ union bpf_attr {
 * 		The lower two bits of *flags* are used as the return code if
 * 		the map lookup fails. This is so that the return value can be
 * 		one of the XDP program return codes up to **XDP_TX**, as chosen
 * 		by the caller. Any higher bits in the *flags* argument must be
 * 		unset.
 * 		by the caller. The higher bits of *flags* can be set to
 * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
 *
 * 		With BPF_F_BROADCAST the packet will be broadcast to all the
 * 		interfaces in the map. With BPF_F_EXCLUDE_INGRESS the ingress
 * 		interface will be excluded when broadcasting.
 *
 * 		See also **bpf_redirect**\ (), which only supports redirecting
 * 		to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
};

/* Flags for bpf_redirect_map helper.
 *
 * These are the higher bits of the helper's *flags* argument; the lower
 * bits of *flags* carry the XDP return code used when the map lookup fails.
 */
enum {
	/* Broadcast the packet to all the interfaces in the map. */
	BPF_F_BROADCAST		= (1ULL << 3),
	/* Exclude the ingress interface when broadcasting. */
	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
};

#define __bpf_md_ptr(type, name)	\
union {					\
	type name;			\
Loading