Commit 1e385c08 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'vxlan-vnifiltering'



Roopa Prabhu says:

====================
vxlan metadata device vnifiltering support

This series adds vnifiltering support to vxlan collect metadata device.

Motivation:
You can only use a single vxlan collect metadata device for a given
vxlan udp port in the system today. The vxlan collect metadata device
terminates all received vxlan packets. As shown in the below diagram,
there are use-cases where you need to support multiple such vxlan devices in
independent bridge domains. Each vxlan device must terminate the vni's
it is configured for.
Example usecase: In a service provider network a service provider
typically supports multiple bridge domains with overlapping vlans.
One bridge domain per customer. Vlans in each bridge domain are
mapped to globally unique vxlan ranges assigned to each customer.

This series adds vnifiltering support to collect metadata devices to
terminate only configured vnis. This is similar to vlan filtering in
bridge driver. The vni filtering capability is provided by a new flag on
collect metadata device.

In the below pic:
	- customer1 is mapped to br1 bridge domain
	- customer2 is mapped to br2 bridge domain
	- customer1 vlan 10-11 is mapped to vni 1001-1002
	- customer2 vlan 10-11 is mapped to vni 2001-2002
	- br1 and br2 are vlan filtering bridges
	- vxlan1 and vxlan2 are collect metadata devices with
	  vnifiltering enabled

┌──────────────────────────────────────────────────────────────────┐
│  switch                                                          │
│                                                                  │
│         ┌───────────┐                 ┌───────────┐              │
│         │           │                 │           │              │
│         │   br1     │                 │   br2     │              │
│         └┬─────────┬┘                 └──┬───────┬┘              │
│     vlans│         │               vlans │       │               │
│     10,11│         │                10,11│       │               │
│          │     vlanvnimap:               │    vlanvnimap:        │
│          │       10-1001,11-1002         │      10-2001,11-2002  │
│          │         │                     │       │               │
│   ┌──────┴┐     ┌──┴─────────┐       ┌───┴────┐  │               │
│   │ swp1  │     │vxlan1      │       │ swp2   │ ┌┴─────────────┐ │
│   │       │     │  vnifilter:│       │        │ │vxlan2        │ │
│   └───┬───┘     │   1001,1002│       └───┬────┘ │ vnifilter:   │ │
│       │         └────────────┘           │      │  2001,2002   │ │
│       │                                  │      └──────────────┘ │
│       │                                  │                       │
└───────┼──────────────────────────────────┼───────────────────────┘
        │                                  │
        │                                  │
  ┌─────┴───────┐                          │
  │  customer1  │                    ┌─────┴──────┐
  │ host/VM     │                    │customer2   │
  └─────────────┘                    │ host/VM    │
                                     └────────────┘

v2:
  - remove stale xstats declarations pointed out by Nikolay Aleksandrov
  - squash selinux patch with the tunnel api patch as pointed out by
    benjamin poirier
  - Fix various build issues:
	Reported-by: default avatarkernel test robot <lkp@intel.com>

v3:
  - incorporate review feedback from Jakub
	- move rhashtable declarations to c file
	- define and use netlink policy for top level vxlan filter api
	- fix unused stats function warning
	- pass vninode from vnifilter lookup into stats count function
		to avoid another lookup (only applicable to vxlan_rcv)
	- fix missing vxlan vni delete notifications in vnifilter uninit
	  function
	- misc cleanups
  - remote dev check for multicast groups added via vnifiltering api
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents f2b77012 445b2f36
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ obj-$(CONFIG_TUN) += tun.o
obj-$(CONFIG_TAP) += tap.o
obj-$(CONFIG_VETH) += veth.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
obj-$(CONFIG_VXLAN) += vxlan.o
obj-$(CONFIG_VXLAN) += vxlan/
obj-$(CONFIG_GENEVE) += geneve.o
obj-$(CONFIG_BAREUDP) += bareudp.o
obj-$(CONFIG_GTP) += gtp.o
+7 −0
Original line number Diff line number Diff line
#
# Makefile for the vxlan driver
#

obj-$(CONFIG_VXLAN) += vxlan.o

vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o
+173 −261
Original line number Diff line number Diff line
@@ -34,10 +34,10 @@
#include <net/ip6_checksum.h>
#endif

#include "vxlan_private.h"

#define VXLAN_VERSION	"0.1"

#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

@@ -53,41 +53,15 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static unsigned int vxlan_net_id;
static struct rtnl_link_ops vxlan_link_ops;
unsigned int vxlan_net_id;

static const u8 all_zeros_mac[ETH_ALEN + 2];
const u8 all_zeros_mac[ETH_ALEN + 2];
static struct rtnl_link_ops vxlan_link_ops;

static int vxlan_sock_add(struct vxlan_dev *vxlan);

static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
	spinlock_t	  sock_lock;
	struct notifier_block nexthop_notifier_block;
};

/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
	struct list_head  remotes;
	u8		  eth_addr[ETH_ALEN];
	u16		  state;	/* see ndm_state */
	__be32		  vni;
	u16		  flags;	/* see ndm_flags and below */
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
	struct vxlan_dev  __rcu *vdev;
};

#define NTF_VXLAN_ADDED_BY_USER 0x100

/* salt for hash table */
static u32 vxlan_salt __read_mostly;

@@ -98,17 +72,6 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
}

#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
@@ -135,12 +98,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
@@ -161,37 +118,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
}
#endif

/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}

/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}

/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	if (rcu_access_pointer(fdb->nh))
		return NULL;
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	if (rcu_access_pointer(fdb->nh))
		return NULL;
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}

/* Find VXLAN socket based on network namespace, address family, UDP port,
 * enabled unshareable flags and socket device binding (see l3mdev with
 * non-default VRF).
@@ -213,18 +139,29 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
	return NULL;
}

static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs,
					   int ifindex, __be32 vni,
					   struct vxlan_vni_node **vninode)
{
	struct vxlan_vni_node *vnode;
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
	if (vs->flags & VXLAN_F_COLLECT_METADATA &&
	    !(vs->flags & VXLAN_F_VNIFILTER))
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (node->vxlan->default_dst.remote_vni != vni)
		if (!node->vxlan)
			continue;
		vnode = NULL;
		if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
			vnode = vxlan_vnifilter_lookup(node->vxlan, vni);
			if (!vnode)
				continue;
		} else if (node->vxlan->default_dst.remote_vni != vni) {
			continue;
		}

		if (IS_ENABLED(CONFIG_IPV6)) {
			const struct vxlan_config *cfg = &node->vxlan->cfg;
@@ -234,6 +171,8 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
				continue;
		}

		if (vninode)
			*vninode = vnode;
		return node->vxlan;
	}

@@ -251,7 +190,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
	if (!vs)
		return NULL;

	return vxlan_vs_find_vni(vs, ifindex, vni);
	return vxlan_vs_find_vni(vs, ifindex, vni, NULL);
}

/* Fill in neighbour message in skbuff. */
@@ -493,7 +432,7 @@ static u32 eth_hash(const unsigned char *addr)
	return hash_64(value, FDB_HASH_BITS);
}

static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	u32 key = get_unaligned((u32 *)(addr + 2));
@@ -501,7 +440,7 @@ static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
	return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}

static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
		return eth_vni_hash(mac, vni);
@@ -920,7 +859,7 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
	return err;
}

static int vxlan_fdb_create(struct vxlan_dev *vxlan,
int vxlan_fdb_create(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __be16 port, __be32 src_vni,
		     __be32 vni, __u32 ifindex, __u16 ndm_flags,
@@ -1150,7 +1089,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
int vxlan_fdb_update(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __u16 flags,
		     __be16 port, __be32 src_vni, __be32 vni,
@@ -1307,7 +1246,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
	return err;
}

static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
		       const unsigned char *addr, union vxlan_addr ip,
		       __be16 port, __be32 src_vni, __be32 vni,
		       u32 ifindex, bool swdev_notify)
@@ -1519,56 +1458,6 @@ static bool vxlan_snoop(struct net_device *dev,
	return false;
}

/* See if multicast group is already in use by other ID */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;

	sock4 = rtnl_dereference(dev->vn4_sock);

	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
		return false;
#if IS_ENABLED(CONFIG_IPV6)
	sock6 = rtnl_dereference(dev->vn6_sock);
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
		return false;
#endif

	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
		if (!netif_running(vxlan->dev) || vxlan == dev)
			continue;

		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
			continue;
#if IS_ENABLED(CONFIG_IPV6)
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
			continue;
#endif

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
	}

	return false;
}

static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
	struct vxlan_net *vn;
@@ -1602,6 +1491,9 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
	synchronize_net();

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vs_del_vnigrp(vxlan);
	else
		vxlan_vs_del_dev(vxlan);

	if (__vxlan_sock_release_prep(sock4)) {
@@ -1617,76 +1509,6 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
#endif
}

/* Update multicast group membership when first VNI on
 * multicast address is brought up
 */
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_join_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_leave_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}

static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
@@ -1828,6 +1650,7 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_vni_node *vninode = NULL;
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr unparsed;
@@ -1860,7 +1683,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)

	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
	if (!vxlan)
		goto drop;

@@ -1930,6 +1753,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_ERRORS, 0);
		goto drop;
	}

@@ -1938,10 +1763,13 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		atomic_long_inc(&vxlan->dev->rx_dropped);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_DROPS, 0);
		goto drop;
	}

	dev_sw_netstats_rx_add(vxlan->dev, skb->len);
	vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
	gro_cells_receive(&vxlan->gro_cells, skb);

	rcu_read_unlock();
@@ -1975,7 +1803,7 @@ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL);
	if (!vxlan)
		return -ENOENT;

@@ -2049,8 +1877,12 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx_ni(reply) == NET_RX_DROP)
		if (netif_rx_ni(reply) == NET_RX_DROP) {
			dev->stats.rx_dropped++;
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_RX_DROPS, 0);
		}

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
@@ -2204,9 +2036,11 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
		if (netif_rx_ni(reply) == NET_RX_DROP) {
			dev->stats.rx_dropped++;

			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_RX_DROPS, 0);
		}
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
@@ -2540,15 +2374,20 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);
	vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);

	if (__netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
		vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
				      len);
	} else {
drop:
		dev->stats.rx_dropped++;
		vxlan_vnifilter_count(dst_vxlan, vni, NULL,
				      VXLAN_VNI_STATS_RX_DROPS, 0);
	}
	rcu_read_unlock();
}
@@ -2578,6 +2417,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
					   vxlan->cfg.flags);
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_TX_ERRORS, 0);
			kfree_skb(skb);

			return -ENOENT;
@@ -2601,15 +2442,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
	union vxlan_addr remote_ip, local_ip;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	unsigned int pkt_len = skb->len;
	__be16 src_port = 0, dst_port;
	struct dst_entry *ndst = NULL;
	__be32 vni, label;
	__u8 tos, ttl;
	int ifindex;
	int err;
	u32 flags = vxlan->cfg.flags;
	bool udp_sum = false;
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
	__be32 vni = 0;
#if IS_ENABLED(CONFIG_IPV6)
	__be32 label;
#endif

	info = skb_tunnel_info(skb);

@@ -2647,7 +2492,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
#if IS_ENABLED(CONFIG_IPV6)
		label = vxlan->cfg.label;
#endif
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
@@ -2674,7 +2521,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
		}
		ttl = info->key.ttl;
		tos = info->key.tos;
#if IS_ENABLED(CONFIG_IPV6)
		label = info->key.label;
#endif
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
	}
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
@@ -2821,12 +2670,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
				     label, src_port, dst_port, !udp_sum);
#endif
	}
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
out_unlock:
	rcu_read_unlock();
	return;

drop:
	dev->stats.tx_dropped++;
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
	dev_kfree_skb(skb);
	return;

@@ -2838,6 +2689,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
		dev->stats.tx_carrier_errors++;
	dst_release(ndst);
	dev->stats.tx_errors++;
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
	kfree_skb(skb);
}

@@ -2870,6 +2722,8 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,

drop:
	dev->stats.tx_dropped++;
	vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
			      VXLAN_VNI_STATS_TX_DROPS, 0);
	dev_kfree_skb(skb);
}

@@ -2944,6 +2798,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
			vxlan_vnifilter_count(vxlan, vni, NULL,
					      VXLAN_VNI_STATS_TX_DROPS, 0);
			kfree_skb(skb);
			return NETDEV_TX_OK;
		}
@@ -3044,6 +2900,9 @@ static int vxlan_init(struct net_device *dev)
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vnigroup_init(vxlan);

	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;
@@ -3073,6 +2932,9 @@ static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		vxlan_vnigroup_uninit(vxlan);

	gro_cells_destroy(&vxlan->gro_cells);

	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
@@ -3090,15 +2952,11 @@ static int vxlan_open(struct net_device *dev)
	if (ret < 0)
		return ret;

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		ret = vxlan_igmp_join(vxlan);
		if (ret == -EADDRINUSE)
			ret = 0;
	ret = vxlan_multicast_join(vxlan);
	if (ret) {
		vxlan_sock_release(vxlan);
		return ret;
	}
	}

	if (vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
@@ -3134,12 +2992,9 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	int ret = 0;

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
	    !vxlan_group_used(vn, vxlan))
		ret = vxlan_igmp_leave(vxlan);
	vxlan_multicast_leave(vxlan);

	del_timer_sync(&vxlan->age_timer);

@@ -3369,6 +3224,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
};

static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3554,6 +3410,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;
@@ -3589,7 +3446,12 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER))
		vxlan_vs_add_vnigrp(vxlan, vs, ipv6);
	else
		vxlan_vs_add_dev(vs, vxlan, node);

	return 0;
}

@@ -3616,13 +3478,42 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan)
	return ret;
}

int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
		     struct vxlan_config *conf, __be32 vni)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *tmp;

	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == vxlan)
			continue;
		if (tmp->cfg.flags & VXLAN_F_VNIFILTER) {
			if (!vxlan_vnifilter_lookup(tmp, vni))
				continue;
		} else if (tmp->cfg.vni != vni) {
			continue;
		}
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

		return -EEXIST;
	}

	return 0;
}

static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *tmp;
	bool use_ipv6 = false;

	if (conf->flags & VXLAN_F_GPE) {
@@ -3755,22 +3646,7 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;

	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;

		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

	if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) {
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
		return -EEXIST;
@@ -4226,6 +4102,21 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

	if (data[IFLA_VXLAN_VNIFILTER]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER,
				    VXLAN_F_VNIFILTER, changelink, false,
				    extack);
		if (err)
			return err;

		if ((conf->flags & VXLAN_F_VNIFILTER) &&
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER],
					    "vxlan vnifilter only valid in collect metadata mode");
			return -EINVAL;
		}
	}

	return 0;
}

@@ -4301,6 +4192,19 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
					   dst->remote_ifindex,
					   true);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);

		/* If vni filtering device, also update fdb entries of
		 * all vnis that were using default remote ip
		 */
		if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
			err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip,
							 &conf.remote_ip, extack);
			if (err) {
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
				return err;
			}
		}
	}

	if (conf.age_interval != vxlan->cfg.age_interval)
@@ -4446,6 +4350,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER &&
	    nla_put_u8(skb, IFLA_VXLAN_VNIFILTER,
		       !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
@@ -4805,6 +4714,8 @@ static int __init vxlan_init_module(void)
	if (rc)
		goto out4;

	vxlan_vnifilter_init();

	return 0;
out4:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
@@ -4819,6 +4730,7 @@ late_initcall(vxlan_init_module);

static void __exit vxlan_cleanup_module(void)
{
	vxlan_vnifilter_uninit();
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);
+272 −0

File added.

Preview size limit exceeded, changes collapsed.

+162 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *	Vxlan private header file
 *
 */

#ifndef _VXLAN_PRIVATE_H
#define _VXLAN_PRIVATE_H

#include <linux/rhashtable.h>

extern unsigned int vxlan_net_id;
extern const u8 all_zeros_mac[ETH_ALEN + 2];
extern const struct rhashtable_params vxlan_vni_rht_params;

#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1 << PORT_HASH_BITS)

/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
	spinlock_t	  sock_lock;
	struct notifier_block nexthop_notifier_block;
};

/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
	struct list_head  remotes;
	u8		  eth_addr[ETH_ALEN];
	u16		  state;	/* see ndm_state */
	__be32		  vni;
	u16		  flags;	/* see ndm_flags and below */
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
	struct vxlan_dev  __rcu *vdev;
};

#define NTF_VXLAN_ADDED_BY_USER 0x100

/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}

/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}

/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	if (rcu_access_pointer(fdb->nh))
		return NULL;
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	if (rcu_access_pointer(fdb->nh))
		return NULL;
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}

#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

#endif

static inline struct vxlan_vni_node *
vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_vni_group *vg;

	vg = rcu_dereference_rtnl(vxlan->vnigrp);
	if (!vg)
		return NULL;

	return rhashtable_lookup_fast(&vg->vni_hash, &vni,
				      vxlan_vni_rht_params);
}

/* vxlan_core.c */
int vxlan_fdb_create(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __be16 port, __be32 src_vni,
		     __be32 vni, __u32 ifindex, __u16 ndm_flags,
		     u32 nhid, struct vxlan_fdb **fdb,
		     struct netlink_ext_ack *extack);
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
		       const unsigned char *addr, union vxlan_addr ip,
		       __be16 port, __be32 src_vni, __be32 vni,
		       u32 ifindex, bool swdev_notify);
u32 eth_vni_hash(const unsigned char *addr, __be32 vni);
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni);
int vxlan_fdb_update(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __u16 flags,
		     __be16 port, __be32 src_vni, __be32 vni,
		     __u32 ifindex, __u16 ndm_flags, u32 nhid,
		     bool swdev_notify, struct netlink_ext_ack *extack);
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
		     struct vxlan_config *conf, __be32 vni);

/* vxlan_vnifilter.c */
int vxlan_vnigroup_init(struct vxlan_dev *vxlan);
void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan);

void vxlan_vnifilter_init(void);
void vxlan_vnifilter_uninit(void);
void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni,
			   struct vxlan_vni_node *vninode,
			   int type, unsigned int len);

void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan,
			 struct vxlan_sock *vs,
			 bool ipv6);
void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan);
int vxlan_vnilist_update_group(struct vxlan_dev *vxlan,
			       union vxlan_addr *old_remote_ip,
			       union vxlan_addr *new_remote_ip,
			       struct netlink_ext_ack *extack);


/* vxlan_multicast.c */
int vxlan_multicast_join(struct vxlan_dev *vxlan);
int vxlan_multicast_leave(struct vxlan_dev *vxlan);
bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev,
		      __be32 vni, union vxlan_addr *rip, int rifindex);
int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
		    int rifindex);
int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
		     int rifindex);
#endif
Loading