Commit 2ac24d6d authored by David S. Miller
Browse files

Merge branch 'Support-PMTU-discovery-with-bridged-UDP-tunnels'



Stefano Brivio says:

====================
Support PMTU discovery with bridged UDP tunnels

Currently, PMTU discovery for UDP tunnels only works if packets are
routed to the encapsulating interfaces, not bridged.

This results from the fact that we generally don't have valid routes
to the senders we can use to relay ICMP and ICMPv6 errors, and makes
PMTU discovery completely non-functional for VXLAN and GENEVE ports of
both regular bridges and Open vSwitch instances.

If the sender is local, and packets are forwarded to the port by a
regular bridge, all it takes is to generate a corresponding route
exception on the encapsulating device. The bridge then finds the route
exception carrying the PMTU value estimate as it forwards frames, and
relays ICMP messages back to the socket of the local sender. Patch 1/6
fixes this case.

If the sender resides on another node, we actually need to reply to
IP and IPv6 packets ourselves and send these ICMP or ICMPv6 errors
back, using the same encapsulating device. Patch 2/6, based on an
original idea by Florian Westphal, adds the needed functionality,
while patches 3/6 and 4/6 add matching support for VXLAN and GENEVE.

Finally, 5/6 and 6/6 introduce selftests for all combinations of
inner and outer IP versions, covering both VXLAN and GENEVE, with
both regular bridges and Open vSwitch instances.

v2: Add helper to check for any bridge port, skip oif check for PMTU
    routes for bridge ports only, split IPv4 and IPv6 helpers and
    functions (all suggested by David Ahern)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents cabf06e5 7b53682c
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -308,7 +308,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
		return PTR_ERR(rt);

	skb_tunnel_check_pmtu(skb, &rt->dst,
			      BAREUDP_IPV4_HLEN + info->options_len);
			      BAREUDP_IPV4_HLEN + info->options_len, false);

	sport = udp_flow_src_port(bareudp->net, skb,
				  bareudp->sport_min, USHRT_MAX,
@@ -369,7 +369,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len);
	skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len,
			      false);

	sport = udp_flow_src_port(bareudp->net, skb,
				  bareudp->sport_min, USHRT_MAX,
+51 −4
Original line number Diff line number Diff line
@@ -893,8 +893,31 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	skb_tunnel_check_pmtu(skb, &rt->dst,
			      GENEVE_IPV4_HLEN + info->options_len);
	err = skb_tunnel_check_pmtu(skb, &rt->dst,
				    GENEVE_IPV4_HLEN + info->options_len,
				    netif_is_any_bridge_port(dev));
	if (err < 0) {
		dst_release(&rt->dst);
		return err;
	} else if (err) {
		struct ip_tunnel_info *info;

		info = skb_tunnel_info(skb);
		if (info) {
			info->key.u.ipv4.dst = fl4.saddr;
			info->key.u.ipv4.src = fl4.daddr;
		}

		if (!pskb_may_pull(skb, ETH_HLEN)) {
			dst_release(&rt->dst);
			return -EINVAL;
		}

		skb->protocol = eth_type_trans(skb, geneve->dev);
		netif_rx(skb);
		dst_release(&rt->dst);
		return -EMSGSIZE;
	}

	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
	if (geneve->cfg.collect_md) {
@@ -955,7 +978,30 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
	err = skb_tunnel_check_pmtu(skb, dst,
				    GENEVE_IPV6_HLEN + info->options_len,
				    netif_is_any_bridge_port(dev));
	if (err < 0) {
		dst_release(dst);
		return err;
	} else if (err) {
		struct ip_tunnel_info *info = skb_tunnel_info(skb);

		if (info) {
			info->key.u.ipv6.dst = fl6.saddr;
			info->key.u.ipv6.src = fl6.daddr;
		}

		if (!pskb_may_pull(skb, ETH_HLEN)) {
			dst_release(dst);
			return -EINVAL;
		}

		skb->protocol = eth_type_trans(skb, geneve->dev);
		netif_rx(skb);
		dst_release(dst);
		return -EMSGSIZE;
	}

	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
	if (geneve->cfg.collect_md) {
@@ -1012,6 +1058,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
	if (likely(!err))
		return NETDEV_TX_OK;

	if (err != -EMSGSIZE)
		dev_kfree_skb(skb);

	if (err == -ELOOP)
+41 −6
Original line number Diff line number Diff line
@@ -2500,7 +2500,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,

/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
			       struct vxlan_dev *dst_vxlan, __be32 vni,
			       bool snoop)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
@@ -2532,7 +2533,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
		goto drop;
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
	if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&tx_stats->syncp);
@@ -2581,7 +2582,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,

			return -ENOENT;
		}
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
		return 1;
	}

@@ -2617,7 +2618,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
				vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
				vxlan_encap_bypass(skb, vxlan, vxlan,
						   default_vni, true);
				return;
			}
			goto drop;
@@ -2720,7 +2722,23 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
		}

		ndst = &rt->dst;
		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct in_addr src, dst;

				src = remote_ip.sin.sin_addr;
				dst = local_ip.sin.sin_addr;
				info->key.u.ipv4.src = src.s_addr;
				info->key.u.ipv4.dst = dst.s_addr;
			}
			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}

		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
@@ -2760,7 +2778,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
				goto out_unlock;
		}

		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct in6_addr src, dst;

				src = remote_ip.sin6.sin6_addr;
				dst = local_ip.sin6.sin6_addr;
				info->key.u.ipv6.src = src;
				info->key.u.ipv6.dst = dst;
			}

			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}

		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
+5 −0
Original line number Diff line number Diff line
@@ -4840,6 +4840,11 @@ static inline bool netif_is_ovs_port(const struct net_device *dev)
	return dev->priv_flags & IFF_OVS_DATAPATH;
}

/**
 * netif_is_any_bridge_port - test whether a device is a port of any bridge
 * @dev: network device to test
 *
 * Combines the two per-flavour checks: the device qualifies when either
 * netif_is_bridge_port() or netif_is_ovs_port() accepts it.
 *
 * Return: true if @dev passes one of the two checks, false otherwise.
 */
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
	if (netif_is_bridge_port(dev))
		return true;

	return netif_is_ovs_port(dev);
}

static inline bool netif_is_team_master(const struct net_device *dev)
{
	return dev->priv_flags & IFF_TEAM;
+0 −10
Original line number Diff line number Diff line
@@ -535,14 +535,4 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

/**
 * skb_tunnel_check_pmtu - update the PMTU of a packet about to be encapsulated
 * @skb: packet to check
 * @encap_dst: dst entry whose MTU (via dst_mtu()) bounds the encapsulated size
 * @headroom: number of bytes subtracted from that MTU before comparing
 *
 * When @skb->len is larger than the MTU of @encap_dst minus @headroom, the
 * reduced value is passed to skb_dst_update_pmtu_no_confirm() for @skb.
 * Otherwise nothing is done.
 *
 * NOTE(review): the subtraction is carried out in u32, so a @headroom larger
 * than the MTU would wrap around - confirm callers never pass such a value.
 */
static inline void skb_tunnel_check_pmtu(struct sk_buff *skb,
					 struct dst_entry *encap_dst,
					 int headroom)
{
	u32 inner_mtu = dst_mtu(encap_dst) - headroom;

	/* Packet already fits once the headroom is accounted for. */
	if (skb->len <= inner_mtu)
		return;

	skb_dst_update_pmtu_no_confirm(skb, inner_mtu);
}

#endif /* _NET_DST_H */
Loading