Commit 03e2777c authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'ipv4-handle-tos-and-scope-properly-for-icmp-redirects-and-pmtu-updates'

Guillaume Nault says:

====================
ipv4: Handle TOS and scope properly for ICMP redirects and PMTU updates

ICMPv4 PMTU and redirect handlers didn't properly initialise the
struct flowi4 they used for route lookups:

  * ECN bits sometimes weren't cleared from ->flowi4_tos.
  * The RTO_ONLINK flag wasn't taken into account for ->flowi4_scope.

In some special cases, this resulted in ICMP redirects and PMTU updates
not being taken into account because fib_lookup() couldn't retrieve the
correct route.
====================

Link: https://lore.kernel.org/r/cover.1647519748.git.gnault@redhat.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 6bd0c76b ec730c3e
Loading
Loading
Loading
Loading
+14 −4
Original line number Diff line number Diff line
@@ -499,6 +499,15 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);

static void ip_rt_fix_tos(struct flowi4 *fl4)
{
	__u8 tos = RT_FL_TOS(fl4);

	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = tos & RTO_ONLINK ?
			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
}

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
@@ -824,6 +833,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	ip_rt_fix_tos(&fl4);
	__ip_do_redirect(rt, skb, &fl4, true);
}

@@ -1048,6 +1058,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	ip_rt_fix_tos(&fl4);

	/* Don't make lookup fail for bridged encapsulations */
	if (skb && netif_is_any_bridge_port(skb->dev))
@@ -1122,6 +1133,8 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
			goto out;

		new = true;
	} else {
		ip_rt_fix_tos(&fl4);
	}

	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
@@ -2603,7 +2616,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
@@ -2613,9 +2625,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
	ip_rt_fix_tos(fl4);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
+137 −4
Original line number Diff line number Diff line
@@ -26,6 +26,15 @@
# - pmtu_ipv6
#	Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
#
# - pmtu_ipv4_dscp_icmp_exception
#	Set up the same network topology as pmtu_ipv4, but use non-default
#	routing table in A. A fib-rule is used to jump to this routing table
#	based on DSCP. Send ICMPv4 packets with the expected DSCP value and
#	verify that ECN doesn't interfere with the creation of PMTU exceptions.
#
# - pmtu_ipv4_dscp_udp_exception
#	Same as pmtu_ipv4_dscp_icmp_exception, but use UDP instead of ICMP.
#
# - pmtu_ipv4_vxlan4_exception
#	Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
#	over IPv4 between A and B, routed via R1. On the link between R1 and B,
@@ -203,6 +212,8 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
tests="
	pmtu_ipv4_exception		ipv4: PMTU exceptions			1
	pmtu_ipv6_exception		ipv6: PMTU exceptions			1
	pmtu_ipv4_dscp_icmp_exception	ICMPv4 with DSCP and ECN: PMTU exceptions	1
	pmtu_ipv4_dscp_udp_exception	UDPv4 with DSCP and ECN: PMTU exceptions	1
	pmtu_ipv4_vxlan4_exception	IPv4 over vxlan4: PMTU exceptions	1
	pmtu_ipv6_vxlan4_exception	IPv6 over vxlan4: PMTU exceptions	1
	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions	1
@@ -323,6 +334,9 @@ routes_nh="
	B	6	default			61
"

policy_mark=0x04
rt_table=main

veth4_a_addr="192.168.1.1"
veth4_b_addr="192.168.1.2"
veth4_c_addr="192.168.2.10"
@@ -346,6 +360,7 @@ dummy6_mask="64"
err_buf=
tcpdump_pids=
nettest_pids=
socat_pids=

err() {
	err_buf="${err_buf}${1}
@@ -723,7 +738,7 @@ setup_routing_old() {

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} route add ${addr} via ${gw}
		ip -n "${ns_name}" route add "${addr}" table "${rt_table}" via "${gw}"

		ns=""; addr=""; gw=""
	done
@@ -753,7 +768,7 @@ setup_routing_new() {

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid}
		ip -n "${ns_name}" -"${fam}" route add "${addr}" table "${rt_table}" nhid "${nhid}"

		ns=""; fam=""; addr=""; nhid=""
	done
@@ -798,6 +813,24 @@ setup_routing() {
	return 0
}

setup_policy_routing() {
	setup_routing

	ip -netns "${NS_A}" -4 rule add dsfield "${policy_mark}" \
		table "${rt_table}"

	# Set the IPv4 Don't Fragment bit with tc, since socat doesn't seem to
	# have an option do to it.
	tc -netns "${NS_A}" qdisc replace dev veth_A-R1 root prio
	tc -netns "${NS_A}" qdisc replace dev veth_A-R2 root prio
	tc -netns "${NS_A}" filter add dev veth_A-R1                      \
		protocol ipv4 flower ip_proto udp                         \
		action pedit ex munge ip df set 0x40 pipe csum ip and udp
	tc -netns "${NS_A}" filter add dev veth_A-R2                      \
		protocol ipv4 flower ip_proto udp                         \
		action pedit ex munge ip df set 0x40 pipe csum ip and udp
}

setup_bridge() {
	run_cmd ${ns_a} ip link add br0 type bridge || return $ksft_skip
	run_cmd ${ns_a} ip link set br0 up
@@ -903,6 +936,11 @@ cleanup() {
	done
	nettest_pids=

	for pid in ${socat_pids}; do
		kill "${pid}"
	done
	socat_pids=

	for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
		ip netns del ${n} 2> /dev/null
	done
@@ -950,15 +988,21 @@ link_get_mtu() {
route_get_dst_exception() {
	ns_cmd="${1}"
	dst="${2}"
	dsfield="${3}"

	${ns_cmd} ip route get "${dst}"
	if [ -z "${dsfield}" ]; then
		dsfield=0
	fi

	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
}

route_get_dst_pmtu_from_exception() {
	ns_cmd="${1}"
	dst="${2}"
	dsfield="${3}"

	mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
}

check_pmtu_value() {
@@ -1068,6 +1112,95 @@ test_pmtu_ipv6_exception() {
	test_pmtu_ipvX 6
}

test_pmtu_ipv4_dscp_icmp_exception() {
	rt_table=100

	setup namespaces policy_routing || return $ksft_skip
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	# Set up initial MTU values
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1400
	mtu "${ns_b}"  veth_B-R1 1400

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1

	dst1="${prefix4}.${b_r1}.1"
	dst2="${prefix4}.${b_r2}.1"

	# Create route exceptions
	dsfield=${policy_mark} # No ECN bit set (Not-ECT)
	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst1}"

	dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"

	# Check that exceptions have been created with the correct PMTU
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1

	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
}

test_pmtu_ipv4_dscp_udp_exception() {
	rt_table=100

	if ! which socat > /dev/null 2>&1; then
		echo "'socat' command not found; skipping tests"
		return $ksft_skip
	fi

	setup namespaces policy_routing || return $ksft_skip
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	# Set up initial MTU values
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1400
	mtu "${ns_b}"  veth_B-R1 1400

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1

	dst1="${prefix4}.${b_r1}.1"
	dst2="${prefix4}.${b_r2}.1"

	# Create route exceptions
	run_cmd_bg "${ns_b}" socat UDP-LISTEN:50000 OPEN:/dev/null,wronly=1
	socat_pids="${socat_pids} $!"

	dsfield=${policy_mark} # No ECN bit set (Not-ECT)
	run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
		UDP:"${dst1}":50000,tos="${dsfield}"

	dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
	run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
		UDP:"${dst2}":50000,tos="${dsfield}"

	# Check that exceptions have been created with the correct PMTU
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
}

test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
	type=${1}
	family=${2}