Commit 1cb9d3b6 authored by Haiyang Zhang, committed by Jakub Kicinski
Browse files

hv_netvsc: Add support for XDP_REDIRECT



Handle XDP_REDIRECT action in netvsc driver.
Also, transparently pass ndo_xdp_xmit to VF when available.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://lore.kernel.org/r/1649362894-20077-1-git-send-email-haiyangz@microsoft.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 2e36437f
Loading
Loading
Loading
Loading
+66 −3
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/hyperv.h>
#include <linux/rndis.h>
#include <linux/jhash.h>

/* RSS related */
#define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203  /* query only */
@@ -237,6 +238,7 @@ int netvsc_recv_callback(struct net_device *net,
void netvsc_channel_cb(void *context);
int netvsc_poll(struct napi_struct *napi, int budget);

void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev);
u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
		   struct xdp_buff *xdp);
unsigned int netvsc_xdp_fraglen(unsigned int len);
@@ -246,6 +248,8 @@ int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog,
		   struct netvsc_device *nvdev);
int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog);
int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf);
int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
		       struct xdp_frame **frames, u32 flags);

int rndis_set_subchannel(struct net_device *ndev,
			 struct netvsc_device *nvdev,
@@ -942,12 +946,21 @@ struct nvsc_rsc {
#define NVSC_RSC_CSUM_INFO	BIT(1)	/* valid/present bit for 'csum_info' */
#define NVSC_RSC_HASH_INFO	BIT(2)	/* valid/present bit for 'hash_info' */

struct netvsc_stats {
/* Transmit-side counters kept per channel (see the tx_stats member of
 * struct netvsc_channel).
 */
struct netvsc_stats_tx {
	u64 packets;
	u64 bytes;
	/* Frames accepted by netvsc_ndoxdp_xmit() on the synthetic path */
	u64 xdp_xmit;
	/* Writers bracket updates with u64_stats_update_begin()/_end();
	 * readers use the u64_stats_fetch_begin_irq()/_retry_irq() loop.
	 */
	struct u64_stats_sync syncp;
};

/* Receive-side counters kept per channel (see the rx_stats member of
 * struct netvsc_channel).
 */
struct netvsc_stats_rx {
	u64 packets;
	u64 bytes;
	/* broadcast and multicast are reported summed together as
	 * "multicast" by netvsc_get_stats64().
	 */
	u64 broadcast;
	u64 multicast;
	/* Bumped when the XDP verdict is neither PASS nor TX, and when
	 * xdp_do_redirect() fails.
	 */
	u64 xdp_drop;
	/* Bumped when xdp_do_redirect() succeeds */
	u64 xdp_redirect;
	/* Bumped in netvsc_recv_callback() when the XDP verdict is XDP_TX */
	u64 xdp_tx;
	/* Writers bracket updates with u64_stats_update_begin()/_end();
	 * readers use the u64_stats_fetch_begin_irq()/_retry_irq() loop.
	 */
	struct u64_stats_sync syncp;
};

@@ -1046,6 +1059,55 @@ struct net_device_context {
	struct netvsc_device_info *saved_netvsc_dev_info;
};

/* Pick the hash value for @skb.
 *
 * Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 *
 * When the packet's TCP/UDP-over-IPv4/IPv6 type is enabled in ndc->l4_hash
 * the result of skb_get_hash() is returned. Otherwise only the address part
 * of the dissected flow is hashed with jhash2() and the result is stored on
 * the skb through __skb_set_sw_hash().
 *
 * Returns 0 when the flow cannot be dissected, or when the fallback path
 * meets a packet that is neither IPv4 nor IPv6.
 */
static inline u32 netvsc_get_hash(struct sk_buff *skb,
				  const struct net_device_context *ndc)
{
	static u32 hashrnd __read_mostly;
	struct flow_keys flow;
	bool is_ipv4, is_ipv6;
	u32 l4_type = 0;
	u32 hash;

	/* Seed for jhash2(), filled in on first use */
	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	is_ipv4 = flow.basic.n_proto == htons(ETH_P_IP);
	is_ipv6 = flow.basic.n_proto == htons(ETH_P_IPV6);

	/* Map (L4 protocol, L3 protocol) to the matching HV_*_L4HASH bit;
	 * anything else leaves l4_type at 0 so the mask test below fails.
	 */
	if (flow.basic.ip_proto == IPPROTO_TCP) {
		if (is_ipv4)
			l4_type = HV_TCP4_L4HASH;
		else if (is_ipv6)
			l4_type = HV_TCP6_L4HASH;
	} else if (flow.basic.ip_proto == IPPROTO_UDP) {
		if (is_ipv4)
			l4_type = HV_UDP4_L4HASH;
		else if (is_ipv6)
			l4_type = HV_UDP6_L4HASH;
	}

	if (l4_type & ndc->l4_hash)
		return skb_get_hash(skb);

	/* Fallback: hash the address words only (2 u32 for v4, 8 for v6) */
	if (is_ipv4)
		hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
	else if (is_ipv6)
		hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
	else
		return 0;

	__skb_set_sw_hash(skb, hash, false);

	return hash;
}

/* Per channel data */
struct netvsc_channel {
	struct vmbus_channel *channel;
@@ -1060,9 +1122,10 @@ struct netvsc_channel {

	struct bpf_prog __rcu *bpf_prog;
	struct xdp_rxq_info xdp_rxq;
	bool xdp_flush;

	struct netvsc_stats tx_stats;
	struct netvsc_stats rx_stats;
	struct netvsc_stats_tx tx_stats;
	struct netvsc_stats_rx rx_stats;
};

/* Per netvsc device */
+7 −1
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
#include <linux/filter.h>

#include <asm/sync_bitops.h>
#include <asm/mshyperv.h>
@@ -805,7 +806,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
		struct hv_netvsc_packet *packet
			= (struct hv_netvsc_packet *)skb->cb;
		u32 send_index = packet->send_buf_index;
		struct netvsc_stats *tx_stats;
		struct netvsc_stats_tx *tx_stats;

		if (send_index != NETVSC_INVALID_INDEX)
			netvsc_free_send_slot(net_device, send_index);
@@ -1670,12 +1671,17 @@ int netvsc_poll(struct napi_struct *napi, int budget)
	if (!nvchan->desc)
		nvchan->desc = hv_pkt_iter_first(channel);

	nvchan->xdp_flush = false;

	while (nvchan->desc && work_done < budget) {
		work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
						    ndev, nvchan->desc, budget);
		nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
	}

	if (nvchan->xdp_flush)
		xdp_do_flush();

	/* Send any pending receive completions */
	ret = send_recv_completions(ndev, net_device, nvchan);

+94 −1
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/netpoll.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/kernel.h>
@@ -23,11 +24,13 @@
u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
		   struct xdp_buff *xdp)
{
	struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
	void *data = nvchan->rsc.data[0];
	u32 len = nvchan->rsc.len[0];
	struct page *page = NULL;
	struct bpf_prog *prog;
	u32 act = XDP_PASS;
	bool drop = true;

	xdp->data_hard_start = NULL;

@@ -60,9 +63,34 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
	switch (act) {
	case XDP_PASS:
	case XDP_TX:
		drop = false;
		break;

	case XDP_DROP:
		break;

	case XDP_REDIRECT:
		if (!xdp_do_redirect(ndev, xdp, prog)) {
			nvchan->xdp_flush = true;
			drop = false;

			u64_stats_update_begin(&rx_stats->syncp);

			rx_stats->xdp_redirect++;
			rx_stats->packets++;
			rx_stats->bytes += nvchan->rsc.pktlen;

			u64_stats_update_end(&rx_stats->syncp);

			break;
		} else {
			u64_stats_update_begin(&rx_stats->syncp);
			rx_stats->xdp_drop++;
			u64_stats_update_end(&rx_stats->syncp);
		}

		fallthrough;

	case XDP_ABORTED:
		trace_xdp_exception(ndev, prog, act);
		break;
@@ -74,7 +102,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
out:
	rcu_read_unlock();

	if (page && act != XDP_PASS && act != XDP_TX) {
	if (page && drop) {
		__free_page(page);
		xdp->data_hard_start = NULL;
	}
@@ -197,3 +225,68 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf)
		return -EINVAL;
	}
}

/* Send a single XDP frame over the synthetic data path.
 *
 * The frame is converted to an skb, given a hash and the queue index
 * @q_idx, and then handed to netvsc_xdp_xmit().
 *
 * Returns 0 once the skb has been handed over, or -ENOMEM when no skb
 * could be built from @frame.
 */
static int netvsc_ndoxdp_xmit_fm(struct net_device *ndev,
				 struct xdp_frame *frame, u16 q_idx)
{
	struct sk_buff *skb = xdp_build_skb_from_frame(frame, ndev);

	if (unlikely(!skb))
		return -ENOMEM;

	/* The returned hash value is not used here */
	netvsc_get_hash(skb, netdev_priv(ndev));

	/* netvsc_xdp_xmit() must only be called after the queue is recorded */
	skb_record_rx_queue(skb, q_idx);
	netvsc_xdp_xmit(skb, ndev);

	return 0;
}

/* ndo_xdp_xmit handler: transmit up to @n XDP frames.
 *
 * When a usable VF is the active data path, the whole batch is passed
 * through to the VF's own ndo_xdp_xmit and its return value is returned
 * unchanged. Otherwise the frames are sent one at a time on the synthetic
 * path, stopping at the first frame that fails.
 *
 * Returns the number of frames handed off on the synthetic path, or 0 when
 * the netvsc device is missing or being destroyed.
 */
int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
		       struct xdp_frame **frames, u32 flags)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_stats_tx *tx_stats;
	struct netvsc_device *nvsc_dev;
	struct net_device *vf_netdev;
	u16 q_idx;
	int sent;

	/* Don't transmit if netvsc_device is gone */
	nvsc_dev = rcu_dereference_bh(ndev_ctx->nvdev);
	if (unlikely(!nvsc_dev || nvsc_dev->destroy))
		return 0;

	/* If VF is present and up then redirect packets to it.
	 * Skip the VF if it is marked down or has no carrier.
	 * If netpoll is in use, then VF can not be used either.
	 */
	vf_netdev = rcu_dereference_bh(ndev_ctx->vf_netdev);
	if (vf_netdev && netif_running(vf_netdev) &&
	    netif_carrier_ok(vf_netdev) && !netpoll_tx_running(ndev) &&
	    vf_netdev->netdev_ops->ndo_xdp_xmit &&
	    ndev_ctx->data_path_is_vf)
		return vf_netdev->netdev_ops->ndo_xdp_xmit(vf_netdev, n,
							   frames, flags);

	/* The whole batch uses one queue, chosen from the current CPU */
	q_idx = smp_processor_id() % ndev->real_num_tx_queues;

	/* On exit 'sent' is the number of frames that were handed off */
	for (sent = 0; sent < n; sent++) {
		if (netvsc_ndoxdp_xmit_fm(ndev, frames[sent], q_idx))
			break;
	}

	tx_stats = &nvsc_dev->chan_table[q_idx].tx_stats;

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->xdp_xmit += sent;
	u64_stats_update_end(&tx_stats->syncp);

	return sent;
}
+61 −89
Original line number Diff line number Diff line
@@ -242,56 +242,6 @@ static inline void *init_ppi_data(struct rndis_message *msg,
	return ppi + 1;
}

/* Pick the hash value for @skb.
 *
 * Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 *
 * When the packet's TCP/UDP-over-IPv4/IPv6 type is enabled in ndc->l4_hash
 * the result of skb_get_hash() is returned. Otherwise only the address part
 * of the dissected flow is hashed with jhash2() and the result is stored on
 * the skb through __skb_set_sw_hash().
 *
 * Returns 0 when the flow cannot be dissected, or when the fallback path
 * meets a packet that is neither IPv4 nor IPv6.
 */
static inline u32 netvsc_get_hash(
	struct sk_buff *skb,
	const struct net_device_context *ndc)
{
	static u32 hashrnd __read_mostly;
	struct flow_keys flow;
	bool is_ipv4, is_ipv6;
	u32 l4_type = 0;
	u32 hash;

	/* Seed for jhash2(), filled in on first use */
	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	is_ipv4 = flow.basic.n_proto == htons(ETH_P_IP);
	is_ipv6 = flow.basic.n_proto == htons(ETH_P_IPV6);

	/* Map (L4 protocol, L3 protocol) to the matching HV_*_L4HASH bit;
	 * anything else leaves l4_type at 0 so the mask test below fails.
	 */
	if (flow.basic.ip_proto == IPPROTO_TCP) {
		if (is_ipv4)
			l4_type = HV_TCP4_L4HASH;
		else if (is_ipv6)
			l4_type = HV_TCP6_L4HASH;
	} else if (flow.basic.ip_proto == IPPROTO_UDP) {
		if (is_ipv4)
			l4_type = HV_UDP4_L4HASH;
		else if (is_ipv6)
			l4_type = HV_UDP6_L4HASH;
	}

	if (l4_type & ndc->l4_hash)
		return skb_get_hash(skb);

	/* Fallback: hash the address words only (2 u32 for v4, 8 for v6) */
	if (is_ipv4)
		hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
	else if (is_ipv6)
		hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
	else
		return 0;

	__skb_set_sw_hash(skb, hash, false);

	return hash;
}

static inline int netvsc_get_tx_queue(struct net_device *ndev,
				      struct sk_buff *skb, int old_idx)
{
@@ -804,7 +754,7 @@ void netvsc_linkstatus_callback(struct net_device *net,
}

/* This function should only be called after skb_record_rx_queue() */
static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	int rc;

@@ -925,7 +875,7 @@ int netvsc_recv_callback(struct net_device *net,
	struct vmbus_channel *channel = nvchan->channel;
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	struct sk_buff *skb;
	struct netvsc_stats *rx_stats = &nvchan->rx_stats;
	struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
	struct xdp_buff xdp;
	u32 act;

@@ -934,6 +884,9 @@ int netvsc_recv_callback(struct net_device *net,

	act = netvsc_run_xdp(net, nvchan, &xdp);

	if (act == XDP_REDIRECT)
		return NVSP_STAT_SUCCESS;

	if (act != XDP_PASS && act != XDP_TX) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->xdp_drop++;
@@ -958,6 +911,9 @@ int netvsc_recv_callback(struct net_device *net,
	 * statistics will not work correctly.
	 */
	u64_stats_update_begin(&rx_stats->syncp);
	if (act == XDP_TX)
		rx_stats->xdp_tx++;

	rx_stats->packets++;
	rx_stats->bytes += nvchan->rsc.pktlen;

@@ -1353,28 +1309,29 @@ static void netvsc_get_pcpu_stats(struct net_device *net,
	/* fetch percpu stats of netvsc */
	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		const struct netvsc_stats_tx *tx_stats;
		const struct netvsc_stats_rx *rx_stats;
		struct netvsc_ethtool_pcpu_stats *this_tot =
			&pcpu_tot[nvchan->channel->target_cpu];
		u64 packets, bytes;
		unsigned int start;

		stats = &nvchan->tx_stats;
		tx_stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
			packets = tx_stats->packets;
			bytes = tx_stats->bytes;
		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));

		this_tot->tx_bytes	+= bytes;
		this_tot->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		rx_stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
			packets = rx_stats->packets;
			bytes = rx_stats->bytes;
		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));

		this_tot->rx_bytes	+= bytes;
		this_tot->rx_packets	+= packets;
@@ -1406,27 +1363,28 @@ static void netvsc_get_stats64(struct net_device *net,

	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		const struct netvsc_stats_tx *tx_stats;
		const struct netvsc_stats_rx *rx_stats;
		u64 packets, bytes, multicast;
		unsigned int start;

		stats = &nvchan->tx_stats;
		tx_stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
			packets = tx_stats->packets;
			bytes = tx_stats->bytes;
		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));

		t->tx_bytes	+= bytes;
		t->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		rx_stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
			multicast = stats->multicast + stats->broadcast;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
			packets = rx_stats->packets;
			bytes = rx_stats->bytes;
			multicast = rx_stats->multicast + rx_stats->broadcast;
		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));

		t->rx_bytes	+= bytes;
		t->rx_packets	+= packets;
@@ -1515,8 +1473,8 @@ static const struct {
/* statistics per queue (rx/tx packets/bytes) */
#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))

/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8)

static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
@@ -1543,12 +1501,16 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	const void *nds = &ndc->eth_stats;
	const struct netvsc_stats *qstats;
	const struct netvsc_stats_tx *tx_stats;
	const struct netvsc_stats_rx *rx_stats;
	struct netvsc_vf_pcpu_stats sum;
	struct netvsc_ethtool_pcpu_stats *pcpu_sum;
	unsigned int start;
	u64 packets, bytes;
	u64 xdp_drop;
	u64 xdp_redirect;
	u64 xdp_tx;
	u64 xdp_xmit;
	int i, j, cpu;

	if (!nvdev)
@@ -1562,26 +1524,32 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
		data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);

	for (j = 0; j < nvdev->num_chn; j++) {
		qstats = &nvdev->chan_table[j].tx_stats;
		tx_stats = &nvdev->chan_table[j].tx_stats;

		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
			packets = tx_stats->packets;
			bytes = tx_stats->bytes;
			xdp_xmit = tx_stats->xdp_xmit;
		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;
		data[i++] = xdp_xmit;

		qstats = &nvdev->chan_table[j].rx_stats;
		rx_stats = &nvdev->chan_table[j].rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
			xdp_drop = qstats->xdp_drop;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
			packets = rx_stats->packets;
			bytes = rx_stats->bytes;
			xdp_drop = rx_stats->xdp_drop;
			xdp_redirect = rx_stats->xdp_redirect;
			xdp_tx = rx_stats->xdp_tx;
		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;
		data[i++] = xdp_drop;
		data[i++] = xdp_redirect;
		data[i++] = xdp_tx;
	}

	pcpu_sum = kvmalloc_array(num_possible_cpus(),
@@ -1622,9 +1590,12 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
		for (i = 0; i < nvdev->num_chn; i++) {
			ethtool_sprintf(&p, "tx_queue_%u_packets", i);
			ethtool_sprintf(&p, "tx_queue_%u_bytes", i);
			ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i);
			ethtool_sprintf(&p, "rx_queue_%u_packets", i);
			ethtool_sprintf(&p, "rx_queue_%u_bytes", i);
			ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i);
			ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i);
			ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i);
		}

		for_each_present_cpu(cpu) {
@@ -2057,6 +2028,7 @@ static const struct net_device_ops device_ops = {
	.ndo_select_queue =		netvsc_select_queue,
	.ndo_get_stats64 =		netvsc_get_stats64,
	.ndo_bpf =			netvsc_bpf,
	.ndo_xdp_xmit =			netvsc_ndoxdp_xmit,
};

/*