Commit 25fd2634 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'GVE-Raw-Addressing'



David Awogbemil says:

====================
GVE Raw Addressing
Patchset description:
This  patchset introduces "raw addressing" mode to the GVE driver.
Previously (in "queue_page_list" or "qpl" mode), the driver would
pre-allocate and dma_map buffers to be used on egress and ingress.
On egress, it would copy data from the skb provided to the
pre-allocated buffers - this was expensive.
In raw addressing mode, the driver can avoid this copy and simply
dma_map the skb's data so that the NIC can use it.
On ingress, the driver passes buffers up to the networking stack and
then frees and reallocates buffers when necessary instead of using
skb_copy_to_linear_data.
Patch 3 separates the page refcount tracking mechanism
into a function gve_rx_can_recycle_buffer which uses get_page - this will
be changed in a future patch to eliminate the use of get_page in tracking
page refcounts.

Changes from v9:
  Patch 4: Use u64, not u32 for new tx stat counters.
====================

Reviewed-by: default avatarAlexander Duyck <alexanderduyck@fb.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 8354bcbe 6f007c64
Loading
Loading
Loading
Loading
+24 −15
Original line number Diff line number Diff line
@@ -38,6 +38,8 @@
#define NIC_TX_STATS_REPORT_NUM	0
#define NIC_RX_STATS_REPORT_NUM	4

#define GVE_DATA_SLOT_ADDR_PAGE_MASK (~(PAGE_SIZE - 1))

/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */
struct gve_rx_desc_queue {
	struct gve_rx_desc *desc_ring; /* the descriptor ring */
@@ -49,7 +51,8 @@ struct gve_rx_desc_queue {
struct gve_rx_slot_page_info {
	struct page *page;
	void *page_address;
	u32 page_offset; /* offset to write to in page */
	u8 page_offset; /* flipped to second half? */
	u8 can_flip;
};

/* A list of pages registered with the device during setup and used by a queue
@@ -64,10 +67,11 @@ struct gve_queue_page_list {

/* Each slot in the data ring has a 1:1 mapping to a slot in the desc ring */
struct gve_rx_data_queue {
	struct gve_rx_data_slot *data_ring; /* read by NIC */
	union gve_rx_data_slot *data_ring; /* read by NIC */
	dma_addr_t data_bus; /* dma mapping of the slots */
	struct gve_rx_slot_page_info *page_info; /* page info of the buffers */
	struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
	u8 raw_addressing; /* use raw_addressing? */
};

struct gve_priv;
@@ -82,6 +86,7 @@ struct gve_rx_ring {
	u32 cnt; /* free-running total number of completed packets */
	u32 fill_cnt; /* free-running total number of descs and buffs posted */
	u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
	u32 db_threshold; /* threshold for posting new buffs and descs */
	u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
	u64 rx_copied_pkt; /* free-running total number of copied packets */
	u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
@@ -107,12 +112,20 @@ struct gve_tx_iovec {
	u32 iov_padding; /* padding associated with this segment */
};

struct gve_tx_dma_buf {
	DEFINE_DMA_UNMAP_ADDR(dma);
	DEFINE_DMA_UNMAP_LEN(len);
};

/* Tracks the memory in the fifo occupied by the skb. Mapped 1:1 to a desc
 * ring entry but only used for a pkt_desc not a seg_desc
 */
struct gve_tx_buffer_state {
	struct sk_buff *skb; /* skb for this pkt */
	union {
		struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */
		struct gve_tx_dma_buf buf;
	};
};

/* A TX buffer - each queue has one */
@@ -135,13 +148,17 @@ struct gve_tx_ring {
	__be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */
	u64 pkt_done; /* free-running - total packets completed */
	u64 bytes_done; /* free-running - total bytes completed */
	u64 dropped_pkt; /* free-running - total packets dropped */
	u64 dma_mapping_error; /* count of dma mapping errors */

	/* Cacheline 2 -- Read-mostly fields */
	union gve_tx_desc *desc ____cacheline_aligned;
	struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */
	struct netdev_queue *netdev_txq;
	struct gve_queue_resources *q_resources; /* head and tail pointer idx */
	struct device *dev;
	u32 mask; /* masks req and done down to queue size */
	u8 raw_addressing; /* use raw_addressing? */

	/* Slow-path fields */
	u32 q_num ____cacheline_aligned; /* queue idx */
@@ -194,11 +211,12 @@ struct gve_priv {
	u16 tx_desc_cnt; /* num desc per ring */
	u16 rx_desc_cnt; /* num desc per ring */
	u16 tx_pages_per_qpl; /* tx buffer length */
	u16 rx_pages_per_qpl; /* rx buffer length */
	u16 rx_data_slot_cnt; /* rx buffer length */
	u64 max_registered_pages;
	u64 num_registered_pages; /* num pages registered with NIC */
	u32 rx_copybreak; /* copy packets smaller than this */
	u16 default_num_queues; /* default num queues to set up */
	u8 raw_addressing; /* 1 if this dev supports raw addressing, 0 otherwise */

	struct gve_queue_config tx_cfg;
	struct gve_queue_config rx_cfg;
@@ -436,14 +454,14 @@ static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx)
 */
static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
{
	return priv->tx_cfg.num_queues;
	return priv->raw_addressing ? 0 : priv->tx_cfg.num_queues;
}

/* Returns the number of rx queue page lists
 */
static inline u32 gve_num_rx_qpls(struct gve_priv *priv)
{
	return priv->rx_cfg.num_queues;
	return priv->raw_addressing ? 0 : priv->rx_cfg.num_queues;
}

/* Returns a pointer to the next available tx qpl in the list of qpls
@@ -497,15 +515,6 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv,
		return DMA_FROM_DEVICE;
}

/* Returns true if the max mtu allows page recycling */
static inline bool gve_can_recycle_pages(struct net_device *dev)
{
	/* We can't recycle the pages if we can't fit a packet into half a
	 * page.
	 */
	return dev->max_mtu <= PAGE_SIZE / 2;
}

/* buffers */
int gve_alloc_page(struct gve_priv *priv, struct device *dev,
		   struct page **page, dma_addr_t *dma,
+82 −7
Original line number Diff line number Diff line
@@ -14,6 +14,57 @@
#define GVE_ADMINQ_SLEEP_LEN		20
#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK	100

#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \
"Expected: length=%d, feature_mask=%x.\n" \
"Actual: length=%d, feature_mask=%x.\n"

static
struct gve_device_option *gve_get_next_option(struct gve_device_descriptor *descriptor,
					      struct gve_device_option *option)
{
	void *option_end, *descriptor_end;

	option_end = (void *)(option + 1) + be16_to_cpu(option->option_length);
	descriptor_end = (void *)descriptor + be16_to_cpu(descriptor->total_length);

	return option_end > descriptor_end ? NULL : (struct gve_device_option *)option_end;
}

static
void gve_parse_device_option(struct gve_priv *priv,
			     struct gve_device_descriptor *device_descriptor,
			     struct gve_device_option *option)
{
	u16 option_length = be16_to_cpu(option->option_length);
	u16 option_id = be16_to_cpu(option->option_id);

	switch (option_id) {
	case GVE_DEV_OPT_ID_RAW_ADDRESSING:
		/* If the length or feature mask doesn't match,
		 * continue without enabling the feature.
		 */
		if (option_length != GVE_DEV_OPT_LEN_RAW_ADDRESSING ||
		    option->feat_mask != cpu_to_be32(GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING)) {
			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, "Raw Addressing",
				 GVE_DEV_OPT_LEN_RAW_ADDRESSING,
				 cpu_to_be32(GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING),
				 option_length, option->feat_mask);
			priv->raw_addressing = 0;
		} else {
			dev_info(&priv->pdev->dev,
				 "Raw addressing device option enabled.\n");
			priv->raw_addressing = 1;
		}
		break;
	default:
		/* If we don't recognize the option just continue
		 * without doing anything.
		 */
		dev_dbg(&priv->pdev->dev, "Unrecognized device option 0x%hx not enabled.\n",
			option_id);
	}
}

int gve_adminq_alloc(struct device *dev, struct gve_priv *priv)
{
	priv->adminq = dma_alloc_coherent(dev, PAGE_SIZE,
@@ -318,8 +369,10 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
{
	struct gve_tx_ring *tx = &priv->tx[queue_index];
	union gve_adminq_command cmd;
	u32 qpl_id;
	int err;

	qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id;
	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE);
	cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) {
@@ -328,7 +381,7 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
		.queue_resources_addr =
			cpu_to_be64(tx->q_resources_bus),
		.tx_ring_addr = cpu_to_be64(tx->bus),
		.queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id),
		.queue_page_list_id = cpu_to_be32(qpl_id),
		.ntfy_id = cpu_to_be32(tx->ntfy_id),
	};

@@ -357,8 +410,10 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
{
	struct gve_rx_ring *rx = &priv->rx[queue_index];
	union gve_adminq_command cmd;
	u32 qpl_id;
	int err;

	qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id;
	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
	cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
@@ -369,7 +424,7 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
		.queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
		.rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
		.rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
		.queue_page_list_id = cpu_to_be32(rx->data.qpl->id),
		.queue_page_list_id = cpu_to_be32(qpl_id),
	};

	err = gve_adminq_issue_cmd(priv, &cmd);
@@ -460,11 +515,14 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
int gve_adminq_describe_device(struct gve_priv *priv)
{
	struct gve_device_descriptor *descriptor;
	struct gve_device_option *dev_opt;
	union gve_adminq_command cmd;
	dma_addr_t descriptor_bus;
	u16 num_options;
	int err = 0;
	u8 *mac;
	u16 mtu;
	int i;

	memset(&cmd, 0, sizeof(cmd));
	descriptor = dma_alloc_coherent(&priv->pdev->dev, PAGE_SIZE,
@@ -511,13 +569,30 @@ int gve_adminq_describe_device(struct gve_priv *priv)
	mac = descriptor->mac;
	dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);
	priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl);
	priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl);
	if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) {
		dev_err(&priv->pdev->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
			priv->rx_pages_per_qpl);
		priv->rx_desc_cnt = priv->rx_pages_per_qpl;
	priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl);
	if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) {
		dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
			priv->rx_data_slot_cnt);
		priv->rx_desc_cnt = priv->rx_data_slot_cnt;
	}
	priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
	dev_opt = (void *)(descriptor + 1);

	num_options = be16_to_cpu(descriptor->num_device_options);
	for (i = 0; i < num_options; i++) {
		struct gve_device_option *next_opt;

		next_opt = gve_get_next_option(descriptor, dev_opt);
		if (!next_opt) {
			dev_err(&priv->dev->dev,
				"options exceed device_descriptor's total length.\n");
			err = -EINVAL;
			goto free_device_descriptor;
		}

		gve_parse_device_option(priv, descriptor, dev_opt);
		dev_opt = next_opt;
	}

free_device_descriptor:
	dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor,
+11 −4
Original line number Diff line number Diff line
@@ -79,12 +79,17 @@ struct gve_device_descriptor {

static_assert(sizeof(struct gve_device_descriptor) == 40);

struct device_option {
	__be32 option_id;
	__be32 option_length;
struct gve_device_option {
	__be16 option_id;
	__be16 option_length;
	__be32 feat_mask;
};

static_assert(sizeof(struct device_option) == 8);
static_assert(sizeof(struct gve_device_option) == 8);

#define GVE_DEV_OPT_ID_RAW_ADDRESSING 0x1
#define GVE_DEV_OPT_LEN_RAW_ADDRESSING 0x0
#define GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING 0x0

struct gve_adminq_configure_device_resources {
	__be64 counter_array;
@@ -111,6 +116,8 @@ struct gve_adminq_unregister_page_list {

static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4);

#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF

struct gve_adminq_create_tx_queue {
	__be32 queue_id;
	__be32 reserved;
+12 −7
Original line number Diff line number Diff line
@@ -16,9 +16,11 @@
 * Base addresses encoded in seg_addr are not assumed to be physical
 * addresses. The ring format assumes these come from some linear address
 * space. This could be physical memory, kernel virtual memory, user virtual
 * memory. gVNIC uses lists of registered pages. Each queue is assumed
 * to be associated with a single such linear address space to ensure a
 * consistent meaning for seg_addrs posted to its rings.
 * memory.
 * If raw dma addressing is not supported then gVNIC uses lists of registered
 * pages. Each queue is assumed to be associated with a single such linear
 * address space to ensure a consistent meaning for seg_addrs posted to its
 * rings.
 */

struct gve_tx_pkt_desc {
@@ -72,12 +74,15 @@ struct gve_rx_desc {
} __packed;
static_assert(sizeof(struct gve_rx_desc) == 64);

/* As with the Tx ring format, the qpl_offset entries below are offsets into an
 * ordered list of registered pages.
/* If the device supports raw dma addressing then the addr in data slot is
 * the dma address of the buffer.
 * If the device only supports registered segments then the addr is a byte
 * offset into the registered segment (an ordered list of pages) where the
 * buffer is.
 */
struct gve_rx_data_slot {
	/* byte offset into the rx registered segment of this slot */
union gve_rx_data_slot {
	__be64 qpl_offset;
	__be64 addr;
};

/* GVE Recive Packet Descriptor Seq No */
+2 −0
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@ static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = {
static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = {
	"tx_posted_desc[%u]", "tx_completed_desc[%u]", "tx_bytes[%u]",
	"tx_wake[%u]", "tx_stop[%u]", "tx_event_counter[%u]",
	"tx_dma_mapping_error[%u]",
};

static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = {
@@ -323,6 +324,7 @@ gve_get_ethtool_stats(struct net_device *netdev,
			data[i++] = tx->stop_queue;
			data[i++] = be32_to_cpu(gve_tx_load_event_counter(priv,
									  tx));
			data[i++] = tx->dma_mapping_error;
			/* stats from NIC */
			if (skip_nic_stats) {
				/* skip NIC tx stats */
Loading