Commit 2117fce8 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'psample-Add-additional-metadata-attributes'

Ido Schimmel says:

====================
psample: Add additional metadata attributes

This series extends the psample module to expose additional metadata to
user space for packets sampled via act_sample. The new metadata (e.g.,
transit delay) can then be consumed by applications such as hsflowd [1]
for better network observability.

netdevsim is extended with a dummy psample implementation that
periodically reports "sampled" packets to the psample module. In
addition to testing of the psample module, it enables the development
and demonstration of user space applications (e.g., hsflowd) that are
interested in the new metadata even without access to specialized
hardware (e.g., Spectrum ASIC) that can provide it.

mlxsw is also extended to provide the new metadata to psample.

A Wireshark dissector for psample netlink packets [2] will be submitted
upstream after the kernel patches are accepted. In addition, a libpcap
capture module for psample is currently in the works. Eventually, users
should be able to run:

 # tshark -i psample

In order to consume sampled packets along with their metadata.

Series overview:

Patch #1 makes it easier to extend the metadata provided to psample

Patch #2 adds the new metadata attributes to psample

Patch #3 extends netdevsim to periodically report "sampled" packets to
psample. Various debugfs knobs are added to control the reporting

Patch #4 adds a selftest over netdevsim

Patches #5-#10 gradually add support for the new metadata in mlxsw

Patch #11 adds a selftest over mlxsw

[1] https://sflow.org/draft4_sflow_transit.txt
[2] https://gitlab.com/amitcohen1/wireshark/-/commit/3d711143024e032aef1b056dd23f0266c54fab56


====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents c6baf7ee bb24d592
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -579,6 +579,7 @@ config NETDEVSIM
	depends on DEBUG_FS
	depends on INET
	depends on IPV6 || IPV6=n
	depends on PSAMPLE || PSAMPLE=n
	select NET_DEVLINK
	help
	  This driver is a developer testing tool and software model that can
+20 −1
Original line number Diff line number Diff line
@@ -58,6 +58,25 @@ struct mlxsw_tx_info {
	bool is_emad;
};

struct mlxsw_rx_md_info {
	u32 cookie_index;
	u32 latency;
	u32 tx_congestion;
	union {
		/* Valid when 'tx_port_valid' is set. */
		u16 tx_sys_port;
		u16 tx_lag_id;
	};
	u8 tx_lag_port_index; /* Valid when 'tx_port_is_lag' is set. */
	u8 tx_tc;
	u8 latency_valid:1,
	   tx_congestion_valid:1,
	   tx_tc_valid:1,
	   tx_port_valid:1,
	   tx_port_is_lag:1,
	   unused:3;
};

bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
				  const struct mlxsw_tx_info *tx_info);
int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
@@ -515,7 +534,7 @@ enum mlxsw_devlink_param_id {
struct mlxsw_skb_cb {
	union {
		struct mlxsw_tx_info tx_info;
		u32 cookie_index; /* Only used during receive */
		struct mlxsw_rx_md_info rx_md_info;
	};
};

+54 −1
Original line number Diff line number Diff line
@@ -540,6 +540,55 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
	spin_unlock(&q->lock);
}

static void mlxsw_pci_cqe_rdq_md_tx_port_init(struct sk_buff *skb,
					      const char *cqe)
{
	struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb);

	if (mlxsw_pci_cqe2_tx_lag_get(cqe)) {
		cb->rx_md_info.tx_port_is_lag = true;
		cb->rx_md_info.tx_lag_id = mlxsw_pci_cqe2_tx_lag_id_get(cqe);
		cb->rx_md_info.tx_lag_port_index =
			mlxsw_pci_cqe2_tx_lag_subport_get(cqe);
	} else {
		cb->rx_md_info.tx_port_is_lag = false;
		cb->rx_md_info.tx_sys_port =
			mlxsw_pci_cqe2_tx_system_port_get(cqe);
	}

	if (cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT &&
	    cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_INVALID)
		cb->rx_md_info.tx_port_valid = 1;
	else
		cb->rx_md_info.tx_port_valid = 0;
}

static void mlxsw_pci_cqe_rdq_md_init(struct sk_buff *skb, const char *cqe)
{
	struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb);

	cb->rx_md_info.tx_congestion = mlxsw_pci_cqe2_mirror_cong_get(cqe);
	if (cb->rx_md_info.tx_congestion != MLXSW_PCI_CQE2_MIRROR_CONG_INVALID)
		cb->rx_md_info.tx_congestion_valid = 1;
	else
		cb->rx_md_info.tx_congestion_valid = 0;
	cb->rx_md_info.tx_congestion <<= MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT;

	cb->rx_md_info.latency = mlxsw_pci_cqe2_mirror_latency_get(cqe);
	if (cb->rx_md_info.latency != MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID)
		cb->rx_md_info.latency_valid = 1;
	else
		cb->rx_md_info.latency_valid = 0;

	cb->rx_md_info.tx_tc = mlxsw_pci_cqe2_mirror_tclass_get(cqe);
	if (cb->rx_md_info.tx_tc != MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID)
		cb->rx_md_info.tx_tc_valid = 1;
	else
		cb->rx_md_info.tx_tc_valid = 0;

	mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe);
}

static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
				     struct mlxsw_pci_queue *q,
				     u16 consumer_counter_limit,
@@ -581,11 +630,15 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,

		if (mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2)
			cookie_index = mlxsw_pci_cqe2_user_def_val_orig_pkt_len_get(cqe);
		mlxsw_skb_cb(skb)->cookie_index = cookie_index;
		mlxsw_skb_cb(skb)->rx_md_info.cookie_index = cookie_index;
	} else if (rx_info.trap_id >= MLXSW_TRAP_ID_MIRROR_SESSION0 &&
		   rx_info.trap_id <= MLXSW_TRAP_ID_MIRROR_SESSION7 &&
		   mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) {
		rx_info.mirror_reason = mlxsw_pci_cqe2_mirror_reason_get(cqe);
		mlxsw_pci_cqe_rdq_md_init(skb, cqe);
	} else if (rx_info.trap_id == MLXSW_TRAP_ID_PKT_SAMPLE &&
		   mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) {
		mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe);
	}

	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
+71 −0
Original line number Diff line number Diff line
@@ -173,6 +173,15 @@ MLXSW_ITEM32(pci, cqe, wqe_counter, 0x04, 16, 16);
 */
MLXSW_ITEM32(pci, cqe, byte_count, 0x04, 0, 14);

#define MLXSW_PCI_CQE2_MIRROR_CONG_INVALID	0xFFFF

/* pci_cqe_mirror_cong_high
 * Congestion level in units of 8KB of the egress traffic class of the original
 * packet that does mirroring to the CPU. Value of 0xFFFF means that the
 * congestion level is invalid.
 */
MLXSW_ITEM32(pci, cqe2, mirror_cong_high, 0x08, 16, 4);

/* pci_cqe_trap_id
 * Trap ID that captured the packet.
 */
@@ -208,6 +217,59 @@ MLXSW_ITEM32(pci, cqe0, dqn, 0x0C, 1, 5);
MLXSW_ITEM32(pci, cqe12, dqn, 0x0C, 1, 6);
mlxsw_pci_cqe_item_helpers(dqn, 0, 12, 12);

#define MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID	0x1F

/* pci_cqe_mirror_tclass
 * The egress traffic class of the original packet that does mirroring to the
 * CPU. Value of 0x1F means that the traffic class is invalid.
 */
MLXSW_ITEM32(pci, cqe2, mirror_tclass, 0x10, 27, 5);

/* pci_cqe_tx_lag
 * The Tx port of a packet that is mirrored / sampled to the CPU is a LAG.
 */
MLXSW_ITEM32(pci, cqe2, tx_lag, 0x10, 24, 1);

/* pci_cqe_tx_lag_subport
 * The port index within the LAG of a packet that is mirrored / sampled to the
 * CPU. Reserved when tx_lag is 0.
 */
MLXSW_ITEM32(pci, cqe2, tx_lag_subport, 0x10, 16, 8);

#define MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT	0xFFFE
#define MLXSW_PCI_CQE2_TX_PORT_INVALID		0xFFFF

/* pci_cqe_tx_lag_id
 * The Tx LAG ID of the original packet that is mirrored / sampled to the CPU.
 * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx LAG ID
 * is invalid. Reserved when tx_lag is 0.
 */
MLXSW_ITEM32(pci, cqe2, tx_lag_id, 0x10, 0, 16);

/* pci_cqe_tx_system_port
 * The Tx port of the original packet that is mirrored / sampled to the CPU.
 * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx port is
 * invalid. Reserved when tx_lag is 1.
 */
MLXSW_ITEM32(pci, cqe2, tx_system_port, 0x10, 0, 16);

/* pci_cqe_mirror_cong_low
 * Congestion level in units of 8KB of the egress traffic class of the original
 * packet that does mirroring to the CPU. Value of 0xFFFF means that the
 * congestion level is invalid.
 */
MLXSW_ITEM32(pci, cqe2, mirror_cong_low, 0x14, 20, 12);

#define MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT	13	/* Units of 8KB. */

static inline u16 mlxsw_pci_cqe2_mirror_cong_get(const char *cqe)
{
	u16 cong_high = mlxsw_pci_cqe2_mirror_cong_high_get(cqe);
	u16 cong_low = mlxsw_pci_cqe2_mirror_cong_low_get(cqe);

	return cong_high << 12 | cong_low;
}

/* pci_cqe_user_def_val_orig_pkt_len
 * When trap_id is an ACL: User defined value from policy engine action.
 */
@@ -218,6 +280,15 @@ MLXSW_ITEM32(pci, cqe2, user_def_val_orig_pkt_len, 0x14, 0, 20);
 */
MLXSW_ITEM32(pci, cqe2, mirror_reason, 0x18, 24, 8);

#define MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID	0xFFFFFF

/* pci_cqe_mirror_latency
 * End-to-end latency of the original packet that does mirroring to the CPU.
 * Value of 0xFFFFFF means that the latency is invalid. Units are according to
 * MOGCR.mirror_latency_units.
 */
MLXSW_ITEM32(pci, cqe2, mirror_latency, 0x1C, 8, 24);

/* pci_cqe_owner
 * Ownership bit.
 */
+0 −26
Original line number Diff line number Diff line
@@ -2212,32 +2212,6 @@ void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
	mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port);
}

void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
			     u8 local_port)
{
	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
	struct mlxsw_sp_port_sample *sample;
	u32 size;

	if (unlikely(!mlxsw_sp_port)) {
		dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n",
				     local_port);
		goto out;
	}

	rcu_read_lock();
	sample = rcu_dereference(mlxsw_sp_port->sample);
	if (!sample)
		goto out_unlock;
	size = sample->truncate ? sample->trunc_size : skb->len;
	psample_sample_packet(sample->psample_group, skb, size,
			      mlxsw_sp_port->dev->ifindex, 0, sample->rate);
out_unlock:
	rcu_read_unlock();
out:
	consume_skb(skb);
}

#define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl)	\
	MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action,	\
		  _is_ctrl, SP_##_trap_group, DISCARD)
Loading