Commit 4f948b34 authored by David S. Miller
Browse files

Merge branch 'mctp-core-updates'



Matt Johnston says:

====================
Updates to MCTP core

This series adds timeouts for MCTP tags (a limited resource), and a few
other improvements to the MCTP core.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7c2dcfa2 7b1871af
Loading
Loading
Loading
Loading
+59 −0
Original line number Diff line number Diff line
@@ -211,3 +211,62 @@ remote address is already known, or the message does not require a reply.

Like the send calls, sockets will only receive responses to requests they have
sent (TO=1) and may only respond (TO=0) to requests they have received.

Kernel internals
================

There are a few possible packet flows in the MCTP stack:

1. local TX to remote endpoint, message <= MTU::

	sendmsg()
	 -> mctp_local_output()
	    : route lookup
	    -> rt->output() (== mctp_route_output)
	       -> dev_queue_xmit()

2. local TX to remote endpoint, message > MTU::

	sendmsg()
	-> mctp_local_output()
	    -> mctp_do_fragment_route()
	       : creates packet-sized skbs. For each new skb:
	       -> rt->output() (== mctp_route_output)
	          -> dev_queue_xmit()

3. remote TX to local endpoint, single-packet message::

	mctp_pkttype_receive()
	: route lookup
	-> rt->output() (== mctp_route_input)
	   : sk_key lookup
	   -> sock_queue_rcv_skb()

4. remote TX to local endpoint, multiple-packet message::

	mctp_pkttype_receive()
	: route lookup
	-> rt->output() (== mctp_route_input)
	   : sk_key lookup
	   : stores skb in struct sk_key->reasm_head

	mctp_pkttype_receive()
	: route lookup
	-> rt->output() (== mctp_route_input)
	   : sk_key lookup
	   : finds existing reassembly in sk_key->reasm_head
	   : appends new fragment
	   -> sock_queue_rcv_skb()

Key refcounts
-------------

 * keys are refed by:

   - a skb: during route output, stored in ``skb->cb``.

   - netns and sock lists.

 * keys can be associated with a device, in which case they hold a
   reference to the dev (set through ``key->dev``, counted through
   ``dev->key_count``). Multiple keys can reference the device.
+41 −15
Original line number Diff line number Diff line
@@ -62,35 +62,46 @@ struct mctp_sock {
	 * by sk->net->keys_lock
	 */
	struct hlist_head keys;

	/* mechanism for expiring allocated keys; will release an allocated
	 * tag, and any netdev state for a request/response pairing
	 */
	struct timer_list key_expiry;
};

/* Key for matching incoming packets to sockets or reassembly contexts.
 * Packets are matched on (src,dest,tag).
 *
 * Lifetime requirements:
 * Lifetime / locking requirements:
 *
 *  - individual key data (ie, the struct itself) is protected by key->lock;
 *    changes must be made with that lock held.
 *
 *  - the lookup fields: peer_addr, local_addr and tag are set before the
 *    key is added to lookup lists, and never updated.
 *
 *  - keys are free()ed via RCU
 *  - A ref to the key must be held (through key->refs) if a pointer to the
 *    key is to be accessed after key->lock is released.
 *
 *  - a mctp_sk_key contains a reference to a struct sock; this is valid
 *    for the life of the key. On sock destruction (through unhash), the key is
 *    removed from lists (see below), and will not be observable after a RCU
 *    grace period.
 *
 *    any RX occurring within that grace period may still queue to the socket,
 *    but will hit the SOCK_DEAD case before the socket is freed.
 *    removed from lists (see below), and marked invalid.
 *
 * - these mctp_sk_keys appear on two lists:
 *     1) the struct mctp_sock->keys list
 *     2) the struct netns_mctp->keys list
 *
 *        updates to either list are performed under the netns_mctp->keys
 *        lock.
 *   presence on these lists requires a (single) refcount to be held; both
 *   lists are updated as a single operation.
 *
 *   Updates and lookups in either list are performed under the
 *   netns_mctp->keys lock. Lookup functions will need to lock the key and
 *   take a reference before unlocking the keys_lock. Consequently, the list's
 *   keys_lock *cannot* be acquired with the individual key->lock held.
 *
 * - a key may have a sk_buff attached as part of an in-progress message
 *   reassembly (->reasm_head). The reassembly context is protected by
 *   reasm_lock, which may be acquired with the keys lock (above) held, if
 *   necessary. Consequently, keys lock *cannot* be acquired with the
 *   reasm_lock held.
 *   reassembly (->reasm_head). The reasm data is protected by the individual
 *   key->lock.
 *
 * - there are two destruction paths for a mctp_sk_key:
 *
@@ -101,6 +112,8 @@ struct mctp_sock {
 *      the (complete) reply, or during reassembly errors. Here, we clean up
 *      the reassembly context (marking reasm_dead, to prevent another from
 *      starting), and remove the socket from the netns & socket lists.
 *
 *    - through an expiry timeout, on a per-socket timer
 */
struct mctp_sk_key {
	mctp_eid_t	peer_addr;
@@ -116,14 +129,25 @@ struct mctp_sk_key {
	/* per-socket list */
	struct hlist_node sklist;

	/* lock protects against concurrent updates to the reassembly and
	 * expiry data below.
	 */
	spinlock_t	lock;

	/* Keys are referenced during the output path, which may sleep */
	refcount_t	refs;

	/* incoming fragment reassembly context */
	spinlock_t	reasm_lock;
	struct sk_buff	*reasm_head;
	struct sk_buff	**reasm_tailp;
	bool		reasm_dead;
	u8		last_seq;

	struct rcu_head	rcu;
	/* key validity */
	bool		valid;

	/* expiry timeout; valid (above) cleared on expiry */
	unsigned long	expiry;
};

struct mctp_skb_cb {
@@ -191,6 +215,8 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb);
int mctp_local_output(struct sock *sk, struct mctp_route *rt,
		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);

void mctp_key_unref(struct mctp_sk_key *key);

/* routing <--> device interface */
unsigned int mctp_default_net(struct net *net);
int mctp_default_net_set(struct net *net, unsigned int index);
+5 −0
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
struct mctp_dev {
	struct net_device	*dev;

	refcount_t		refs;

	unsigned int		net;

	/* Only modified under RTNL. Reads have addrs_lock held */
@@ -32,4 +34,7 @@ struct mctp_dev {
struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
struct mctp_dev *__mctp_dev_get(const struct net_device *dev);

void mctp_dev_hold(struct mctp_dev *mdev);
void mctp_dev_put(struct mctp_dev *mdev);

#endif /* __NET_MCTPDEVICE_H */
+75 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mctp

#if !defined(_TRACE_MCTP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MCTP_H

#include <linux/tracepoint.h>

/* Reasons passed to trace_mctp_key_release(); rendered symbolically by
 * __print_symbolic() in the event's TP_printk below.
 *
 * Guarded separately from _TRACE_MCTP_H: trace headers are deliberately
 * re-read (TRACE_HEADER_MULTI_READ) when CREATE_TRACE_POINTS expands them,
 * but the enum itself may only be defined once per translation unit.
 */
#ifndef __TRACE_MCTP_ENUMS
#define __TRACE_MCTP_ENUMS
enum {
	MCTP_TRACE_KEY_TIMEOUT,
	MCTP_TRACE_KEY_REPLIED,
	MCTP_TRACE_KEY_INVALIDATED,
	MCTP_TRACE_KEY_CLOSED,
};
#endif /* __TRACE_MCTP_ENUMS */

/* Export the enum values to the trace event format, so userspace trace
 * tooling can resolve the symbolic names.
 */
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED);
TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED);

/* Fired when a mctp_sk_key is acquired: records the (local, peer, tag)
 * tuple identifying the key. Paired with the mctp_key_release event below.
 */
TRACE_EVENT(mctp_key_acquire,
	TP_PROTO(const struct mctp_sk_key *key),
	TP_ARGS(key),
	TP_STRUCT__entry(
		__field(__u8,	paddr)
		__field(__u8,	laddr)
		__field(__u8,	tag)
	),
	TP_fast_assign(
		__entry->paddr = key->peer_addr;
		__entry->laddr = key->local_addr;
		__entry->tag = key->tag;
	),
	TP_printk("local %d, peer %d, tag %1x",
		__entry->laddr,
		__entry->paddr,
		__entry->tag
	)
);

/* Fired when a mctp_sk_key is released; @reason is one of the
 * MCTP_TRACE_KEY_* values above, printed symbolically.
 */
TRACE_EVENT(mctp_key_release,
	TP_PROTO(const struct mctp_sk_key *key, int reason),
	TP_ARGS(key, reason),
	TP_STRUCT__entry(
		__field(__u8,	paddr)
		__field(__u8,	laddr)
		__field(__u8,	tag)
		__field(int,	reason)
	),
	TP_fast_assign(
		__entry->paddr = key->peer_addr;
		__entry->laddr = key->local_addr;
		__entry->tag = key->tag;
		__entry->reason = reason;
	),
	TP_printk("local %d, peer %d, tag %1x %s",
		__entry->laddr,
		__entry->paddr,
		__entry->tag,
		__print_symbolic(__entry->reason,
				 { MCTP_TRACE_KEY_TIMEOUT, "timeout" },
				 { MCTP_TRACE_KEY_REPLIED, "replied" },
				 { MCTP_TRACE_KEY_INVALIDATED, "invalidated" },
				 { MCTP_TRACE_KEY_CLOSED, "closed" })
	)
);

#endif

#include <trace/define_trace.h>
+58 −8
Original line number Diff line number Diff line
@@ -16,6 +16,9 @@
#include <net/mctpdevice.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mctp.h>

/* socket implementation */

static int mctp_release(struct socket *sock)
@@ -223,16 +226,61 @@ static const struct proto_ops mctp_dgram_ops = {
	.sendpage	= sock_no_sendpage,
};

/* Callback for the per-socket key_expiry timer.
 *
 * Walks the socket's key list, releasing every key whose expiry deadline
 * has passed, then re-arms the timer for the earliest deadline among the
 * keys that remain. If no keys remain, the timer is left unarmed.
 */
static void mctp_sk_expire_keys(struct timer_list *timer)
{
	struct mctp_sock *msk = container_of(timer, struct mctp_sock,
					     key_expiry);
	struct net *net = sock_net(&msk->sk);
	unsigned long next_expiry, flags;
	struct mctp_sk_key *key;
	struct hlist_node *tmp;
	bool next_expiry_valid = false;

	/* both lookup lists are updated under the netns keys_lock; the
	 * individual key->lock nests inside it (see the locking notes on
	 * struct mctp_sk_key).
	 */
	spin_lock_irqsave(&net->mctp.keys_lock, flags);

	hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
		/* key->expiry is read/cleared under key->lock */
		spin_lock(&key->lock);

		if (!time_after_eq(key->expiry, jiffies)) {
			/* deadline passed: mark the key invalid, unlink it
			 * from both the netns and socket lists, and drop the
			 * (single) ref held for list presence.
			 */
			trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT);
			key->valid = false;
			hlist_del_rcu(&key->hlist);
			hlist_del_rcu(&key->sklist);
			spin_unlock(&key->lock);
			mctp_key_unref(key);
			continue;
		}

		/* still live: keep track of the earliest deadline seen so
		 * far, so we know when to fire next.
		 */
		if (next_expiry_valid) {
			if (time_before(key->expiry, next_expiry))
				next_expiry = key->expiry;
		} else {
			next_expiry = key->expiry;
			next_expiry_valid = true;
		}
		spin_unlock(&key->lock);
	}

	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);

	if (next_expiry_valid)
		mod_timer(timer, next_expiry);
}

/* Initialise the MCTP-specific state of a new socket: the key-expiry
 * timer (set up but not armed here) and an empty list of keys.
 */
static int mctp_sk_init(struct sock *sk)
{
	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);

	timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0);
	INIT_HLIST_HEAD(&msk->keys);

	return 0;
}

/* Socket close: stop the key-expiry timer (waiting for any in-flight
 * callback to finish) before releasing the socket, so the timer cannot
 * fire against a socket that is being torn down.
 */
static void mctp_sk_close(struct sock *sk, long timeout)
{
	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);

	del_timer_sync(&msk->key_expiry);
	sk_common_release(sk);
}

@@ -263,21 +311,23 @@ static void mctp_sk_unhash(struct sock *sk)
	/* remove tag allocations */
	spin_lock_irqsave(&net->mctp.keys_lock, flags);
	hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
		hlist_del_rcu(&key->sklist);
		hlist_del_rcu(&key->hlist);
		hlist_del(&key->sklist);
		hlist_del(&key->hlist);

		trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED);

		spin_lock(&key->reasm_lock);
		spin_lock(&key->lock);
		if (key->reasm_head)
			kfree_skb(key->reasm_head);
		key->reasm_head = NULL;
		key->reasm_dead = true;
		spin_unlock(&key->reasm_lock);
		key->valid = false;
		spin_unlock(&key->lock);

		kfree_rcu(key, rcu);
		/* key is no longer on the lookup lists, unref */
		mctp_key_unref(key);
	}
	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);

	synchronize_rcu();
}

static struct proto mctp_proto = {
@@ -385,7 +435,7 @@ static __exit void mctp_exit(void)
	sock_unregister(PF_MCTP);
}

module_init(mctp_init);
subsys_initcall(mctp_init);
module_exit(mctp_exit);

MODULE_DESCRIPTION("MCTP core");
Loading