Commit 4a992bbd authored by Jeremy Kerr's avatar Jeremy Kerr Committed by David S. Miller
Browse files

mctp: Implement message fragmentation & reassembly



This change implements MCTP fragmentation (based on route & device MTU),
and corresponding reassembly.

The MCTP specification only allows for fragmentation on the originating
message endpoint, and reassembly on the destination endpoint -
intermediate nodes do not need to reassemble/refragment.  Consequently,
we only fragment in the local transmit path, and reassemble
locally-bound packets. Messages are required to be in-order, so we
simply cancel reassembly on out-of-order or missing packets.

In the fragmentation path, we just break up the message into MTU-sized
fragments; the skb structure is a simple copy for now, which we can later
improve with a shared data implementation.

For reassembly, we keep track of incoming message fragments using the
existing tag infrastructure, allocating a key on the (src,dest,tag)
tuple, and reassembles matching fragments into a skb->frag_list.

Signed-off-by: default avatarJeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 833ef3b9
Loading
Loading
Loading
Loading
+22 −3
Original line number Diff line number Diff line
@@ -84,9 +84,21 @@ struct mctp_sock {
 *        updates to either list are performed under the netns_mctp->keys
 *        lock.
 *
 * - there is a single destruction path for a mctp_sk_key - through socket
 *   unhash (see mctp_sk_unhash). This performs the list removal under
 *   keys_lock.
 * - a key may have a sk_buff attached as part of an in-progress message
 *   reassembly (->reasm_head). The reassembly context is protected by
 *   reasm_lock, which may be acquired with the keys lock (above) held, if
 *   necessary. Consequently, keys lock *cannot* be acquired with the
 *   reasm_lock held.
 *
 * - there are two destruction paths for a mctp_sk_key:
 *
 *    - through socket unhash (see mctp_sk_unhash). This performs the list
 *      removal under keys_lock.
 *
 *    - where a key is established to receive a reply message: after receiving
 *      the (complete) reply, or during reassembly errors. Here, we clean up
 *      the reassembly context (marking reasm_dead, to prevent another from
 *      starting), and remove the socket from the netns & socket lists.
 */
struct mctp_sk_key {
	mctp_eid_t	peer_addr;
@@ -102,6 +114,13 @@ struct mctp_sk_key {
	/* per-socket list */
	struct hlist_node sklist;

	/* incoming fragment reassembly context */
	spinlock_t	reasm_lock;
	struct sk_buff	*reasm_head;
	struct sk_buff	**reasm_tailp;
	bool		reasm_dead;
	u8		last_seq;

	struct rcu_head	rcu;
};

+8 −0
Original line number Diff line number Diff line
@@ -263,6 +263,14 @@ static void mctp_sk_unhash(struct sock *sk)
	hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
		hlist_del_rcu(&key->sklist);
		hlist_del_rcu(&key->hlist);

		spin_lock(&key->reasm_lock);
		if (key->reasm_head)
			kfree_skb(key->reasm_head);
		key->reasm_head = NULL;
		key->reasm_dead = true;
		spin_unlock(&key->reasm_lock);

		kfree_rcu(key, rcu);
	}
	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+331 −41
Original line number Diff line number Diff line
@@ -23,6 +23,8 @@
#include <net/netlink.h>
#include <net/sock.h>

static const unsigned int mctp_message_maxlen = 64 * 1024;

/* route output callbacks */
static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
{
@@ -105,14 +107,125 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
	return ret;
}

static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
					  mctp_eid_t local, mctp_eid_t peer,
					  u8 tag, gfp_t gfp)
{
	struct mctp_sk_key *key;

	key = kzalloc(sizeof(*key), gfp);
	if (!key)
		return NULL;

	key->peer_addr = peer;
	key->local_addr = local;
	key->tag = tag;
	key->sk = &msk->sk;
	spin_lock_init(&key->reasm_lock);

	return key;
}

static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
{
	struct net *net = sock_net(&msk->sk);
	struct mctp_sk_key *tmp;
	unsigned long flags;
	int rc = 0;

	spin_lock_irqsave(&net->mctp.keys_lock, flags);

	hlist_for_each_entry(tmp, &net->mctp.keys, hlist) {
		if (mctp_key_match(tmp, key->local_addr, key->peer_addr,
				   key->tag)) {
			rc = -EEXIST;
			break;
		}
	}

	if (!rc) {
		hlist_add_head(&key->hlist, &net->mctp.keys);
		hlist_add_head(&key->sklist, &msk->keys);
	}

	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);

	return rc;
}

/* Must be called with key->reasm_lock, which it will release. Will schedule
 * the key for an RCU free.
 */
static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net,
				   unsigned long flags)
	__releases(&key->reasm_lock)
{
	struct sk_buff *skb;

	skb = key->reasm_head;
	key->reasm_head = NULL;
	key->reasm_dead = true;
	spin_unlock_irqrestore(&key->reasm_lock, flags);

	spin_lock_irqsave(&net->mctp.keys_lock, flags);
	hlist_del_rcu(&key->hlist);
	hlist_del_rcu(&key->sklist);
	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
	kfree_rcu(key, rcu);

	if (skb)
		kfree_skb(skb);
}

static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
{
	struct mctp_hdr *hdr = mctp_hdr(skb);
	u8 exp_seq, this_seq;

	this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT)
		& MCTP_HDR_SEQ_MASK;

	if (!key->reasm_head) {
		key->reasm_head = skb;
		key->reasm_tailp = &(skb_shinfo(skb)->frag_list);
		key->last_seq = this_seq;
		return 0;
	}

	exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK;

	if (this_seq != exp_seq)
		return -EINVAL;

	if (key->reasm_head->len + skb->len > mctp_message_maxlen)
		return -EINVAL;

	skb->next = NULL;
	skb->sk = NULL;
	*key->reasm_tailp = skb;
	key->reasm_tailp = &skb->next;

	key->last_seq = this_seq;

	key->reasm_head->data_len += skb->len;
	key->reasm_head->len += skb->len;
	key->reasm_head->truesize += skb->truesize;

	return 0;
}

static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct mctp_sk_key *key;
	struct mctp_sock *msk;
	struct mctp_hdr *mh;
	unsigned long f;
	u8 tag, flags;
	int rc;

	msk = NULL;
	rc = -EINVAL;

	/* we may be receiving a locally-routed packet; drop source sk
	 * accounting
@@ -121,50 +234,144 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)

	/* ensure we have enough data for a header and a type */
	if (skb->len < sizeof(struct mctp_hdr) + 1)
		goto drop;
		goto out;

	/* grab header, advance data ptr */
	mh = mctp_hdr(skb);
	skb_pull(skb, sizeof(struct mctp_hdr));

	if (mh->ver != 1)
		goto drop;
		goto out;

	/* TODO: reassembly */
	if ((mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
				!= (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
		goto drop;
	flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM);
	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);

	rcu_read_lock();
	/* 1. lookup socket matching (src,dest,tag) */

	/* lookup socket / reasm context, exactly matching (src,dest,tag) */
	key = mctp_lookup_key(net, skb, mh->src);

	/* 2. lookup socket macthing (BCAST,dest,tag) */
	if (!key)
	if (flags & MCTP_HDR_FLAG_SOM) {
		if (key) {
			msk = container_of(key->sk, struct mctp_sock, sk);
		} else {
			/* first response to a broadcast? do a more general
			 * key lookup to find the socket, but don't use this
			 * key for reassembly - we'll create a more specific
			 * one for future packets if required (ie, !EOM).
			 */
			key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY);
			if (key) {
				msk = container_of(key->sk,
						   struct mctp_sock, sk);
				key = NULL;
			}
		}

	/* 3. SOM? -> lookup bound socket, conditionally (!EOM) create
	 * mapping for future (1)/(2).
	 */
	if (key)
		msk = container_of(key->sk, struct mctp_sock, sk);
	else if (!msk && (mh->flags_seq_tag & MCTP_HDR_FLAG_SOM))
		if (!key && !msk && (tag & MCTP_HDR_FLAG_TO))
			msk = mctp_lookup_bind(net, skb);

	if (!msk)
		goto unlock_drop;
		if (!msk) {
			rc = -ENOENT;
			goto out_unlock;
		}

		/* single-packet message? deliver to socket, clean up any
		 * pending key.
		 */
		if (flags & MCTP_HDR_FLAG_EOM) {
			sock_queue_rcv_skb(&msk->sk, skb);
			if (key) {
				spin_lock_irqsave(&key->reasm_lock, f);
				/* we've hit a pending reassembly; not much we
				 * can do but drop it
				 */
				__mctp_key_unlock_drop(key, net, f);
			}
			rc = 0;
			goto out_unlock;
		}

	rcu_read_unlock();
		/* broadcast response or a bind() - create a key for further
		 * packets for this message
		 */
		if (!key) {
			key = mctp_key_alloc(msk, mh->dest, mh->src,
					     tag, GFP_ATOMIC);
			if (!key) {
				rc = -ENOMEM;
				goto out_unlock;
			}

	return 0;
			/* we can queue without the reasm lock here, as the
			 * key isn't observable yet
			 */
			mctp_frag_queue(key, skb);

			/* if the key_add fails, we've raced with another
			 * SOM packet with the same src, dest and tag. There's
			 * no way to distinguish future packets, so all we
			 * can do is drop; we'll free the skb on exit from
			 * this function.
			 */
			rc = mctp_key_add(key, msk);
			if (rc)
				kfree(key);

		} else {
			/* existing key: start reassembly */
			spin_lock_irqsave(&key->reasm_lock, f);

			if (key->reasm_head || key->reasm_dead) {
				/* duplicate start? drop everything */
				__mctp_key_unlock_drop(key, net, f);
				rc = -EEXIST;
			} else {
				rc = mctp_frag_queue(key, skb);
				spin_unlock_irqrestore(&key->reasm_lock, f);
			}
		}

	} else if (key) {
		/* this packet continues a previous message; reassemble
		 * using the message-specific key
		 */

		spin_lock_irqsave(&key->reasm_lock, f);

unlock_drop:
		/* we need to be continuing an existing reassembly... */
		if (!key->reasm_head)
			rc = -EINVAL;
		else
			rc = mctp_frag_queue(key, skb);

		/* end of message? deliver to socket, and we're done with
		 * the reassembly/response key
		 */
		if (!rc && flags & MCTP_HDR_FLAG_EOM) {
			sock_queue_rcv_skb(key->sk, key->reasm_head);
			key->reasm_head = NULL;
			__mctp_key_unlock_drop(key, net, f);
		} else {
			spin_unlock_irqrestore(&key->reasm_lock, f);
		}

	} else {
		/* not a start, no matching key */
		rc = -ENOENT;
	}

out_unlock:
	rcu_read_unlock();
drop:
out:
	if (rc)
		kfree_skb(skb);
	return 0;
	return rc;
}

static unsigned int mctp_route_mtu(struct mctp_route *rt)
{
	return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
}

static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
@@ -228,8 +435,6 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,

	lockdep_assert_held(&mns->keys_lock);

	key->sk = &msk->sk;

	/* we hold the net->key_lock here, allowing updates to both
	 * then net and sk
	 */
@@ -251,11 +456,9 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk,
	u8 tagbits;

	/* be optimistic, alloc now */
	key = kzalloc(sizeof(*key), GFP_KERNEL);
	key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL);
	if (!key)
		return -ENOMEM;
	key->local_addr = saddr;
	key->peer_addr = daddr;

	/* 8 possible tag values */
	tagbits = 0xff;
@@ -340,6 +543,86 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
	return rc;
}

static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
				  unsigned int mtu, u8 tag)
{
	const unsigned int hlen = sizeof(struct mctp_hdr);
	struct mctp_hdr *hdr, *hdr2;
	unsigned int pos, size;
	struct sk_buff *skb2;
	int rc;
	u8 seq;

	hdr = mctp_hdr(skb);
	seq = 0;
	rc = 0;

	if (mtu < hlen + 1) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* we've got the header */
	skb_pull(skb, hlen);

	for (pos = 0; pos < skb->len;) {
		/* size of message payload */
		size = min(mtu - hlen, skb->len - pos);

		skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL);
		if (!skb2) {
			rc = -ENOMEM;
			break;
		}

		/* generic skb copy */
		skb2->protocol = skb->protocol;
		skb2->priority = skb->priority;
		skb2->dev = skb->dev;
		memcpy(skb2->cb, skb->cb, sizeof(skb2->cb));

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/* establish packet */
		skb_reserve(skb2, MCTP_HEADER_MAXLEN);
		skb_reset_network_header(skb2);
		skb_put(skb2, hlen + size);
		skb2->transport_header = skb2->network_header + hlen;

		/* copy header fields, calculate SOM/EOM flags & seq */
		hdr2 = mctp_hdr(skb2);
		hdr2->ver = hdr->ver;
		hdr2->dest = hdr->dest;
		hdr2->src = hdr->src;
		hdr2->flags_seq_tag = tag &
			(MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);

		if (pos == 0)
			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM;

		if (pos + size == skb->len)
			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM;

		hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT;

		/* copy message payload */
		skb_copy_bits(skb, pos, skb_transport_header(skb2), size);

		/* do route, but don't drop the rt reference */
		rc = rt->output(rt, skb2);
		if (rc)
			break;

		seq = (seq + 1) & MCTP_HDR_SEQ_MASK;
		pos += size;
	}

	mctp_route_release(rt);
	consume_skb(skb);
	return rc;
}

int mctp_local_output(struct sock *sk, struct mctp_route *rt,
		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
{
@@ -347,6 +630,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
	struct mctp_skb_cb *cb = mctp_cb(skb);
	struct mctp_hdr *hdr;
	unsigned long flags;
	unsigned int mtu;
	mctp_eid_t saddr;
	int rc;
	u8 tag;
@@ -376,26 +660,32 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
		tag = req_tag;
	}

	/* TODO: we have the route MTU here; packetise */

	skb->protocol = htons(ETH_P_MCTP);
	skb->priority = 0;
	skb_reset_transport_header(skb);
	skb_push(skb, sizeof(struct mctp_hdr));
	skb_reset_network_header(skb);
	skb->dev = rt->dev->dev;

	/* cb->net will have been set on initial ingress */
	cb->src = saddr;

	/* set up common header fields */
	hdr = mctp_hdr(skb);
	hdr->ver = 1;
	hdr->dest = daddr;
	hdr->src = saddr;
	hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | /* TODO */
		tag;

	skb->dev = rt->dev->dev;
	skb->protocol = htons(ETH_P_MCTP);
	skb->priority = 0;

	/* cb->net will have been set on initial ingress */
	cb->src = saddr;
	mtu = mctp_route_mtu(rt);

	if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
		hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM |
			tag;
		return mctp_do_route(rt, skb);
	} else {
		return mctp_do_fragment_route(rt, skb, mtu, tag);
	}
}

/* route management */