Commit 7c989b1d authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux

Pull passthrough updates from Jens Axboe:
 "With these changes, passthrough NVMe support over io_uring now
  performs at the same level as block device O_DIRECT, and in many cases
  6-8% better.

  This contains:

   - Add support for fixed buffers for passthrough (Anuj, Kanchan)

   - Enable batched allocations and freeing on passthrough, similarly to
     what we support on the normal storage path (me)

   - Fix from Geert fixing an issue with !CONFIG_IO_URING"

* tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux:
  io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy
  nvme: wire up fixed buffer support for nvme passthrough
  nvme: pass ubuffer as an integer
  block: extend functionality to map bvec iterator
  block: factor out blk_rq_map_bio_alloc helper
  block: rename bio_map_put to blk_mq_map_bio_put
  nvme: refactor nvme_alloc_request
  nvme: refactor nvme_add_user_metadata
  nvme: Use blk_rq_map_user_io helper
  scsi: Use blk_rq_map_user_io helper
  block: add blk_rq_map_user_io
  io_uring: introduce fixed buffer support for io_uring_cmd
  io_uring: add io_uring_cmd_import_fixed
  nvme: enable batched completions of passthrough IO
  nvme: split out metadata vs non metadata end_io uring_cmd completions
  block: allow end_io based requests in the completion batch handling
  block: change request end_io handler to pass back a return value
  block: enable batched allocation for blk_mq_alloc_request()
  block: kill deprecated BUG_ON() in the flush handling
parents 51338980 0e0abad2
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -205,7 +205,6 @@ static void blk_flush_complete_seq(struct request *rq,
		 * flush data request completion path.  Restore @rq for
		 * normal completion and end it.
		 */
		BUG_ON(!list_empty(&rq->queuelist));
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
		blk_mq_end_request(rq, error);
@@ -218,7 +217,8 @@ static void blk_flush_complete_seq(struct request *rq,
	blk_kick_flush(q, fq, cmd_flags);
}

static void flush_end_io(struct request *flush_rq, blk_status_t error)
static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
				       blk_status_t error)
{
	struct request_queue *q = flush_rq->q;
	struct list_head *running;
@@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
	if (!req_ref_put_and_test(flush_rq)) {
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
		return;
		return RQ_END_IO_NONE;
	}

	blk_account_io_flush(flush_rq);
@@ -269,6 +269,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
	}

	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
	return RQ_END_IO_NONE;
}

bool is_flush_rq(struct request *rq)
@@ -354,7 +355,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
	blk_flush_queue_rq(flush_rq, false);
}

static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
					       blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -376,6 +378,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);

	blk_mq_sched_restart(hctx);
	return RQ_END_IO_NONE;
}

/**
+132 −18
Original line number Diff line number Diff line
@@ -231,7 +231,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
	return ret;
}

static void bio_map_put(struct bio *bio)
static void blk_mq_map_bio_put(struct bio *bio)
{
	if (bio->bi_opf & REQ_ALLOC_CACHE) {
		bio_put(bio);
@@ -241,17 +241,10 @@ static void bio_map_put(struct bio *bio)
	}
}

static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
		gfp_t gfp_mask)
static struct bio *blk_rq_map_bio_alloc(struct request *rq,
		unsigned int nr_vecs, gfp_t gfp_mask)
{
	unsigned int max_sectors = queue_max_hw_sectors(rq->q);
	unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
	struct bio *bio;
	int ret;
	int j;

	if (!iov_iter_count(iter))
		return -EINVAL;

	if (rq->cmd_flags & REQ_POLLED) {
		blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE;
@@ -259,13 +252,31 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
		bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask,
					&fs_bio_set);
		if (!bio)
			return -ENOMEM;
			return NULL;
	} else {
		bio = bio_kmalloc(nr_vecs, gfp_mask);
		if (!bio)
			return -ENOMEM;
			return NULL;
		bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
	}
	return bio;
}

static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
		gfp_t gfp_mask)
{
	unsigned int max_sectors = queue_max_hw_sectors(rq->q);
	unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
	struct bio *bio;
	int ret;
	int j;

	if (!iov_iter_count(iter))
		return -EINVAL;

	bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
	if (bio == NULL)
		return -ENOMEM;

	while (iov_iter_count(iter)) {
		struct page **pages, *stack_pages[UIO_FASTIOV];
@@ -331,7 +342,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,

 out_unmap:
	bio_release_pages(bio, false);
	bio_map_put(bio);
	blk_mq_map_bio_put(bio);
	return ret;
}

@@ -537,6 +548,62 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(blk_rq_append_bio);

/* Prepare bio for passthrough IO given ITER_BVEC iter */
static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
{
	struct request_queue *q = rq->q;
	size_t nr_iter = iov_iter_count(iter);
	size_t nr_segs = iter->nr_segs;
	struct bio_vec *bvecs, *bvprvp = NULL;
	struct queue_limits *lim = &q->limits;
	unsigned int nsegs = 0, bytes = 0;
	struct bio *bio;
	size_t i;

	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
		return -EINVAL;
	if (nr_segs > queue_max_segments(q))
		return -EINVAL;

	/* no iovecs to alloc, as we already have a BVEC iterator */
	bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
	if (bio == NULL)
		return -ENOMEM;

	bio_iov_bvec_set(bio, (struct iov_iter *)iter);
	blk_rq_bio_prep(rq, bio, nr_segs);

	/* loop to perform a bunch of sanity checks */
	bvecs = (struct bio_vec *)iter->bvec;
	for (i = 0; i < nr_segs; i++) {
		struct bio_vec *bv = &bvecs[i];

		/*
		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, fallback to copy.
		 */
		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
			blk_mq_map_bio_put(bio);
			return -EREMOTEIO;
		}
		/* check full condition */
		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
			goto put_bio;
		if (bytes + bv->bv_len > nr_iter)
			goto put_bio;
		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
			goto put_bio;

		nsegs++;
		bytes += bv->bv_len;
		bvprvp = bv;
	}
	return 0;
put_bio:
	blk_mq_map_bio_put(bio);
	return -EINVAL;
}

/**
 * blk_rq_map_user_iov - map user data to a request, for passthrough requests
 * @q:		request queue where request should be inserted
@@ -556,24 +623,35 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
			struct rq_map_data *map_data,
			const struct iov_iter *iter, gfp_t gfp_mask)
{
	bool copy = false;
	bool copy = false, map_bvec = false;
	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
	struct bio *bio = NULL;
	struct iov_iter i;
	int ret = -EINVAL;

	if (!iter_is_iovec(iter))
		goto fail;

	if (map_data)
		copy = true;
	else if (blk_queue_may_bounce(q))
		copy = true;
	else if (iov_iter_alignment(iter) & align)
		copy = true;
	else if (iov_iter_is_bvec(iter))
		map_bvec = true;
	else if (!iter_is_iovec(iter))
		copy = true;
	else if (queue_virt_boundary(q))
		copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);

	if (map_bvec) {
		ret = blk_rq_map_user_bvec(rq, iter);
		if (!ret)
			return 0;
		if (ret != -EREMOTEIO)
			goto fail;
		/* fall back to copying the data on limits mismatches */
		copy = true;
	}

	i = *iter;
	do {
		if (copy)
@@ -611,6 +689,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_user);

int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data,
		void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask,
		bool vec, int iov_count, bool check_iter_count, int rw)
{
	int ret = 0;

	if (vec) {
		struct iovec fast_iov[UIO_FASTIOV];
		struct iovec *iov = fast_iov;
		struct iov_iter iter;

		ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len,
				UIO_FASTIOV, &iov, &iter);
		if (ret < 0)
			return ret;

		if (iov_count) {
			/* SG_IO howto says that the shorter of the two wins */
			iov_iter_truncate(&iter, buf_len);
			if (check_iter_count && !iov_iter_count(&iter)) {
				kfree(iov);
				return -EINVAL;
			}
		}

		ret = blk_rq_map_user_iov(req->q, req, map_data, &iter,
				gfp_mask);
		kfree(iov);
	} else if (buf_len) {
		ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len,
				gfp_mask);
	}
	return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_io);

/**
 * blk_rq_unmap_user - unmap a request with user data
 * @bio:	       start of bio list
@@ -636,7 +750,7 @@ int blk_rq_unmap_user(struct bio *bio)

		next_bio = bio;
		bio = bio->bi_next;
		bio_map_put(next_bio);
		blk_mq_map_bio_put(next_bio);
	}

	return ret;
+91 −16
Original line number Diff line number Diff line
@@ -510,16 +510,77 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
					alloc_time_ns);
}

static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
					    struct blk_plug *plug,
					    blk_opf_t opf,
					    blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= opf,
		.nr_tags	= plug->nr_ios,
		.cached_rq	= &plug->cached_rq,
	};
	struct request *rq;

	if (blk_queue_enter(q, flags))
		return NULL;

	plug->nr_ios = 1;

	rq = __blk_mq_alloc_requests(&data);
	if (unlikely(!rq))
		blk_queue_exit(q);
	return rq;
}

static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
						   blk_opf_t opf,
						   blk_mq_req_flags_t flags)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug)
		return NULL;
	if (rq_list_empty(plug->cached_rq)) {
		if (plug->nr_ios == 1)
			return NULL;
		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
		if (rq)
			goto got_it;
		return NULL;
	}
	rq = rq_list_peek(&plug->cached_rq);
	if (!rq || rq->q != q)
		return NULL;

	if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
		return NULL;
	if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
		return NULL;

	plug->cached_rq = rq_list_next(rq);
got_it:
	rq->cmd_flags = opf;
	INIT_LIST_HEAD(&rq->queuelist);
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags)
{
	struct request *rq;

	rq = blk_mq_alloc_cached_request(q, opf, flags);
	if (!rq) {
		struct blk_mq_alloc_data data = {
			.q		= q,
			.flags		= flags,
			.cmd_flags	= opf,
			.nr_tags	= 1,
		};
	struct request *rq;
		int ret;

		ret = blk_queue_enter(q, flags);
@@ -529,6 +590,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		rq = __blk_mq_alloc_requests(&data);
		if (!rq)
			goto out_queue_exit;
	}
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
@@ -761,9 +823,11 @@ static void blk_complete_request(struct request *req)
	 * can find how many bytes remain in the request
	 * later.
	 */
	if (!req->end_io) {
		req->bio = NULL;
		req->__data_len = 0;
	}
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
@@ -939,7 +1003,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
			blk_mq_free_request(rq);
	} else {
		blk_mq_free_request(rq);
	}
@@ -992,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)

		rq_qos_done(rq->q, rq);

		/*
		 * If end_io handler returns NONE, then it still has
		 * ownership of the request.
		 */
		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
			continue;

		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		if (!req_ref_put_and_test(rq))
			continue;
@@ -1233,12 +1305,13 @@ struct blk_rq_wait {
	blk_status_t ret;
};

static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
{
	struct blk_rq_wait *wait = rq->end_io_data;

	wait->ret = ret;
	complete(&wait->done);
	return RQ_END_IO_NONE;
}

bool blk_rq_is_poll(struct request *rq)
@@ -1472,11 +1545,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)

void blk_mq_put_rq_ref(struct request *rq)
{
	if (is_flush_rq(rq))
		rq->end_io(rq, 0);
	else if (req_ref_put_and_test(rq))
	if (is_flush_rq(rq)) {
		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
			blk_mq_free_request(rq);
	} else if (req_ref_put_and_test(rq)) {
		__blk_mq_free_request(rq);
	}
}

static bool blk_mq_check_expired(struct request *rq, void *priv)
{
+3 −1
Original line number Diff line number Diff line
@@ -292,11 +292,13 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
	dm_complete_request(rq, error);
}

static void end_clone_request(struct request *clone, blk_status_t error)
static enum rq_end_io_ret end_clone_request(struct request *clone,
					    blk_status_t error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	dm_complete_request(tio->orig, error);
	return RQ_END_IO_NONE;
}

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+4 −2
Original line number Diff line number Diff line
@@ -1172,7 +1172,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
}

static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
						 blk_status_t status)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;
	unsigned long flags;
@@ -1184,7 +1185,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n",
				status);
		return;
		return RQ_END_IO_NONE;
	}

	ctrl->comp_seen = false;
@@ -1195,6 +1196,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
	spin_unlock_irqrestore(&ctrl->lock, flags);
	if (startka)
		nvme_queue_keep_alive_work(ctrl);
	return RQ_END_IO_NONE;
}

static void nvme_keep_alive_work(struct work_struct *work)
Loading