Commit 8c052fb3 authored by Jens Axboe
Browse files

iomap: support IOCB_DIO_CALLER_COMP



If IOCB_DIO_CALLER_COMP is set, use it to set the kiocb->dio_complete
handler and the data for that callback. Rather than punting the completion
to a workqueue, we pass the handler and data back to the issuer and will
get a callback from a safe task context.

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=$depth

shows the following results before and after this patch:

	Stock	Patched		Diff
=======================================
QD1	155K	162K		+ 4.5%
QD2	290K	313K		+ 7.9%
QD4	533K	597K		+12.0%
QD8	604K	827K		+36.9%
QD16	615K	845K		+37.4%

which shows nice wins all around. If we factored in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises,
as the offloaded workqueue completions run out of steam.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 099ada2c
Loading
Loading
Loading
Loading
+60 −2
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
/*
 * Set for writes when the issuer passed IOCB_DIO_CALLER_COMP: the bio end_io
 * path then hands completion back to the issuer via iocb->dio_complete
 * instead of punting to a workqueue. Cleared again when completion would
 * need further IO (zeroing, a sync without FUA, or a write at/after i_size).
 */
#define IOMAP_DIO_CALLER_COMP	(1U << 26)
/*
 * NOTE(review): presumably marks a dio that may be completed inline from the
 * bio end_io handler; only its use in the polled-IO check is visible here --
 * confirm against the rest of the file.
 */
#define IOMAP_DIO_INLINE_COMP	(1U << 27)
/*
 * Tested together with the device's FUA / write-cache capabilities to decide
 * whether the write can be issued with FUA (avoiding a cache flush on
 * completion).
 */
#define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
/*
 * The dio needs a sync at completion. When FUA cannot be used this rules out
 * caller-side completion, as the sync is a blocking fsync / cache flush.
 */
#define IOMAP_DIO_NEED_SYNC	(1U << 29)
@@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);

/*
 * Adapter with the untyped signature assigned to iocb->dio_complete.
 *
 * @data: opaque cookie; presumably the struct iomap_dio stored in
 *        iocb->private by the bio end_io path (confirm against the issuer).
 *
 * Forwards to iomap_dio_complete() and returns its result unchanged.
 */
static ssize_t iomap_dio_deferred_complete(void *data)
{
	struct iomap_dio *dio = data;

	return iomap_dio_complete(dio);
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -182,6 +188,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
		goto release_bio;
	}

	/*
	 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
	 * our completion that way to avoid an async punt to a workqueue.
	 */
	if (dio->flags & IOMAP_DIO_CALLER_COMP) {
		/* only polled IO cares about private cleared */
		iocb->private = dio;
		iocb->dio_complete = iomap_dio_deferred_complete;

		/*
		 * Invoke ->ki_complete() directly. We've assigned our
		 * dio_complete callback handler, and since the issuer set
		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
		 * notice ->dio_complete being set and will defer calling that
		 * handler until it can be done from a safe task context.
		 *
		 * Note that the 'res' being passed in here is not important
		 * for this case. The actual completion value of the request
		 * will be gotten from dio_complete when that is run by the
		 * issuer.
		 */
		iocb->ki_complete(iocb, 0);
		goto release_bio;
	}

	/*
	 * Async DIO completion that requires filesystem level completion work
	 * gets punted to a work queue to complete as the operation may require
@@ -278,12 +309,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device either supports FUA or doesn't have
		 * a volatile write cache. This allows us to avoid cache flushes
		 * on IO completion.
		 * on IO completion. If we can't use writethrough and need to
		 * sync, disable in-task completions as dio completion will
		 * need to call generic_write_sync() which will do a blocking
		 * fsync / cache flush call.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
			use_fua = true;
		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	}

	/*
@@ -298,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		goto out;

	/*
	 * We can only poll for single bio I/Os.
	 * We can only do deferred completion for pure overwrites that
	 * don't require additional IO at completion. This rules out
	 * writes that need zeroing or extent conversion, extend
	 * the file size, or issue journal IO or cache flushes
	 * during completion processing.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;

	/*
	 * The rules for polled IO completions follow the same guidelines as
	 * the ones we set for inline and deferred completions. If none of
	 * those are available for this IO, clear the polled flag.
	 */
	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
@@ -547,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/*
		 * Flag as supporting deferred completions, if the issuer
		 * groks it. This can avoid a workqueue punt for writes.
		 * We may later clear this flag if we need to do other IO
		 * as part of this IO completion.
		 */
		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
			dio->flags |= IOMAP_DIO_CALLER_COMP;

		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
			ret = -EAGAIN;
			if (iomi.pos >= dio->i_size ||