Commit 0eb79294 authored by Josef Bacik, committed by David Sterba
Browse files

btrfs: dio iomap DSYNC workaround



iomap dio will run generic_write_sync() for us if the iocb is DSYNC.
This is problematic for us because of 2 reasons:

1. we hold the inode_lock() during this operation, and we take it in
   generic_write_sync()
2. we hold a read lock on the dio_sem but take the write lock in fsync

Since we don't want to rip out this code right now, but reworking the
locking is a bit much to do at this point, work around this problem with
this masterpiece of a patch.

First, we clear DSYNC on the iocb so that the iomap stuff doesn't know
that it needs to handle the sync.  We save this fact in
current->journal_info, because we need to do special things once
we're in iomap_begin, and we have no way to pass private information
into iomap_dio_rw().

Next we specify a separate iomap_dio_ops for sync, which implements an
->end_io() callback that gets called when the dio completes.  This is
important for AIO, because we really do need to run generic_write_sync()
if we complete asynchronously.  However if we're still in the submitting
context when we enter ->end_io() we clear the flag so that the submitter
knows they're the one that needs to run generic_write_sync().

This is meant to be temporary.  We need to work out how to eliminate the
inode_lock() and the dio_sem in our fsync and use another mechanism to
protect these operations.

Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent f85781fb
Loading
Loading
Loading
Loading
+33 −0
Original line number Diff line number Diff line
@@ -2023,7 +2023,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
		atomic_inc(&BTRFS_I(inode)->sync_writers);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * 1. We must always clear IOCB_DSYNC in order to not deadlock
		 *    in iomap, as it calls generic_write_sync() in this case.
		 * 2. If we are async, we can call iomap_dio_complete() either
		 *    in
		 *
		 *    2.1. A worker thread from the last bio completed.  In this
		 *	   case we need to mark the btrfs_dio_data that it is
		 *	   async in order to call generic_write_sync() properly.
		 *	   This is handled by setting BTRFS_DIO_SYNC_STUB in the
		 *	   current->journal_info.
		 *    2.2  The submitter context, because all IO completed
		 *         before we exited iomap_dio_rw().  In this case we can
		 *         just re-set the IOCB_DSYNC on the iocb and we'll do
		 *         the sync below.  If our ->end_io() gets called and
		 *         current->journal_info is set, then we know we're in
		 *         our current context and we will clear
		 *         current->journal_info to indicate that we need to
		 *         sync below.
		 */
		if (sync) {
			ASSERT(current->journal_info == NULL);
			iocb->ki_flags &= ~IOCB_DSYNC;
			current->journal_info = BTRFS_DIO_SYNC_STUB;
		}
		num_written = __btrfs_direct_write(iocb, from);

		/*
		 * As stated above, we cleared journal_info, so we need to do
		 * the sync ourselves.
		 */
		if (sync && current->journal_info == NULL)
			iocb->ki_flags |= IOCB_DSYNC;
		current->journal_info = NULL;
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		if (num_written > 0)
+60 −2
Original line number Diff line number Diff line
@@ -62,6 +62,7 @@ struct btrfs_dio_data {
	loff_t length;
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	bool sync;
};

static const struct inode_operations btrfs_dir_inode_operations;
@@ -7337,6 +7338,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
	int ret = 0;
	u64 len = length;
	bool unlock_extents = false;
	bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);

	/*
	 * We used current->journal_info here to see if we were sync, but
	 * there's a lot of tests in the enospc machinery to not do flushing if
	 * we have a journal_info set, so we need to clear this out and re-set
	 * it in iomap_end.
	 */
	ASSERT(current->journal_info == NULL ||
	       current->journal_info == BTRFS_DIO_SYNC_STUB);
	current->journal_info = NULL;

	if (!write)
		len = min_t(u64, len, fs_info->sectorsize);
@@ -7362,6 +7374,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
	if (!dio_data)
		return -ENOMEM;

	dio_data->sync = sync;
	dio_data->length = length;
	if (write) {
		dio_data->reserve = round_up(length, fs_info->sectorsize);
@@ -7509,6 +7522,14 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		extent_changeset_free(dio_data->data_reserved);
	}
out:
	/*
	 * We're all done, we can re-set the current->journal_info now safely
	 * for our endio.
	 */
	if (dio_data->sync) {
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_DIO_SYNC_STUB;
	}
	kfree(dio_data);
	iomap->private = NULL;

@@ -7917,6 +7938,30 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
	return retval;
}

/*
 * ->end_io() callback used for direct IO writes that were issued as DSYNC.
 *
 * @iocb:  the kiocb the direct IO was submitted with
 * @size:  byte count handed to generic_write_sync() when a sync is needed
 * @error: completion status of the direct IO; returned as-is when non-zero
 * @flags: unused here
 *
 * Two cases are distinguished through current->journal_info:
 *
 * - It still holds BTRFS_DIO_SYNC_STUB: we are running in the submitter's
 *   context, where generic_write_sync() cannot be run safely.  The marker is
 *   cleared so that the submitter knows it has to follow up with the sync
 *   itself, and @error is passed straight back.
 *
 * - Otherwise: IOCB_DSYNC is put back on the iocb and generic_write_sync()
 *   is run from here, provided there was no error and @size is non-zero.
 *
 * Returns @error if it is set, the result of generic_write_sync() when a
 * sync was performed, and 0 otherwise.
 */
static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
					   int error, unsigned flags)
{
	const bool in_submitter =
		(current->journal_info == BTRFS_DIO_SYNC_STUB);

	if (in_submitter) {
		/* Hand the responsibility for syncing back to the caller. */
		current->journal_info = NULL;
		return error;
	}

	/*
	 * Nothing to sync on failure or when size is zero; in the latter
	 * case error is 0, so this returns 0 as well.
	 */
	if (error || !size)
		return error;

	iocb->ki_flags |= IOCB_DSYNC;
	return generic_write_sync(iocb, size);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin            = btrfs_dio_iomap_begin,
	.iomap_end              = btrfs_dio_iomap_end,
@@ -7926,6 +7971,11 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io		= btrfs_submit_direct,
};

/*
 * iomap direct IO ops for DSYNC writes: same submission hook as
 * btrfs_dio_ops, plus an ->end_io() that decides whether the sync is run at
 * completion time or left to the submitter.
 */
static const struct iomap_dio_ops btrfs_sync_dops = {
	.end_io = btrfs_maybe_fsync_end_io,
	.submit_io = btrfs_submit_direct,
};

ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
@@ -7954,8 +8004,16 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
		down_read(&BTRFS_I(inode)->dio_sem);
	}

	ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			is_sync_kiocb(iocb));
	/*
	 * We are actually a sync iocb, so we need our fancy endio to know
	 * if we need to sync.
	 */
	if (current->journal_info)
		ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
				   &btrfs_sync_dops, is_sync_kiocb(iocb));
	else
		ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
				   &btrfs_dio_ops, is_sync_kiocb(iocb));

	if (ret == -ENOTBLK)
		ret = 0;
+1 −0
Original line number Diff line number Diff line
@@ -112,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)

#define BTRFS_SEND_TRANS_STUB	((void *)1)
/*
 * Marker value stored in current->journal_info while a DSYNC direct IO write
 * is in flight in the submitter's context; the dio iomap_begin and end_io
 * paths compare journal_info against it to detect that case.
 */
#define BTRFS_DIO_SYNC_STUB	((void *)2)

struct btrfs_trans_handle {
	u64 transid;