Commit 6070dcc8 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fix from David Sterba:
 "Fix for a deadlock when direct/buffered IO is done on a mmaped file
  and a fault happens (details in the patch). There's a fstest
  generic/647 that triggers the problem and makes testing hard"

* tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix deadlock due to page faults during direct IO reads and writes
parents 38764c73 51bd9563
Loading
Loading
Loading
Loading
+123 −16
Original line number Diff line number Diff line
@@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,

static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t err;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio = NULL;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1964,23 +1965,80 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
		goto buffered;
	}

	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			     0, 0);

	btrfs_inode_unlock(inode, ilock_flags);
	/*
	 * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
	 * calls generic_write_sync() (through iomap_dio_complete()), because
	 * that results in calling fsync (btrfs_sync_file()) which will try to
	 * lock the inode in exclusive/write mode.
	 */
	if (is_sync_write)
		iocb->ki_flags &= ~IOCB_DSYNC;

	if (IS_ERR_OR_NULL(dio)) {
		err = PTR_ERR_OR_ZERO(dio);
		if (err < 0 && err != -ENOTBLK)
			goto out;
	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			   IOMAP_DIO_PARTIAL, written);
	from->nofault = false;

	/* No increment (+=) because iomap returns a cumulative value. */
	if (err > 0)
		written = err;

	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			err = -ENOTBLK;
		} else {
		written = iomap_dio_complete(dio);
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(inode, ilock_flags);

	/*
	 * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
	 * the fsync (call generic_write_sync()).
	 */
	if (is_sync_write)
		iocb->ki_flags |= IOCB_DSYNC;

	if (written < 0 || !iov_iter_count(from)) {
		err = written;
	/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
		goto out;
	}

buffered:
	pos = iocb->ki_pos;
@@ -2005,7 +2063,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return written ? written : err;
	return err < 0 ? err : written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
		return 0;

	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during he page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			   0, 0);
			   IOMAP_DIO_PARTIAL, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
	return ret < 0 ? ret : read;
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)