Merge tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux (6070dcc8) · Commits · EulixOS / Software / Kernel

fs/btrfs/file.c

+123 −16

Original line number	Diff line number	Diff line
		@@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,

		static ssize_t btrfs_direct_write(struct kiocb iocb, struct iov_iter from)
		{
		const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
		struct file *file = iocb->ki_filp;
		struct inode *inode = file_inode(file);
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		loff_t pos;
		ssize_t written = 0;
		ssize_t written_buffered;
		size_t prev_left = 0;
		loff_t endbyte;
		ssize_t err;
		unsigned int ilock_flags = 0;
		struct iomap_dio *dio = NULL;

		if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags \|= BTRFS_ILOCK_TRY;
		@@ -1964,23 +1965,80 @@ static ssize_t btrfs_direct_write(struct kiocb iocb, struct iov_iter from)
		goto buffered;
		}

		dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
		0, 0);

		btrfs_inode_unlock(inode, ilock_flags);
		/*
		* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
		* calls generic_write_sync() (through iomap_dio_complete()), because
		* that results in calling fsync (btrfs_sync_file()) which will try to
		* lock the inode in exclusive/write mode.
		*/
		if (is_sync_write)
		iocb->ki_flags &= ~IOCB_DSYNC;

		if (IS_ERR_OR_NULL(dio)) {
		err = PTR_ERR_OR_ZERO(dio);
		if (err < 0 && err != -ENOTBLK)
		goto out;
		/*
		* The iov_iter can be mapped to the same file range we are writing to.
		* If that's the case, then we will deadlock in the iomap code, because
		* it first calls our callback btrfs_dio_iomap_begin(), which will create
		* an ordered extent, and after that it will fault in the pages that the
		* iov_iter refers to. During the fault in we end up in the readahead
		* pages code (starting at btrfs_readahead()), which will lock the range,
		* find that ordered extent and then wait for it to complete (at
		* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
		* obviously the ordered extent can never complete as we have not yet
		* submitted the respective bio(s). This always happens when the buffer is
		* memory mapped to the same file range, since the iomap DIO code always
		* invalidates pages in the target file range (after starting and waiting
		* for any writeback).
		*
		* So here we disable page faults in the iov_iter and then retry if we
		* got -EFAULT, faulting in the pages before the retry.
		*/
		again:
		from->nofault = true;
		err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
		IOMAP_DIO_PARTIAL, written);
		from->nofault = false;

		/* No increment (+=) because iomap returns a cumulative value. */
		if (err > 0)
		written = err;

		if (iov_iter_count(from) > 0 && (err == -EFAULT \|\| err > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		* We have more data left to write. Try to fault in as many as
		* possible of the remainder pages and retry. We do this without
		* releasing and locking again the inode, to prevent races with
		* truncate.
		*
		* Also, in case the iov refers to pages in the file range of the
		* file we want to write to (due to a mmap), we could enter an
		* infinite loop if we retry after faulting the pages in, since
		* iomap will invalidate any pages in the range early on, before
		* it tries to fault in the pages of the iov. So we keep track of
		* how much was left of iov in the previous EFAULT and fallback
		* to buffered IO in case we haven't made any progress.
		*/
		if (left == prev_left) {
		err = -ENOTBLK;
		} else {
		written = iomap_dio_complete(dio);
		fault_in_iov_iter_readable(from, left);
		prev_left = left;
		goto again;
		}
		}

		btrfs_inode_unlock(inode, ilock_flags);

		/*
		* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
		* the fsync (call generic_write_sync()).
		*/
		if (is_sync_write)
		iocb->ki_flags \|= IOCB_DSYNC;

		if (written < 0 \|\| !iov_iter_count(from)) {
		err = written;
		/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
		if ((err < 0 && err != -ENOTBLK) \|\| !iov_iter_count(from))
		goto out;
		}

		buffered:
		pos = iocb->ki_pos;
		@@ -2005,7 +2063,7 @@ static ssize_t btrfs_direct_write(struct kiocb iocb, struct iov_iter from)
		invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
		endbyte >> PAGE_SHIFT);
		out:
		return written ? written : err;
		return err < 0 ? err : written;
		}

		static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
		@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
		static ssize_t btrfs_direct_read(struct kiocb iocb, struct iov_iter to)
		{
		struct inode *inode = file_inode(iocb->ki_filp);
		size_t prev_left = 0;
		ssize_t read = 0;
		ssize_t ret;

		if (fsverity_active(inode))
		@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb iocb, struct iov_iter to)
		return 0;

		btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
		again:
		/*
		* This is similar to what we do for direct IO writes, see the comment
		* at btrfs_direct_write(), but we also disable page faults in addition
		* to disabling them only at the iov_iter level. This is because when
		* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
		* which can still trigger page faults despite having set ->nofault
		* to true on our 'to' iov_iter.
		*
		* The difference to direct IO writes is that we deadlock when trying
		* to lock the extent range in the inode's tree during the page reads
		* triggered by the fault in (while for writes it is due to waiting for
		* our own ordered extent). This is because for direct IO reads,
		* btrfs_dio_iomap_begin() returns with the extent range locked, which
		* is only unlocked in the endio callback (end_bio_extent_readpage()).
		*/
		pagefault_disable();
		to->nofault = true;
		ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
		0, 0);
		IOMAP_DIO_PARTIAL, read);
		to->nofault = false;
		pagefault_enable();

		/* No increment (+=) because iomap returns a cumulative value. */
		if (ret > 0)
		read = ret;

		if (iov_iter_count(to) > 0 && (ret == -EFAULT \|\| ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
		/*
		* We didn't make any progress since the last attempt,
		* fallback to a buffered read for the remainder of the
		* range. This is just to avoid any possibility of looping
		* for too long.
		*/
		ret = read;
		} else {
		/*
		* We made some progress since the last retry or this is
		* the first time we are retrying. Fault in as many pages
		* as possible and retry.
		*/
		fault_in_iov_iter_writeable(to, left);
		prev_left = left;
		goto again;
		}
		}
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		return ret;
		return ret < 0 ? ret : read;
		}

		static ssize_t btrfs_file_read_iter(struct kiocb iocb, struct iov_iter to)