Commit 3ee65c0f authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:
 "A few more fixes for various problems that have user visible effects
  or seem to be urgent:

   - fix corruption when combining DIO and non-blocking io_uring over
     multiple extents (seen on MariaDB)

   - fix relocation crash due to premature return from commit

   - fix quota deadlock between rescan and qgroup removal

   - fix item data bounds checks in tree-checker (found on a fuzzed
     image)

   - fix fsync of prealloc extents after EOF

   - add missing run of delayed items after unlink during log replay

   - don't start relocation until snapshot drop is finished

   - fix reversed condition for subpage writers locking

   - fix warning on page error"

* tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fallback to blocking mode when doing async dio over multiple extents
  btrfs: add missing run of delayed items after unlink during log replay
  btrfs: qgroup: fix deadlock between rescan worker and remove qgroup
  btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
  btrfs: do not start relocation until in progress drops are done
  btrfs: tree-checker: use u64 for item data end to avoid overflow
  btrfs: do not WARN_ON() if we have PageError set
  btrfs: fix lost prealloc extents beyond eof after full fsync
  btrfs: subpage: fix a wrong check on subpage->writers
parents f81664f7 ca93e44b
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -602,6 +602,9 @@ enum {
	/* Indicate that we want the transaction kthread to commit right now. */
	BTRFS_FS_COMMIT_TRANS,

	/* Indicate we have half completed snapshot deletions pending. */
	BTRFS_FS_UNFINISHED_DROPS,

#if BITS_PER_LONG == 32
	/* Indicate if we have error/warn message printed on 32bit systems */
	BTRFS_FS_32BIT_ERROR,
@@ -1106,8 +1109,15 @@ enum {
	BTRFS_ROOT_QGROUP_FLUSHING,
	/* We started the orphan cleanup for this root. */
	BTRFS_ROOT_ORPHAN_CLEANUP,
	/* This root has a drop operation that was started previously. */
	BTRFS_ROOT_UNFINISHED_DROP,
};

static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}

/*
 * Record swapped tree blocks of a subvolume tree for delayed subtree trace
 * code. For detail check comment in fs/btrfs/qgroup.c.
+10 −0
Original line number Diff line number Diff line
@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device

	set_bit(BTRFS_FS_OPEN, &fs_info->flags);

	/* Kick the cleaner thread so it'll start deleting snapshots. */
	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);

clear_oneshot:
	btrfs_clear_oneshot_options(fs_info);
	return 0;
@@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
	 */
	kthread_park(fs_info->cleaner_kthread);

	/*
	 * If we had UNFINISHED_DROPS we could still be processing them, so
	 * clear that bit and wake up relocation so it can stop.
	 */
	btrfs_wake_unfinished_drop(fs_info);

	/* wait for the qgroup rescan worker to stop */
	btrfs_qgroup_wait_for_completion(fs_info, false);

+10 −0
Original line number Diff line number Diff line
@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
	int ret;
	int level;
	bool root_dropped = false;
	bool unfinished_drop = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
@@ -5838,6 +5841,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * We were an unfinished drop root, check to see if there are any
	 * pending, and if not clear and wake up any waiters.
	 */
	if (!err && unfinished_drop)
		btrfs_maybe_wake_unfinished_drop(fs_info);

	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
+13 −3
Original line number Diff line number Diff line
@@ -6841,14 +6841,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	/*
	 * If we are using the commit root we could potentially clear a page
	 * Uptodate while we're using the extent buffer that we've previously
	 * looked up.  We don't want to complain in this case, as the page was
	 * valid before, we just didn't write it out.  Instead we want to catch
	 * the case where we didn't actually read the block properly, which
	 * would have !PageUptodate && !PageError, as we clear PageError before
	 * reading.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		bool uptodate;
		bool uptodate, error;

		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
						       eb->start, eb->len);
		WARN_ON(!uptodate);
		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
		WARN_ON(!uptodate && !error);
	} else {
		WARN_ON(!PageUptodate(page));
		WARN_ON(!PageUptodate(page) && !PageError(page));
	}
}

+28 −0
Original line number Diff line number Diff line
@@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data then what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, len);
Loading