Commit 6ab608fe authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:

 - scan block devices in non-exclusive mode to avoid temporary mkfs
   failures

 - fix race between quota disable and quota assign ioctls

 - fix deadlock when aborting transaction during relocation with scrub

 - ignore fiemap path cache when there are multiple paths for a node

* tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: ignore fiemap path cache when there are multiple paths for a node
  btrfs: fix deadlock when aborting transaction during relocation with scrub
  btrfs: scan device in non-exclusive mode
  btrfs: fix race between quota disable and quota assign ioctls
parents f95b8ea7 2280d425
Loading
Loading
Loading
Loading
+63 −22
Original line number Diff line number Diff line
@@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
	level = -1;
	ULIST_ITER_INIT(&uiter);
	while (1) {
		bool is_shared;
		bool cached;
		const unsigned long prev_ref_count = ctx->refs.nnodes;

		walk_ctx.bytenr = bytenr;
		ret = find_parent_nodes(&walk_ctx, &shared);
@@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
		ret = 0;

		/*
		 * If our data extent was not directly shared (without multiple
		 * reference items), than it might have a single reference item
		 * with a count > 1 for the same offset, which means there are 2
		 * (or more) file extent items that point to the data extent -
		 * this happens when a file extent item needs to be split and
		 * then one item gets moved to another leaf due to a b+tree leaf
		 * split when inserting some item. In this case the file extent
		 * items may be located in different leaves and therefore some
		 * of the leaves may be referenced through shared subtrees while
		 * others are not. Since our extent buffer cache only works for
		 * a single path (by far the most common case and simpler to
		 * deal with), we can not use it if we have multiple leaves
		 * (which implies multiple paths).
		 */
		if (level == -1 && ctx->refs.nnodes > 1)
		 * More than one extent buffer (bytenr) may have been added to
		 * the ctx->refs ulist, in which case we have to check multiple
		 * tree paths in case the first one is not shared, so we can not
		 * use the path cache which is made for a single path. Multiple
		 * extent buffers at the current level happen when:
		 *
		 * 1) level -1, the data extent: If our data extent was not
		 *    directly shared (without multiple reference items), then
		 *    it might have a single reference item with a count > 1 for
		 *    the same offset, which means there are 2 (or more) file
		 *    extent items that point to the data extent - this happens
		 *    when a file extent item needs to be split and then one
		 *    item gets moved to another leaf due to a b+tree leaf split
		 *    when inserting some item. In this case the file extent
		 *    items may be located in different leaves and therefore
		 *    some of the leaves may be referenced through shared
		 *    subtrees while others are not. Since our extent buffer
		 *    cache only works for a single path (by far the most common
		 *    case and simpler to deal with), we can not use it if we
		 *    have multiple leaves (which implies multiple paths).
		 *
		 * 2) level >= 0, a tree node/leaf: We can have a mix of direct
		 *    and indirect references on a b+tree node/leaf, so we have
		 *    to check multiple paths, and the extent buffer (the
		 *    current bytenr) may be shared or not. One example is
		 *    during relocation as we may get a shared tree block ref
		 *    (direct ref) and a non-shared tree block ref (indirect
		 *    ref) for the same node/leaf.
		 */
		if ((ctx->refs.nnodes - prev_ref_count) > 1)
			ctx->use_path_cache = false;

		if (level >= 0)
@@ -1964,18 +1978,45 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
		if (!node)
			break;
		bytenr = node->val;
		if (ctx->use_path_cache) {
			bool is_shared;
			bool cached;

			level++;
		cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
						     &is_shared);
			cached = lookup_backref_shared_cache(ctx, root, bytenr,
							     level, &is_shared);
			if (cached) {
				ret = (is_shared ? 1 : 0);
				break;
			}
		}
		shared.share_count = 0;
		shared.have_delayed_delete_refs = false;
		cond_resched();
	}

	/*
	 * If the path cache is disabled, then it means at some tree level we
	 * got multiple parents due to a mix of direct and indirect backrefs or
	 * multiple leaves with file extent items pointing to the same data
	 * extent. We have to invalidate the cache and cache only the sharedness
	 * result for the levels where we got only one node/reference.
	 */
	if (!ctx->use_path_cache) {
		int i = 0;

		level--;
		if (ret >= 0 && level >= 0) {
			bytenr = ctx->path_cache_entries[level].bytenr;
			ctx->use_path_cache = true;
			store_backref_shared_cache(ctx, root, bytenr, level, ret);
			i = level + 1;
		}

		for ( ; i < BTRFS_MAX_LEVEL; i++)
			ctx->path_cache_entries[i].bytenr = 0;
	}

	/*
	 * Cache the sharedness result for the data extent if we know our inode
	 * has more than 1 file extent item that refers to the data extent.
+2 −0
Original line number Diff line number Diff line
@@ -3732,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
	}

	/* update qgroup status and info */
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	err = btrfs_run_qgroups(trans);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (err < 0)
		btrfs_handle_fs_error(fs_info, err,
				      "failed to update qgroup status and info");
+10 −1
Original line number Diff line number Diff line
@@ -2828,13 +2828,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
}

/*
 * called from commit_transaction. Writes all changed qgroups to disk.
 * Writes all changed qgroups to disk.
 * Called by the transaction commit path and the qgroup assign ioctl.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	/*
	 * In case we are called from the qgroup assign ioctl, assert that we
	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
	 * disable operation (ioctl) and access a freed quota root.
	 */
	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);

	if (!fs_info->quota_root)
		return ret;

+14 −1
Original line number Diff line number Diff line
@@ -2035,6 +2035,19 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)

	if (current->journal_info == trans)
		current->journal_info = NULL;

	/*
	 * If relocation is running, we can't cancel scrub because that will
	 * result in a deadlock. Before relocating a block group, relocation
	 * pauses scrub, then starts and commits a transaction before unpausing
	 * scrub. If the transaction commit is being done by the relocation
	 * task or triggered by another task and the relocation task is waiting
	 * for the commit, and we end up here due to an error in the commit
	 * path, then calling btrfs_scrub_cancel() will deadlock, as we are
	 * asking for scrub to stop while having it asked to be paused higher
	 * above in relocation code.
	 */
	if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
		btrfs_scrub_cancel(fs_info);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+18 −2
Original line number Diff line number Diff line
@@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	/*
	 * Avoid using flags |= FMODE_EXCL here, as systemd-udev may
	 * initiate the device scan, which may race with the user's mount
	 * or mkfs command, resulting in failure.
	 * Since the device scan is solely for reading purposes, there is
	 * no need for FMODE_EXCL. Additionally, the devices are read again
	 * during the mount process. It is ok to get some inconsistent
	 * values temporarily, as the device paths of the fsid are the only
	 * required information for assembling the volume.
	 */
	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);
@@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
	if (ret) {
		/*
		 * If we had a transaction abort, stop all running scrubs.
		 * See transaction.c:cleanup_transaction() for why we do it here.
		 */
		if (BTRFS_FS_ERROR(fs_info))
			btrfs_scrub_cancel(fs_info);
		return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)