Commit f02bf857 authored by Linus Torvalds
Pull btrfs zoned mode fixes from David Sterba:

 - fix deadlock when allocating system chunk

 - fix wrong mutex unlock on an error path

 - fix extent map splitting for append operation

 - update and fix message reporting unusable chunk space

 - don't block when background zone reclaim runs with balance in
   parallel

* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
  btrfs: don't block if we can't acquire the reclaim lock
  btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
  btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
  btrfs: fix deadlock with concurrent chunk allocations involving system chunks
  btrfs: zoned: print unusable percentage when reclaiming block groups
  btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
parents 7fef2edf ea32af47
fs/btrfs/block-group.c +271 −96
@@ -1498,9 +1498,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return;

	mutex_lock(&fs_info->reclaim_bgs_lock);
	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		return;
	}
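
/*
 * A generic, standalone sketch of the trylock-and-skip pattern adopted above
 * (the names my_lock and my_bg_work are hypothetical, for illustration only;
 * this is not btrfs code): a background worker gives up immediately when the
 * mutex is already held by a long-running operation and simply retries on its
 * next scheduled run instead of sleeping for an unbounded time.
 */
#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(my_lock);

static void my_bg_work(struct work_struct *work)
{
	/* Never sleep here: a long-running holder could block us forever. */
	if (!mutex_trylock(&my_lock))
		return;

	/* ... do the periodic, best-effort work protected by my_lock ... */

	mutex_unlock(&my_lock);
}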

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		u64 zone_unusable;
		int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,13 +1543,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
			goto next;
		}

		/*
		 * Cache the zone_unusable value before turning the block group
		 * to read only. As soon as the block group is read only, its
		 * zone_unusable value gets moved to the block group's read-only
		 * bytes and isn't available for calculations anymore.
		 */
		zone_unusable = bg->zone_unusable;
		ret = inc_block_group_ro(bg, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0)
			goto next;

		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
				bg->start, div_u64(bg->used * 100, bg->length));
		btrfs_info(fs_info,
			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
				bg->start, div_u64(bg->used * 100, bg->length),
				div64_u64(zone_unusable * 100, bg->length));
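
/*
 * Background on the division helpers used in the message above (a simplified
 * illustration, not btrfs code; percent_of() is a hypothetical helper): plain
 * '/' on 64-bit operands does not link on 32-bit kernels, so the
 * linux/math64.h helpers are used instead. div_u64() takes a u32 divisor,
 * while div64_u64() takes a u64 divisor; bg->length is u64, hence div64_u64()
 * for the unusable percentage.
 */
#include <linux/math64.h>

static inline u64 percent_of(u64 part, u64 whole)
{
	/*
	 * whole is u64, so div64_u64() is required; div_u64() would silently
	 * truncate the divisor to 32 bits.
	 */
	return whole ? div64_u64(part * 100, whole) : 0;
}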
		trace_btrfs_reclaim_block_group(bg);
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret)
@@ -2197,6 +2215,13 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
	return ret;
}

/*
 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
 * allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_block_group *block_group)
{
@@ -2219,15 +2244,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}

/*
 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
 * chunk allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *block_group;
	int ret = 0;

	if (!trans->can_flush_pending_bgs)
		return;

	while (!list_empty(&trans->new_bgs)) {
		int index;

@@ -2242,6 +2271,13 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		ret = insert_block_group_item(trans, block_group);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		if (!block_group->chunk_item_inserted) {
			mutex_lock(&fs_info->chunk_mutex);
			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
			mutex_unlock(&fs_info->chunk_mutex);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
		ret = btrfs_finish_chunk_alloc(trans, block_group->start,
					block_group->length);
		if (ret)
@@ -2265,8 +2301,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
	btrfs_trans_release_chunk_metadata(trans);
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size)
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 bytes_used, u64 type,
						 u64 chunk_offset, u64 size)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
@@ -2276,7 +2313,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,

	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
	if (!cache)
		return -ENOMEM;
		return ERR_PTR(-ENOMEM);

	cache->length = size;
	set_free_space_tree_thresholds(cache);
@@ -2290,7 +2327,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	ret = btrfs_load_block_group_zone_info(cache, true);
	if (ret) {
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	ret = exclude_super_stripes(cache);
@@ -2298,7 +2335,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
		/* We may have excluded something, so call this just in case */
		btrfs_free_excluded_extents(cache);
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2362,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	/*
@@ -2344,7 +2381,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	btrfs_update_delayed_refs_rsv(trans);

	set_avail_alloc_bits(fs_info, type);
	return 0;
	return cache;
}
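
/*
 * How a caller is expected to consume the new return convention (a hedged
 * sketch; example_caller() is hypothetical, assumes the btrfs headers are
 * available, and the real call sites are not part of this hunk):
 * btrfs_make_block_group() now returns either a valid block group pointer or
 * an errno encoded with ERR_PTR(), so callers check IS_ERR()/PTR_ERR()
 * instead of an int return value.
 */
#include <linux/err.h>

static int example_caller(struct btrfs_trans_handle *trans, u64 type,
			  u64 chunk_offset, u64 size)
{
	struct btrfs_block_group *bg;

	bg = btrfs_make_block_group(trans, 0, type, chunk_offset, size);
	if (IS_ERR(bg))
		return PTR_ERR(bg);	/* e.g. -ENOMEM propagated from above */

	/* ... continue phase 1 of chunk allocation with the new block group ... */
	return 0;
}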

/*
@@ -3222,11 +3259,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}

static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;
	int ret;

	/*
	 * Check if we have enough space in the system space info because we
	 * will need to update device items in the chunk btree and insert a new
	 * chunk item in the chunk btree as well. This will allocate a new
	 * system block group if needed.
	 */
	check_system_chunk(trans, flags);

	bg = btrfs_alloc_chunk(trans, flags);
	if (IS_ERR(bg)) {
		ret = PTR_ERR(bg);
		goto out;
	}

	/*
	 * If this is a system chunk allocation then stop right here and do not
	 * add the chunk item to the chunk btree. This is to prevent a deadlock
	 * because this system chunk allocation can be triggered while COWing
	 * some extent buffer of the chunk btree and while holding a lock on a
	 * parent extent buffer, in which case attempting to insert the chunk
	 * item (or update the device item) would result in a deadlock on that
	 * parent extent buffer. In this case defer the chunk btree updates to
	 * the second phase of chunk allocation and keep our reservation until
	 * the second phase completes.
	 *
	 * This is a rare case and can only be triggered by the very few cases
	 * we have where we need to touch the chunk btree outside chunk allocation
	 * and chunk removal. These cases are basically adding a device, removing
	 * a device or resizing a device.
	 */
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return 0;

	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
	/*
	 * Normally we are not expected to fail with -ENOSPC here, since we have
	 * previously reserved space in the system space_info and allocated one
	 * new system chunk if necessary. However there are two exceptions:
	 *
	 * 1) We may have enough free space in the system space_info but all the
	 *    existing system block groups have a profile which can not be used
	 *    for extent allocation.
	 *
	 *    This happens when mounting in degraded mode. For example we have a
	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
	 *    using the other device in degraded mode. If we then allocate a chunk,
	 *    we may have enough free space in the existing system space_info, but
	 *    none of the block groups can be used for extent allocation since they
	 *    have a RAID1 profile, and because we are in degraded mode with a
	 *    single device, we are forced to allocate a new system chunk with a
	 *    SINGLE profile. Making check_system_chunk() iterate over all system
	 *    block groups and check if they have a usable profile and enough space
	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
	 *    try again after forcing allocation of a new system chunk. Like this
	 *    we avoid paying the cost of that search in normal circumstances, when
	 *    we were not mounted in degraded mode;
	 *
	 * 2) We had enough free space in the system space_info, and one suitable
	 *    block group to allocate from when we called check_system_chunk()
	 *    above. However right after we called it, the only system block group
	 *    with enough free space got turned into RO mode by a running scrub,
	 *    and in this case we have to allocate a new one and retry. We only
	 *    need do this allocate and retry once, since we have a transaction
	 *    handle and scrub uses the commit root to search for block groups.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempt to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case. However if
 *    the filesystem was mounted in degraded mode, none of the existing block
 *    groups might be suitable for extent allocation due to their incompatible
 *    profile (e.g. mounting a 2-device filesystem, where all block groups
 *    use a RAID1 profile, in degraded mode using a single device). In this case
 *    when the task attempts to COW some extent buffer of the extent btree for
 *    example, it will trigger allocation of a new metadata block group with a
 *    suitable profile (SINGLE profile in the example of the degraded mount of
 *    the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group,
 *    and is therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all available extents in the
 *    meanwhile - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers.
 *
 * We also need this 2-phase setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * For allocation of system chunks, we defer the updates and insertions into the
 * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
 * if the chunk allocation is triggered while COWing an extent buffer of the
 * chunk btree, we are holding a lock on the parent of that extent buffer and
 * doing the chunk btree updates and insertions can require locking that parent.
 * This is for the very few and rare cases where we update the chunk btree that
 * are not chunk allocation or chunk removal: adding a device, removing a device
 * or resizing a device.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If force is CHUNK_ALLOC_FORCE:
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If force is NOT CHUNK_ALLOC_FORCE:
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
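 *
 * A condensed view of the two phases, as implemented by the code in this
 * change (a reading aid; all function names below appear in the hunks above):
 *
 *   Phase 1 - btrfs_chunk_alloc() -> do_chunk_alloc(), under chunk_mutex:
 *     check_system_chunk()                 reserve system space, possibly
 *                                          allocating a new SYSTEM chunk
 *     btrfs_alloc_chunk()                  allocate device extents, the chunk
 *                                          mapping and the block group
 *     btrfs_chunk_alloc_add_chunk_item()   update the chunk btree; deferred
 *                                          for SYSTEM chunks to avoid the
 *                                          deadlock described above
 *
 *   Phase 2 - btrfs_create_pending_block_groups(), typically run later in the
 *   transaction (for example when it commits):
 *     insert_block_group_item()            block group item in the extent
 *                                          btree
 *     btrfs_chunk_alloc_add_chunk_item()   only if chunk_item_inserted is not
 *                                          yet set (deferred SYSTEM chunks)
 *     btrfs_finish_chunk_alloc()           device extent items in the device
 *                                          btree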
@@ -3243,6 +3472,13 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * If we are removing a chunk, don't re-enter or we would deadlock.
	 * System space reservation and system chunk allocation is done by the
	 * chunk remove operation (btrfs_remove_chunk()).
	 */
	if (trans->removing_chunk)
		return -ENOSPC;

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
@@ -3306,13 +3542,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			force_metadata_allocation(fs_info);
	}

	/*
	 * Check if we have enough space in SYSTEM chunk because we may need
	 * to update devices.
	 */
	check_system_chunk(trans, flags);

	ret = btrfs_alloc_chunk(trans, flags);
	ret = do_chunk_alloc(trans, flags);
	trans->allocating_chunk = false;

	spin_lock(&space_info->lock);
@@ -3331,22 +3561,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);
	/*
	 * When we allocate a new chunk we reserve space in the chunk block
	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
	 * add new nodes/leafs to it if we end up needing to do it when
	 * inserting the chunk item and updating device items as part of the
	 * second phase of chunk allocation, performed by
	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
	 * large number of new block groups to create in our transaction
	 * handle's new_bgs list to avoid exhausting the chunk block reserve
	 * in extreme cases - like having a single transaction create many new
	 * block groups when starting to write out the free space caches of all
	 * the block groups that were made dirty during the lifetime of the
	 * transaction.
	 */
	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
		btrfs_create_pending_block_groups(trans);

	return ret;
}
@@ -3367,7 +3581,6 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *info;
	u64 left;
@@ -3382,7 +3595,6 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
	lockdep_assert_held(&fs_info->chunk_mutex);

	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
again:
	spin_lock(&info->lock);
	left = info->total_bytes - btrfs_space_info_used(info, true);
	spin_unlock(&info->lock);
@@ -3401,78 +3613,41 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)

	if (left < thresh) {
		u64 flags = btrfs_system_alloc_profile(fs_info);
		u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);

		/*
		 * If there's not available space for the chunk tree (system
		 * space) and there are other tasks that reserved space for
		 * creating a new system block group, wait for them to complete
		 * the creation of their system block group and release excess
		 * reserved space. We do this because:
		 *
		 * *) We can end up allocating more system chunks than necessary
		 *    when there are multiple tasks that are concurrently
		 *    allocating block groups, which can lead to exhaustion of
		 *    the system array in the superblock;
		 *
		 * *) If we allocate extra and unnecessary system block groups,
		 *    despite being empty for a long time, and possibly forever,
		 *    they end up not being added to the list of unused block groups
		 *    because that typically happens only when deallocating the
		 *    last extent from a block group - which never happens since
		 *    we never allocate from them in the first place. The few
		 *    exceptions are when mounting a filesystem or running scrub,
		 *    which add unused block groups to the list of unused block
		 *    groups, to be deleted by the cleaner kthread.
		 *    And even when they are added to the list of unused block
		 *    groups, it can take a long time until they get deleted,
		 *    since the cleaner kthread might be sleeping or busy with
		 *    other work (deleting subvolumes, running delayed iputs,
		 *    defrag scheduling, etc);
		 *
		 * This is rare in practice, but can happen when too many tasks
		 * are allocating block groups in parallel (via fallocate())
		 * and before the one that reserved space for a new system block
		 * group finishes the block group creation and releases the space
		 * reserved in excess (at btrfs_create_pending_block_groups()),
		 * other tasks end up here and see free system space temporarily
		 * not enough for updating the chunk tree.
		 *
		 * We unlock the chunk mutex before waiting for such tasks and
		 * lock it again after the wait, otherwise we would deadlock.
		 * It is safe to do so because allocating a system chunk is the
		 * first thing done while allocating a new block group.
		 */
		if (reserved > trans->chunk_bytes_reserved) {
			const u64 min_needed = reserved - thresh;

			mutex_unlock(&fs_info->chunk_mutex);
			wait_event(cur_trans->chunk_reserve_wait,
			   atomic64_read(&cur_trans->chunk_bytes_reserved) <=
			   min_needed);
			mutex_lock(&fs_info->chunk_mutex);
			goto again;
		}
		struct btrfs_block_group *bg;

		/*
		 * Ignore failure to create system chunk. We might end up not
		 * needing it, as we might not need to COW all nodes/leafs from
		 * the paths we visit in the chunk tree (they were already COWed
		 * or created in the current transaction for example).
		 *
		 * Also, if our caller is allocating a system chunk, do not
		 * attempt to insert the chunk item in the chunk btree, as we
		 * could deadlock on an extent buffer since our caller may be
		 * COWing an extent buffer from the chunk btree.
		 */
		bg = btrfs_alloc_chunk(trans, flags);
		if (IS_ERR(bg)) {
			ret = PTR_ERR(bg);
		} else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
			/*
			 * If we fail to add the chunk item here, we end up
			 * trying again at phase 2 of chunk allocation, at
			 * btrfs_create_pending_block_groups(). So ignore
			 * any error here.
			 */
		ret = btrfs_alloc_chunk(trans, flags);
			btrfs_chunk_alloc_add_chunk_item(trans, bg);
		}
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(fs_info->chunk_root,
					  &fs_info->chunk_block_rsv,
					  thresh, BTRFS_RESERVE_NO_FLUSH);
		if (!ret) {
			atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
		if (!ret)
			trans->chunk_bytes_reserved += thresh;
	}
}
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
fs/btrfs/block-group.h +4 −2
@@ -97,6 +97,7 @@ struct btrfs_block_group {
	unsigned int removed:1;
	unsigned int to_copy:1;
	unsigned int relocating_repair:1;
	unsigned int chunk_item_inserted:1;

	int disk_cache_state;

@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 bytes_used, u64 type,
						 u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			     bool do_chunk_alloc);
fs/btrfs/ctree.c +13 −54
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size,
					  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size, nest);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

@@ -2458,8 +2416,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0,
	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0, BTRFS_NESTING_SPLIT);
	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

@@ -3381,9 +3340,9 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0, num_doubles ?
					     BTRFS_NESTING_NEW_ROOT :
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);
+118 −29 (file changed; preview size limit exceeded, changes collapsed)

+5 −10 (file changed; preview size limit exceeded, changes collapsed)
