Commit f02bf857 authored by Linus Torvalds
Pull btrfs zoned mode fixes from David Sterba:

 - fix deadlock when allocating system chunk

 - fix wrong mutex unlock on an error path

 - fix extent map splitting for append operation

 - update and fix message reporting unusable chunk space

 - don't block when background zone reclaim runs with balance in
   parallel

* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
  btrfs: don't block if we can't acquire the reclaim lock
  btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
  btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
  btrfs: fix deadlock with concurrent chunk allocations involving system chunks
  btrfs: zoned: print unusable percentage when reclaiming block groups
  btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
parents 7fef2edf ea32af47
fs/btrfs/block-group.c +271 −96
@@ -1498,9 +1498,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return;

	mutex_lock(&fs_info->reclaim_bgs_lock);
	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		return;
	}
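
/*
 * A generic, standalone sketch of the trylock-and-skip pattern adopted above
 * (the names my_lock and my_bg_work are hypothetical, for illustration only;
 * this is not btrfs code): a background worker gives up immediately when the
 * mutex is already held by a long-running operation and simply retries on its
 * next scheduled run instead of sleeping for an unbounded time.
 */
#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(my_lock);

static void my_bg_work(struct work_struct *work)
{
	/* Never sleep here: a long-running holder could block us forever. */
	if (!mutex_trylock(&my_lock))
		return;

	/* ... do the periodic, best-effort work protected by my_lock ... */

	mutex_unlock(&my_lock);
}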

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		u64 zone_unusable;
		int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,13 +1543,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
			goto next;
		}

		/*
		 * Cache the zone_unusable value before turning the block group
		 * to read only. As soon as the block group is read only, its
		 * zone_unusable value gets moved to the block group's read-only
		 * bytes and isn't available for calculations anymore.
		 */
		zone_unusable = bg->zone_unusable;
		ret = inc_block_group_ro(bg, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0)
			goto next;

		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
				bg->start, div_u64(bg->used * 100, bg->length));
		btrfs_info(fs_info,
			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
				bg->start, div_u64(bg->used * 100, bg->length),
				div64_u64(zone_unusable * 100, bg->length));
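
/*
 * Background on the division helpers used in the message above (a simplified
 * illustration, not btrfs code; percent_of() is a hypothetical helper): plain
 * '/' on 64-bit operands does not link on 32-bit kernels, so the
 * linux/math64.h helpers are used instead. div_u64() takes a u32 divisor,
 * while div64_u64() takes a u64 divisor; bg->length is u64, hence div64_u64()
 * for the unusable percentage.
 */
#include <linux/math64.h>

static inline u64 percent_of(u64 part, u64 whole)
{
	/*
	 * whole is u64, so div64_u64() is required; div_u64() would silently
	 * truncate the divisor to 32 bits.
	 */
	return whole ? div64_u64(part * 100, whole) : 0;
}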
		trace_btrfs_reclaim_block_group(bg);
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret)
@@ -2197,6 +2215,13 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
	return ret;
}

/*
 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
 * allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_block_group *block_group)
{
@@ -2219,15 +2244,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}

/*
 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
 * chunk allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *block_group;
	int ret = 0;

	if (!trans->can_flush_pending_bgs)
		return;

	while (!list_empty(&trans->new_bgs)) {
		int index;

@@ -2242,6 +2271,13 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		ret = insert_block_group_item(trans, block_group);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		if (!block_group->chunk_item_inserted) {
			mutex_lock(&fs_info->chunk_mutex);
			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
			mutex_unlock(&fs_info->chunk_mutex);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
		ret = btrfs_finish_chunk_alloc(trans, block_group->start,
					block_group->length);
		if (ret)
@@ -2265,8 +2301,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
	btrfs_trans_release_chunk_metadata(trans);
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size)
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 bytes_used, u64 type,
						 u64 chunk_offset, u64 size)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
@@ -2276,7 +2313,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,

	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
	if (!cache)
		return -ENOMEM;
		return ERR_PTR(-ENOMEM);

	cache->length = size;
	set_free_space_tree_thresholds(cache);
@@ -2290,7 +2327,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	ret = btrfs_load_block_group_zone_info(cache, true);
	if (ret) {
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	ret = exclude_super_stripes(cache);
@@ -2298,7 +2335,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
		/* We may have excluded something, so call this just in case */
		btrfs_free_excluded_extents(cache);
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2362,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
		return ERR_PTR(ret);
	}

	/*
@@ -2344,7 +2381,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
	btrfs_update_delayed_refs_rsv(trans);

	set_avail_alloc_bits(fs_info, type);
	return 0;
	return cache;
}
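
/*
 * How a caller is expected to consume the new return convention (a hedged
 * sketch; example_caller() is hypothetical, assumes the btrfs headers are
 * available, and the real call sites are not part of this hunk):
 * btrfs_make_block_group() now returns either a valid block group pointer or
 * an errno encoded with ERR_PTR(), so callers check IS_ERR()/PTR_ERR()
 * instead of an int return value.
 */
#include <linux/err.h>

static int example_caller(struct btrfs_trans_handle *trans, u64 type,
			  u64 chunk_offset, u64 size)
{
	struct btrfs_block_group *bg;

	bg = btrfs_make_block_group(trans, 0, type, chunk_offset, size);
	if (IS_ERR(bg))
		return PTR_ERR(bg);	/* e.g. -ENOMEM propagated from above */

	/* ... continue phase 1 of chunk allocation with the new block group ... */
	return 0;
}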

/*
@@ -3222,11 +3259,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}

static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;
	int ret;

	/*
	 * Check if we have enough space in the system space info because we
	 * will need to update device items in the chunk btree and insert a new
	 * chunk item in the chunk btree as well. This will allocate a new
	 * system block group if needed.
	 */
	check_system_chunk(trans, flags);

	bg = btrfs_alloc_chunk(trans, flags);
	if (IS_ERR(bg)) {
		ret = PTR_ERR(bg);
		goto out;
	}

	/*
	 * If this is a system chunk allocation then stop right here and do not
	 * add the chunk item to the chunk btree. This is to prevent a deadlock
	 * because this system chunk allocation can be triggered while COWing
	 * some extent buffer of the chunk btree and while holding a lock on a
	 * parent extent buffer, in which case attempting to insert the chunk
	 * item (or update the device item) would result in a deadlock on that
	 * parent extent buffer. In this case defer the chunk btree updates to
	 * the second phase of chunk allocation and keep our reservation until
	 * the second phase completes.
	 *
	 * This is a rare case and can only be triggered by the very few cases
	 * we have where we need to touch the chunk btree outside chunk allocation
	 * and chunk removal. These cases are basically adding a device, removing
	 * a device or resizing a device.
	 */
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return 0;

	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
	/*
	 * Normally we are not expected to fail with -ENOSPC here, since we have
	 * previously reserved space in the system space_info and allocated one
	 * new system chunk if necessary. However there are two exceptions:
	 *
	 * 1) We may have enough free space in the system space_info but all the
	 *    existing system block groups have a profile which can not be used
	 *    for extent allocation.
	 *
	 *    This happens when mounting in degraded mode. For example we have a
	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
	 *    using the other device in degraded mode. If we then allocate a chunk,
	 *    we may have enough free space in the existing system space_info, but
	 *    none of the block groups can be used for extent allocation since they
	 *    have a RAID1 profile, and because we are in degraded mode with a
	 *    single device, we are forced to allocate a new system chunk with a
	 *    SINGLE profile. Making check_system_chunk() iterate over all system
	 *    block groups and check if they have a usable profile and enough space
	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
	 *    try again after forcing allocation of a new system chunk. Like this
	 *    we avoid paying the cost of that search in normal circumstances, when
	 *    we were not mounted in degraded mode;
	 *
	 * 2) We had enough free space in the system space_info, and one suitable
	 *    block group to allocate from when we called check_system_chunk()
	 *    above. However right after we called it, the only system block group
	 *    with enough free space got turned into RO mode by a running scrub,
	 *    and in this case we have to allocate a new one and retry. We only
	 *    need do this allocate and retry once, since we have a transaction
	 *    handle and scrub uses the commit root to search for block groups.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempt to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case. However if
 *    the filesystem was mounted in degraded mode, none of the existing block
 *    groups might be suitable for extent allocation due to their incompatible
 *    profile (e.g. mounting a 2-device filesystem, where all block groups
 *    use a RAID1 profile, in degraded mode using a single device). In this case
 *    when the task attempts to COW some extent buffer of the extent btree for
 *    example, it will trigger allocation of a new metadata block group with a
 *    suitable profile (SINGLE profile in the example of the degraded mount of
 *    the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group,
 *    and is therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all available extents in the
 *    meanwhile - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers.
 *
 * We also need this 2-phase setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * For allocation of system chunks, we defer the updates and insertions into the
 * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
 * if the chunk allocation is triggered while COWing an extent buffer of the
 * chunk btree, we are holding a lock on the parent of that extent buffer and
 * doing the chunk btree updates and insertions can require locking that parent.
 * This is for the very few and rare cases where we update the chunk btree that
 * are not chunk allocation or chunk removal: adding a device, removing a device
 * or resizing a device.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If force is CHUNK_ALLOC_FORCE:
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If force is NOT CHUNK_ALLOC_FORCE:
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
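 *
 * A condensed view of the two phases, as implemented by the code in this
 * change (a reading aid; all function names below appear in the hunks above):
 *
 *   Phase 1 - btrfs_chunk_alloc() -> do_chunk_alloc(), under chunk_mutex:
 *     check_system_chunk()                 reserve system space, possibly
 *                                          allocating a new SYSTEM chunk
 *     btrfs_alloc_chunk()                  allocate device extents, the chunk
 *                                          mapping and the block group
 *     btrfs_chunk_alloc_add_chunk_item()   update the chunk btree; deferred
 *                                          for SYSTEM chunks to avoid the
 *                                          deadlock described above
 *
 *   Phase 2 - btrfs_create_pending_block_groups(), typically run later in the
 *   transaction (for example when it commits):
 *     insert_block_group_item()            block group item in the extent
 *                                          btree
 *     btrfs_chunk_alloc_add_chunk_item()   only if chunk_item_inserted is not
 *                                          yet set (deferred SYSTEM chunks)
 *     btrfs_finish_chunk_alloc()           device extent items in the device
 *                                          btree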
@@ -3243,6 +3472,13 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * If we are removing a chunk, don't re-enter or we would deadlock.
	 * System space reservation and system chunk allocation is done by the
	 * chunk remove operation (btrfs_remove_chunk()).
	 */
	if (trans->removing_chunk)
		return -ENOSPC;

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
@@ -3306,13 +3542,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			force_metadata_allocation(fs_info);
	}

	/*
	 * Check if we have enough space in SYSTEM chunk because we may need
	 * to update devices.
	 */
	check_system_chunk(trans, flags);

	ret = btrfs_alloc_chunk(trans, flags);
	ret = do_chunk_alloc(trans, flags);
	trans->allocating_chunk = false;

	spin_lock(&space_info->lock);
@@ -3331,22 +3561,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);
	/*
	 * When we allocate a new chunk we reserve space in the chunk block
	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
	 * add new nodes/leafs to it if we end up needing to do it when
	 * inserting the chunk item and updating device items as part of the
	 * second phase of chunk allocation, performed by
	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
	 * large number of new block groups to create in our transaction
	 * handle's new_bgs list to avoid exhausting the chunk block reserve
	 * in extreme cases - like having a single transaction create many new
	 * block groups when starting to write out the free space caches of all
	 * the block groups that were made dirty during the lifetime of the
	 * transaction.
	 */
	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
		btrfs_create_pending_block_groups(trans);

	return ret;
}
@@ -3367,7 +3581,6 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *info;
	u64 left;
@@ -3382,7 +3595,6 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
	lockdep_assert_held(&fs_info->chunk_mutex);

	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
again:
	spin_lock(&info->lock);
	left = info->total_bytes - btrfs_space_info_used(info, true);
	spin_unlock(&info->lock);
@@ -3401,78 +3613,41 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)

	if (left < thresh) {
		u64 flags = btrfs_system_alloc_profile(fs_info);
		u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);

		/*
		 * If there's not available space for the chunk tree (system
		 * space) and there are other tasks that reserved space for
		 * creating a new system block group, wait for them to complete
		 * the creation of their system block group and release excess
		 * reserved space. We do this because:
		 *
		 * *) We can end up allocating more system chunks than necessary
		 *    when there are multiple tasks that are concurrently
		 *    allocating block groups, which can lead to exhaustion of
		 *    the system array in the superblock;
		 *
		 * *) If we allocate extra and unnecessary system block groups,
		 *    despite being empty for a long time, and possibly forever,
		 *    they end up not being added to the list of unused block groups
		 *    because that typically happens only when deallocating the
		 *    last extent from a block group - which never happens since
		 *    we never allocate from them in the first place. The few
		 *    exceptions are when mounting a filesystem or running scrub,
		 *    which add unused block groups to the list of unused block
		 *    groups, to be deleted by the cleaner kthread.
		 *    And even when they are added to the list of unused block
		 *    groups, it can take a long time until they get deleted,
		 *    since the cleaner kthread might be sleeping or busy with
		 *    other work (deleting subvolumes, running delayed iputs,
		 *    defrag scheduling, etc);
		 *
		 * This is rare in practice, but can happen when too many tasks
		 * are allocating block groups in parallel (via fallocate())
		 * and before the one that reserved space for a new system block
		 * group finishes the block group creation and releases the space
		 * reserved in excess (at btrfs_create_pending_block_groups()),
		 * other tasks end up here and see free system space temporarily
		 * not enough for updating the chunk tree.
		 *
		 * We unlock the chunk mutex before waiting for such tasks and
		 * lock it again after the wait, otherwise we would deadlock.
		 * It is safe to do so because allocating a system chunk is the
		 * first thing done while allocating a new block group.
		 */
		if (reserved > trans->chunk_bytes_reserved) {
			const u64 min_needed = reserved - thresh;

			mutex_unlock(&fs_info->chunk_mutex);
			wait_event(cur_trans->chunk_reserve_wait,
			   atomic64_read(&cur_trans->chunk_bytes_reserved) <=
			   min_needed);
			mutex_lock(&fs_info->chunk_mutex);
			goto again;
		}
		struct btrfs_block_group *bg;

		/*
		 * Ignore failure to create system chunk. We might end up not
		 * needing it, as we might not need to COW all nodes/leafs from
		 * the paths we visit in the chunk tree (they were already COWed
		 * or created in the current transaction for example).
		 *
		 * Also, if our caller is allocating a system chunk, do not
		 * attempt to insert the chunk item in the chunk btree, as we
		 * could deadlock on an extent buffer since our caller may be
		 * COWing an extent buffer from the chunk btree.
		 */
		bg = btrfs_alloc_chunk(trans, flags);
		if (IS_ERR(bg)) {
			ret = PTR_ERR(bg);
		} else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
			/*
			 * If we fail to add the chunk item here, we end up
			 * trying again at phase 2 of chunk allocation, at
			 * btrfs_create_pending_block_groups(). So ignore
			 * any error here.
			 */
		ret = btrfs_alloc_chunk(trans, flags);
			btrfs_chunk_alloc_add_chunk_item(trans, bg);
		}
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(fs_info->chunk_root,
					  &fs_info->chunk_block_rsv,
					  thresh, BTRFS_RESERVE_NO_FLUSH);
		if (!ret) {
			atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
		if (!ret)
			trans->chunk_bytes_reserved += thresh;
	}
}
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
fs/btrfs/block-group.h +4 −2
@@ -97,6 +97,7 @@ struct btrfs_block_group {
	unsigned int removed:1;
	unsigned int to_copy:1;
	unsigned int relocating_repair:1;
	unsigned int chunk_item_inserted:1;

	int disk_cache_state;

@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 bytes_used, u64 type,
						 u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			     bool do_chunk_alloc);
fs/btrfs/ctree.c +13 −54
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size,
					  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size, nest);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

@@ -2458,8 +2416,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0,
	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0, BTRFS_NESTING_SPLIT);
	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

@@ -3381,9 +3340,9 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0, num_doubles ?
					     BTRFS_NESTING_NEW_ROOT :
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);
+118 −29 (file changed; preview size limit exceeded, changes collapsed)

+5 −10 (file changed; preview size limit exceeded, changes collapsed)
