Commit 76e45035 authored by Linus Torvalds
Pull btrfs updates from David Sterba:
 "There's a bunch of performance improvements, most notably the FIEMAP
  speedup, the new block group tree to speed up mount on large
  filesystems, more io_uring integration, some sysfs exports and the
  usual fixes and core updates.

  Summary:

  Performance:

   - outstanding FIEMAP speed improvement
      - algorithmic change to how extents are enumerated leads to an
        orders-of-magnitude speed boost (uncached and cached)
      - extent sharing check speedup (2.2x uncached, 3x cached)
      - add more cancellation points, making it possible to interrupt
        seeking in files with a large number of extents
      - more efficient hole and data seeking (4x uncached, 1.3x cached)
      - sample results:
	    256M, 32K extents:   4s ->  29ms  (~150x)
	    512M, 64K extents:  30s ->  59ms  (~550x)
	    1G,  128K extents: 225s -> 120ms (~1800x)
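
     (Illustration, not from this pull request: the interface being sped
     up is the FIEMAP ioctl. A minimal userspace probe that enumerates a
     file's extents, handy for timing runs like the ones above, could
     look like the following; error handling is kept terse.)

	/* fiemap-count.c - count a file's extents via FS_IOC_FIEMAP */
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		unsigned int count = 512, total = 0;
		struct fiemap *fm;
		__u64 start = 0;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		/* Header plus room for 512 extent records per call. */
		fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
		if (!fm)
			return 1;
		for (;;) {
			struct fiemap_extent *last;

			fm->fm_start = start;
			fm->fm_length = ~0ULL;
			fm->fm_extent_count = count;
			if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
				return 1;
			if (fm->fm_mapped_extents == 0)
				break;
			total += fm->fm_mapped_extents;
			last = &fm->fm_extents[fm->fm_mapped_extents - 1];
			if (last->fe_flags & FIEMAP_EXTENT_LAST)
				break;
			start = last->fe_logical + last->fe_length;
		}
		printf("%u extents\n", total);
		free(fm);
		close(fd);
		return 0;
	}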

   - improved inode logging, especially for directories (on dbench
     workload throughput +25%, max latency -21%)

   - improved buffered IO: removed redundant extent state tracking,
     lowering memory consumption and avoiding rb tree traversals

   - add a sysfs tunable to let qgroup temporarily skip exact accounting
     when deleting a snapshot; this speeds up the deletion but requires
     a rescan afterwards, and will be used by snapper
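
     (Illustration: a tool like snapper could flip the knob before a
     snapshot delete. The sysfs file name, drop_subtree_threshold under
     the qgroups directory, and the meaning of the value written are
     inferred from this series and should be treated as assumptions.)

	/* qgroup-skip.c - hypothetical knob write; verify the file name
	 * against your kernel before relying on it. */
	#include <stdio.h>

	int main(void)
	{
		/* <FSID> stands for the filesystem UUID. */
		FILE *f = fopen("/sys/fs/btrfs/<FSID>/qgroups/drop_subtree_threshold",
				"w");

		if (!f)
			return 1;
		/* Assumed semantics: subtree drops at or above this tree
		 * level skip exact accounting; 0 would skip for every
		 * snapshot drop. */
		fprintf(f, "0\n");
		fclose(f);
		return 0;	/* a qgroup rescan is needed afterwards */
	}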

   - support io_uring with buffered writes; until now it was available
     only for direct IO. With the no-wait semantics implemented in the
     buffered write path it now works, leading to speed improvements in
     IOPS (2x), throughput (2.2x) and latency (workload dependent, 2x to
     150x)
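
     (Illustration, hand-written with liburing rather than taken from
     the series: a buffered write submitted through io_uring, the case
     that previously had to fall back to blocking worker threads.)

	/* uring-bufwrite.c - build with: cc uring-bufwrite.c -luring */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[4096];
		int fd;

		/* Plain buffered file, note the absence of O_DIRECT. */
		fd = open("testfile", O_WRONLY | O_CREAT, 0644);
		if (fd < 0)
			return 1;
		memset(buf, 'a', sizeof(buf));

		if (io_uring_queue_init(8, &ring, 0) < 0)
			return 1;
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
		io_uring_submit(&ring);

		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("write returned %d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}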

   - small performance improvements when dropping and searching for
     extent maps as well as when flushing delalloc in COW mode
     (throughput +5MB/s)

  User visible changes:

   - new incompatible feature block-group-tree adding a dedicated tree
     for tracking block groups; this allows a much faster load during
     mount and avoids the seeking needed when the items are scattered
     across the extent tree
      - this reduces mount time for many-terabyte sized filesystems
      - conversion tool will be provided so existing filesystem can also
        be updated in place
      - to reduce the test matrix and feature combinations, it requires
        the no-holes and free-space-tree features (mkfs defaults since
        5.15)

   - improved reporting of super block corruption detected by scrub

   - scrub also tries to repair the super block and does not wait until
     the next commit

   - discard stats and tunables are exported in sysfs
     (/sys/fs/btrfs/FSID/discard)

   - qgroup status is exported in sysfs
     (/sys/fs/btrfs/FSID/qgroups/)

   - verify that super block was not modified when thawing filesystem

  Fixes:

   - FIEMAP fixes
      - fix extent sharing status so it does not depend on the cached
        status of merged extents
      - flush delalloc so compressed extents are reported correctly

   - fix alignment of VMA for memory mapped files on THP

   - send: fix failures when processing inodes with no links (orphan
     files and directories)

   - fix race between quota enable and quota rescan ioctl

   - handle more corner cases for read-only compat feature verification

   - fix missed extent on fsync after dropping extent maps

  Core:

   - lockdep annotations to validate various transaction states and
     state transitions

   - preliminary support for fs-verity in send

   - more effective memory use in scrub for subpage, where the sector
     size is smaller than the page size

   - block group caching progress logic has been removed, load is now
     synchronous

   - simplify end IO callbacks and bio handling, use chained bios
     instead of custom completion tracking
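
     (Illustration of the generic pattern, hand-written against the
     block layer API rather than copied from the btrfs changes;
     MAX_CHUNK_SECTORS is a hypothetical split limit. bio_chain() defers
     the parent's completion until every chained child has completed, so
     no private reference counting is needed.)

	#include <linux/bio.h>

	#define MAX_CHUNK_SECTORS 2048	/* hypothetical */

	static void submit_in_chunks(struct bio *parent)
	{
		while (bio_sectors(parent) > MAX_CHUNK_SECTORS) {
			struct bio *split;

			split = bio_split(parent, MAX_CHUNK_SECTORS, GFP_NOFS,
					  &fs_bio_set);
			bio_chain(split, parent); /* parent waits for split */
			submit_bio(split);
		}
		/* Parent's end_io runs once it and all chunks are done. */
		submit_bio(parent);
	}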

   - add no-wait semantics to several functions (tree search, nocow,
     flushing, buffered write)
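
     (The recurring shape, illustrative only; struct example_ctx is a
     placeholder rather than a btrfs type. A helper that could block
     instead returns -EAGAIN in nowait mode so io_uring can retry the
     operation from a worker context.)

	#include <linux/errno.h>
	#include <linux/mutex.h>

	struct example_ctx {
		struct mutex lock;	/* hypothetical */
	};

	static int do_locked_work(struct example_ctx *ctx, bool nowait)
	{
		if (nowait) {
			if (!mutex_trylock(&ctx->lock))
				return -EAGAIN;	/* would block, caller retries */
		} else {
			mutex_lock(&ctx->lock);
		}

		/* ... the actual work ... */

		mutex_unlock(&ctx->lock);
		return 0;
	}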

   - cleanups and refactoring

  MM changes:

   - export balance_dirty_pages_ratelimited_flags"
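
The export exists so a filesystem's nowait buffered write path can throttle
page dirtying without sleeping. A minimal sketch of the intended call
pattern (hand-written, assuming the BDP_ASYNC flag as declared in
linux/writeback.h):

	#include <linux/writeback.h>

	static int throttle_dirtying(struct address_space *mapping, bool nowait)
	{
		unsigned int flags = nowait ? BDP_ASYNC : 0;
		int ret;

		ret = balance_dirty_pages_ratelimited_flags(mapping, flags);
		if (ret == -EAGAIN)	/* only returned in BDP_ASYNC mode */
			return ret;	/* bubble up, io_uring will retry */
		return 0;
	}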

* tag 'for-6.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (177 commits)
  btrfs: set generation before calling btrfs_clean_tree_block in btrfs_init_new_buffer
  btrfs: drop extent map range more efficiently
  btrfs: avoid pointless extent map tree search when flushing delalloc
  btrfs: remove unnecessary next extent map search
  btrfs: remove unnecessary NULL pointer checks when searching extent maps
  btrfs: assert tree is locked when clearing extent map from logging
  btrfs: remove unnecessary extent map initializations
  btrfs: remove the refcount warning/check at free_extent_map()
  btrfs: add helper to replace extent map range with a new extent map
  btrfs: move open coded extent map tree deletion out of inode eviction
  btrfs: use cond_resched_rwlock_write() during inode eviction
  btrfs: use extent_map_end() at btrfs_drop_extent_map_range()
  btrfs: move btrfs_drop_extent_cache() to extent_map.c
  btrfs: fix missed extent on fsync after dropping extent maps
  btrfs: remove stale prototype of btrfs_write_inode
  btrfs: enable nowait async buffered writes
  btrfs: assert nowait mode is not used for some btree search functions
  btrfs: make btrfs_buffered_write nowait compatible
  btrfs: plumb NOWAIT through the write path
  btrfs: make lock_and_cleanup_extent_if_need nowait compatible
  ...
parents 4c0ed7d8 cbddcc4f
fs/btrfs/Makefile: +1 −1
@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
	   subpage.o tree-mod-log.o
	   subpage.o tree-mod-log.o extent-io-tree.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
fs/btrfs/backref.c: +145 −10
@@ -1511,16 +1511,118 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
	return ret;
}

/**
 * Check if an extent is shared or not
/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
					struct btrfs_root *root,
					u64 bytenr, int level, bool *is_shared)
{
	struct btrfs_backref_shared_cache_entry *entry;

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
		return false;

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
	ASSERT(level >= 0);

	entry = &cache->entries[level];

	/* Unused cache entry or being used for some other extent buffer. */
	if (entry->bytenr != bytenr)
		return false;

	/*
	 * We cached a false result, but the last snapshot generation of the
	 * root changed, so we now have a snapshot. Don't trust the result.
	 */
	if (!entry->is_shared &&
	    entry->gen != btrfs_root_last_snapshot(&root->root_item))
		return false;

	/*
	 * If we cached a true result and the last generation used for dropping
	 * a root changed, we can not trust the result, because the dropped root
	 * could be a snapshot sharing this extent buffer.
	 */
	if (entry->is_shared &&
	    entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
		return false;

	*is_shared = entry->is_shared;

	return true;
}

/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
				       struct btrfs_root *root,
				       u64 bytenr, int level, bool is_shared)
{
	struct btrfs_backref_shared_cache_entry *entry;
	u64 gen;

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
		return;

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
	ASSERT(level >= 0);

	if (is_shared)
		gen = btrfs_get_last_root_drop_gen(root->fs_info);
	else
		gen = btrfs_root_last_snapshot(&root->root_item);

	entry = &cache->entries[level];
	entry->bytenr = bytenr;
	entry->is_shared = is_shared;
	entry->gen = gen;

	/*
	 * If we found an extent buffer is shared, set the cache result for all
	 * extent buffers below it to true. As nodes in the path are COWed,
	 * their sharedness is moved to their children, and if a leaf is COWed,
	 * then the sharedness of a data extent becomes direct, the refcount of
	 * data extent is increased in the extent item at the extent tree.
	 */
	if (is_shared) {
		for (int i = 0; i < level; i++) {
			entry = &cache->entries[i];
			entry->is_shared = is_shared;
			entry->gen = gen;
		}
	}
}

/*
 * Check if a data extent is shared or not.
 *
 * @root:   root inode belongs to
 * @inum:   inode number of the inode whose extent we are checking
 * @bytenr: logical bytenr of the extent we are checking
 * @roots:  list of roots this extent is shared among
 * @tmp:    temporary list used for iteration
 * @root:        The root the inode belongs to.
 * @inum:        Number of the inode whose extent we are checking.
 * @bytenr:      Logical bytenr of the extent we are checking.
 * @extent_gen:  Generation of the extent (file extent item) or 0 if it is
 *               not known.
 * @roots:       List of roots this extent is shared among.
 * @tmp:         Temporary list used for iteration.
 * @cache:       A backref lookup result cache.
 *
 * btrfs_check_shared uses the backref walking code but will short
 * btrfs_is_data_extent_shared uses the backref walking code but will short
 * circuit as soon as it finds a root or inode that doesn't match the
 * one passed in. This provides a significant performance benefit for
 * callers (such as fiemap) which want to know whether the extent is
@@ -1531,8 +1633,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
		struct ulist *roots, struct ulist *tmp)
int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
				u64 extent_gen,
				struct ulist *roots, struct ulist *tmp,
				struct btrfs_backref_shared_cache *cache)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
@@ -1545,6 +1649,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
		.inum = inum,
		.share_count = 0,
	};
	int level;

	ulist_init(roots);
	ulist_init(tmp);
@@ -1561,22 +1666,52 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
		btrfs_get_tree_mod_seq(fs_info, &elem);
	}

	/* -1 means we are in the bytenr of the data extent. */
	level = -1;
	ULIST_ITER_INIT(&uiter);
	while (1) {
		bool is_shared;
		bool cached;

		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
					roots, NULL, &shared, false);
		if (ret == BACKREF_FOUND_SHARED) {
			/* this is the only condition under which we return 1 */
			ret = 1;
			if (level >= 0)
				store_backref_shared_cache(cache, root, bytenr,
							   level, true);
			break;
		}
		if (ret < 0 && ret != -ENOENT)
			break;
		ret = 0;
		/*
		 * If our data extent is not shared through reflinks and it was
		 * created in a generation after the last one used to create a
		 * snapshot of the inode's root, then it can not be shared
		 * indirectly through subtrees, as that can only happen with
		 * snapshots. In this case bail out, no need to check for the
		 * sharedness of extent buffers.
		 */
		if (level == -1 &&
		    extent_gen > btrfs_root_last_snapshot(&root->root_item))
			break;

		if (level >= 0)
			store_backref_shared_cache(cache, root, bytenr,
						   level, false);
		node = ulist_next(tmp, &uiter);
		if (!node)
			break;
		bytenr = node->val;
		level++;
		cached = lookup_backref_shared_cache(cache, root, bytenr, level,
						     &is_shared);
		if (cached) {
			ret = (is_shared ? 1 : 0);
			break;
		}
		shared.share_count = 0;
		cond_resched();
	}
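
/*
 * Editor's illustration, not part of the diff: a sketch of how a
 * fiemap-style caller might drive btrfs_is_data_extent_shared() with the
 * new cache. The caller zero-initializes one cache on its stack and
 * reuses it for every extent of the file; root, inode, bytenr and
 * extent_gen are assumed to be supplied by the surrounding code.
 */
static int check_one_extent(struct btrfs_root *root, struct btrfs_inode *inode,
			    u64 bytenr, u64 extent_gen,
			    struct ulist *roots, struct ulist *tmp,
			    struct btrfs_backref_shared_cache *cache)
{
	int ret;

	ret = btrfs_is_data_extent_shared(root, btrfs_ino(inode), bytenr,
					  extent_gen, roots, tmp, cache);
	if (ret < 0)
		return ret;	/* error */

	/* ret == 1: shared, e.g. set FIEMAP_EXTENT_SHARED; 0: not shared. */
	return ret;
}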
fs/btrfs/backref.h: +18 −2
@@ -17,6 +17,20 @@ struct inode_fs_paths {
	struct btrfs_data_container	*fspath;
};

struct btrfs_backref_shared_cache_entry {
	u64 bytenr;
	u64 gen;
	bool is_shared;
};

struct btrfs_backref_shared_cache {
	/*
	 * A path from a root to a leaf that has a file extent item pointing to
	 * a given data extent should never exceed the maximum b+tree height.
	 */
	struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
};

typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
		void *ctx);

@@ -62,8 +76,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
			  u64 start_off, struct btrfs_path *path,
			  struct btrfs_inode_extref **ret_extref,
			  u64 *found_off);
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
		struct ulist *roots, struct ulist *tmp_ulist);
int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
				u64 extent_gen,
				struct ulist *roots, struct ulist *tmp,
				struct btrfs_backref_shared_cache *cache);

int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
fs/btrfs/block-group.c: +76 −106
@@ -593,8 +593,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
@@ -618,9 +616,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}
@@ -655,7 +650,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)

	total_found += add_new_free_space(block_group, last,
				block_group->start + block_group->length);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
@@ -725,8 +719,6 @@ static noinline void caching_thread(struct btrfs_work *work)
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);
@@ -755,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->start;
	refcount_set(&caching_ctl->count, 2);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

@@ -772,7 +763,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	cache->has_caching_ctl = 1;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
@@ -784,8 +774,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	/* REVIEW */
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
		/* wait_event(caching_ctl->wait, space_cache_v1_done(cache)); */
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

@@ -988,32 +980,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = btrfs_get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {

	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &fs_info->caching_block_groups, list)
		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1034,12 +1025,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
		WARN_ON(block_group->zone_is_active &&
		WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				 &block_group->runtime_flags) &&
			block_group->space_info->active_total_bytes
			< block_group->length);
	}
	block_group->space_info->total_bytes -= block_group->length;
	if (block_group->zone_is_active)
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
		block_group->space_info->active_total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
@@ -1069,7 +1061,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
		goto out;

	spin_lock(&block_group->lock);
	block_group->removed = 1;
	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
@@ -1304,6 +1297,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
@@ -1543,6 +1539,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	if (!btrfs_should_reclaim(fs_info))
		return;

@@ -1890,16 +1889,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
	return 0;
}

static void link_block_group(struct btrfs_block_group *cache)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int index = btrfs_bg_flags_to_raid_index(cache->flags);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}

static struct btrfs_block_group *btrfs_create_block_group_cache(
		struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1937,7 +1926,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
	atomic_set(&cache->frozen, 0);
	mutex_init(&cache->free_space_lock);
	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
	cache->full_stripe_locks_root.root = RB_ROOT;
	mutex_init(&cache->full_stripe_locks_root.lock);

	return cache;
}
@@ -2002,7 +1992,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
				int need_clear)
{
	struct btrfs_block_group *cache;
	struct btrfs_space_info *space_info;
	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
	int ret;

@@ -2078,11 +2067,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
		/* Should not have any excluded extents. Just in case, though. */
		btrfs_free_excluded_extents(cache);
	} else if (cache->length == cache->used) {
		cache->last_byte_to_unpin = (u64)-1;
		cache->cached = BTRFS_CACHE_FINISHED;
		btrfs_free_excluded_extents(cache);
	} else if (cache->used == 0) {
		cache->last_byte_to_unpin = (u64)-1;
		cache->cached = BTRFS_CACHE_FINISHED;
		add_new_free_space(cache, cache->start,
				   cache->start + cache->length);
@@ -2095,14 +2082,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
		goto error;
	}
	trace_btrfs_add_block_group(info, cache, 0);
	btrfs_update_space_info(info, cache->flags, cache->length,
				cache->used, cache->bytes_super,
				cache->zone_unusable, cache->zone_is_active,
				&space_info);

	cache->space_info = space_info;

	link_block_group(cache);
	btrfs_add_bg_to_space_info(info, cache);

	set_avail_alloc_bits(info, cache->flags);
	if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2126,7 +2106,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct btrfs_space_info *space_info;
	struct rb_node *node;
	int ret = 0;

@@ -2146,7 +2125,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
		/* Fill dummy cache as FULL */
		bg->length = em->len;
		bg->flags = map->type;
		bg->last_byte_to_unpin = (u64)-1;
		bg->cached = BTRFS_CACHE_FINISHED;
		bg->used = em->len;
		bg->flags = map->type;
@@ -2167,10 +2145,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
			break;
		}

		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
					0, 0, false, &space_info);
		bg->space_info = space_info;
		link_block_group(bg);
		btrfs_add_bg_to_space_info(fs_info, bg);

		set_avail_alloc_bits(fs_info, bg->flags);
	}
@@ -2190,7 +2165,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
	int need_clear = 0;
	u64 cache_gen;

	if (!root)
	/*
	 * Either no extent root (with ibadroots rescue option) or we have
	 * unsupported RO options. The fs can never be mounted read-write, so no
	 * need to waste time searching block group items.
	 *
	 * This also allows new extent tree related changes to be RO compat,
	 * no need for a full incompat flag.
	 */
	if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
		      ~BTRFS_FEATURE_COMPAT_RO_SUPP))
		return fill_dummy_bgs(info);

	key.objectid = 0;
@@ -2425,7 +2409,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		ret = insert_block_group_item(trans, block_group);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		if (!block_group->chunk_item_inserted) {
		if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
			      &block_group->runtime_flags)) {
			mutex_lock(&fs_info->chunk_mutex);
			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
			mutex_unlock(&fs_info->chunk_mutex);
@@ -2494,7 +2479,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
	set_free_space_tree_thresholds(cache);
	cache->used = bytes_used;
	cache->flags = type;
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);

@@ -2519,14 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran

	btrfs_free_excluded_extents(cache);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(cache)) {
		u64 new_bytes_used = size - bytes_used;

		bytes_used += new_bytes_used >> 1;
		fragment_free_space(cache);
	}
#endif
	/*
	 * Ensure the corresponding space_info object is created and
	 * assigned to our block group. We want our bg to be added to the rbtree
@@ -2547,12 +2523,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
	 * the rbtree, update the space info's counters.
	 */
	trace_btrfs_add_block_group(fs_info, cache, 1);
	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
				cache->bytes_super, cache->zone_unusable,
				cache->zone_is_active, &cache->space_info);
	btrfs_add_bg_to_space_info(fs_info, cache);
	btrfs_update_global_block_rsv(fs_info);

	link_block_group(cache);
#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(cache)) {
		u64 new_bytes_used = size - bytes_used;

		cache->space_info->bytes_used += new_bytes_used >> 1;
		fragment_free_space(cache);
	}
#endif

	list_add_tail(&cache->bg_list, &trans->new_bgs);
	trans->delayed_ref_updates++;
@@ -2869,7 +2850,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
	cache_size *= fs_info->sectorsize;

	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
					  cache_size);
					  cache_size, false);
	if (ret)
		goto out_put;

@@ -3965,35 +3946,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
	block_group = btrfs_lookup_first_block_group(info, 0);
	while (block_group) {
		btrfs_wait_block_group_cache_done(block_group);
		spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = btrfs_next_block_group(block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}
		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
				       &block_group->runtime_flags)) {
			struct inode *inode = block_group->inode;

		inode = block_group->inode;
		block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);

			ASSERT(block_group->io_ctl.inode == NULL);
			iput(inode);
		last = block_group->start + block_group->length;
		btrfs_put_block_group(block_group);
		} else {
			spin_unlock(&block_group->lock);
		}
		block_group = btrfs_next_block_group(block_group);
	}
}

@@ -4129,7 +4099,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)

	spin_lock(&block_group->lock);
	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
		   block_group->removed);
		   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
	spin_unlock(&block_group->lock);

	if (cleanup) {
@@ -4150,7 +4120,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
		 * tasks trimming this block group have left 1 entry each one.
		 * Free them if any.
		 */
		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
		btrfs_remove_free_space_cache(block_group);
	}
}

fs/btrfs/block-group.h: +27 −12
@@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum {
	CHUNK_ALLOC_FORCE_FOR_EXTENT,
};

/* Block group flags set at runtime */
enum btrfs_block_group_flags {
	BLOCK_GROUP_FLAG_IREF,
	BLOCK_GROUP_FLAG_REMOVED,
	BLOCK_GROUP_FLAG_TO_COPY,
	BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
	BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
	BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
	BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
};

enum btrfs_caching_type {
	BTRFS_CACHE_NO,
	BTRFS_CACHE_STARTED,
	BTRFS_CACHE_FINISHED,
	BTRFS_CACHE_ERROR,
};

struct btrfs_caching_control {
	struct list_head list;
	struct mutex mutex;
	wait_queue_head_t wait;
	struct btrfs_work work;
	struct btrfs_block_group *block_group;
	u64 progress;
	refcount_t count;
};

/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M

/*
 * Tree to record all locked full stripes of a RAID5/6 block group
 */
struct btrfs_full_stripe_locks_tree {
	struct rb_root root;
	struct mutex lock;
};

struct btrfs_block_group {
	struct btrfs_fs_info *fs_info;
	struct inode *inode;
@@ -95,23 +120,15 @@ struct btrfs_block_group {

	/* For raid56, this is a full stripe, without parity */
	unsigned long full_stripe_len;
	unsigned long runtime_flags;

	unsigned int ro;
	unsigned int iref:1;
	unsigned int has_caching_ctl:1;
	unsigned int removed:1;
	unsigned int to_copy:1;
	unsigned int relocating_repair:1;
	unsigned int chunk_item_inserted:1;
	unsigned int zone_is_active:1;
	unsigned int zoned_data_reloc_ongoing:1;

	int disk_cache_state;

	/* Cache tracking stuff */
	int cached;
	struct btrfs_caching_control *caching_ctl;
	u64 last_byte_to_unpin;

	struct btrfs_space_info *space_info;

@@ -305,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
				struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		       struct block_device *bdev, u64 physical, u64 **logical,
		       int *naddrs, int *stripe_len);