Commit bd1b7c13 authored by Linus Torvalds
Pull btrfs updates from David Sterba:
 "Features:

   - subpage:
      - support for PAGE_SIZE > 4K (previously only 64K)
      - make it work with raid56

   - repair super block num_devices automatically if it does not match
     the number of device items

   - defrag can convert inline extents to regular extents; up to now
     inline files were skipped, but the mount option max_inline setting
     could affect the decision logic

   - zoned:
      - minimal accepted zone size is explicitly set to 4MiB
      - make zone reclaim less aggressive and don't reclaim if there are
        enough free zones
      - add per-profile sysfs tunable of the reclaim threshold

   - allow automatic block group reclaim for non-zoned filesystems, with
     sysfs tunables (see the threshold sketch after this list)

   - tree-checker: new check, compare extent buffer owner against owner
     rootid

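   A minimal userspace model of the reclaim-threshold math added here (it
   mirrors should_reclaim_block_group() from the block-group.c hunk below);
   the 75% threshold and the sizes are illustrative, not kernel defaults:

      #include <stdbool.h>
      #include <stdint.h>
      #include <stdio.h>

      /* Mirrors the kernel helper div_factor_fine(): num * factor / 100 */
      static uint64_t div_factor_fine(uint64_t num, int factor)
      {
              return num * factor / 100;
      }

      static bool should_reclaim(uint64_t bg_length, uint64_t used_now,
                                 uint64_t bytes_freed, int reclaim_thresh)
      {
              uint64_t used_before = used_now + bytes_freed;
              uint64_t thresh;

              /* A threshold of 0 (the sysfs tunable) disables reclaim */
              if (reclaim_thresh == 0)
                      return false;
              thresh = div_factor_fine(bg_length, reclaim_thresh);
              /* Reclaim only when this free crosses the threshold downwards */
              return used_before >= thresh && used_now < thresh;
      }

      int main(void)
      {
              /* 1 GiB block group at 75%: free 300 MiB out of 800 MiB used */
              printf("%d\n", should_reclaim(1024ULL << 20, 500ULL << 20,
                                            300ULL << 20, 75)); /* prints 1 */
              return 0;
      }
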
  Performance:

   - avoid blocking on space reservation when doing nowait direct io
     writes (+7% throughput for reads and writes)

   - NOCOW write throughput improvement due to refined locking (+3%; a
     caller sketch follows this list)

   - send: reduce pressure on the page cache by dropping extent pages
     right after they're processed

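   The NOCOW locking change shows up in the block-group hunks below:
   btrfs_inc_nocow_writers() now returns the block group it looked up, so
   the matching decrement no longer repeats the rbtree search. A
   hypothetical caller, sketched against the new prototypes from the
   block-group.h hunk (the error code and surrounding logic are
   illustrative):

      struct btrfs_block_group *bg;

      /*
       * One lookup: returns the bg with nocow_writers elevated, or NULL
       * if the block group is missing or currently read-only.
       */
      bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
      if (!bg)
              return -EAGAIN; /* illustrative: fall back to a COW write */

      /* ... create the ordered extent for the NOCOW write ... */

      /* No second lookup; this also drops the reference taken above */
      btrfs_dec_nocow_writers(bg);
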
  Core:

   - convert all radix trees to xarray (see the sketch after this list)

   - add iterators for b-tree node items

   - support printk message index

   - use bulk page allocation for extent buffers

   - switch to bio_alloc API, use on-stack bios where convenient, other
     bio cleanups

   - use rw lock for block groups to favor concurrent reads

   - simplify workqueues, don't allocate high priority threads for all
     normal queues as we need only one

   - refactor scrub, process chunks based on their constraints and
     similarity

   - allocate direct io structures on stack and pass around only
     pointers, which avoids allocations and trims error handling paths

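   For the radix tree conversion, the xarray API folds the old
   preload/insert/lookup/delete dance into single calls with internal
   locking. A minimal sketch of the pattern (the xarray and the function
   names are hypothetical, not taken from this series):

      #include <linux/xarray.h>

      static DEFINE_XARRAY(fs_roots); /* hypothetical xarray */

      static int track_root(unsigned long root_id, void *root)
      {
              /* Returns the previous entry, or an xa_err()-encoded error */
              void *old = xa_store(&fs_roots, root_id, root, GFP_NOFS);

              if (xa_is_err(old))
                      return xa_err(old);
              return 0;
      }

      static void *find_root(unsigned long root_id)
      {
              return xa_load(&fs_roots, root_id); /* RCU-safe for readers */
      }

      static void drop_root(unsigned long root_id)
      {
              xa_erase(&fs_roots, root_id);
      }
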
  Fixes:

   - fix count of reserved transaction items for various inode
     operations

   - fix deadlock between concurrent dio writes when low on free data
     space

   - fix a few cases when zones need to be finished

  VFS, iomap:

   - add helper to check if sb write has started (usable for assertions;
     see the sketch after this message)

   - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io"

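The sb-write helper mentioned in the message lends itself to assertions in
paths that must run inside an sb_start_write() section. A hypothetical
btrfs-style use, assuming the helper is named sb_write_started() and takes
a struct super_block pointer:

   /* Must only run with a write section active on the superblock */
   ASSERT(sb_write_started(fs_info->sb));
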
* tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits)
  btrfs: zoned: introduce a minimal zone size 4M and reject mount
  btrfs: allow defrag to convert inline extents to regular extents
  btrfs: add "0x" prefix for unsupported optional features
  btrfs: do not account twice for inode ref when reserving metadata units
  btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
  btrfs: send: avoid trashing the page cache
  btrfs: send: keep the current inode open while processing it
  btrfs: allocate the btrfs_dio_private as part of the iomap dio bio
  btrfs: move struct btrfs_dio_private to inode.c
  btrfs: remove the disk_bytenr in struct btrfs_dio_private
  btrfs: allocate dio_data on stack
  iomap: add per-iomap_iter private data
  iomap: allow the file system to provide a bio_set for direct I/O
  btrfs: add a btrfs_dio_rw wrapper
  btrfs: zoned: zone finish unused block group
  btrfs: zoned: properly finish block group on metadata write
  btrfs: zoned: finish block group when there are no more allocatable bytes left
  btrfs: zoned: consolidate zone finish functions
  btrfs: zoned: introduce btrfs_zoned_bg_is_full
  btrfs: improve error reporting in lookup_inline_extent_backref
  ...
parents 3842007b 0a05fafe
fs/btrfs/acl.c: +3 −36
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
	return acl;
}

static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
			   struct user_namespace *mnt_userns,
			   struct inode *inode, struct posix_acl *acl, int type)
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
		    struct posix_acl *acl, int type)
{
	int ret, size = 0;
	const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
		if (ret)
			return ret;
	}
	ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
	ret = __btrfs_set_acl(NULL, inode, acl, type);
	if (ret)
		inode->i_mode = old_mode;
	return ret;
}

int btrfs_init_acl(struct btrfs_trans_handle *trans,
		   struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	int ret = 0;

	/* this happens with subvols */
	if (!dir)
		return 0;

	ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
	if (ret)
		return ret;

	if (default_acl) {
		ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
				      ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}

	if (acl) {
		if (!ret)
			ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
					      ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}

	if (!default_acl && !acl)
		cache_no_acl(inode);
	return ret;
}
fs/btrfs/async-thread.c: +22 −100
@@ -15,13 +15,12 @@
enum {
	WORK_DONE_BIT,
	WORK_ORDER_DONE_BIT,
	WORK_HIGH_PRIO_BIT,
};

#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)

struct __btrfs_workqueue {
struct btrfs_workqueue {
	struct workqueue_struct *normal_wq;

	/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
	spinlock_t thres_lock;
};

struct btrfs_workqueue {
	struct __btrfs_workqueue *normal;
	struct __btrfs_workqueue *high;
};

struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
	return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
	/*
	 * We could compare wq->normal->pending with num_online_cpus()
	 * We could compare wq->pending with num_online_cpus()
	 * to support "thresh == NO_THRESHOLD" case, but it requires
	 * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
	 * postpone it until someone needs the support of that case.
	 */
	if (wq->normal->thresh == NO_THRESHOLD)
	if (wq->thresh == NO_THRESHOLD)
		return false;

	return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
	return atomic_read(&wq->pending) > wq->thresh * 2;
}

static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
			unsigned int flags, int limit_active, int thresh)
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
					      const char *name, unsigned int flags,
					      int limit_active, int thresh)
{
	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);

	if (!ret)
		return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
		ret->thresh = thresh;
	}

	if (flags & WQ_HIGHPRI)
		ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
						 ret->current_active, name);
	else
		ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
						 ret->current_active, name);
	ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
					 name);
	if (!ret->normal_wq) {
		kfree(ret);
		return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
	INIT_LIST_HEAD(&ret->ordered_list);
	spin_lock_init(&ret->list_lock);
	spin_lock_init(&ret->thres_lock);
	trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
	return ret;
}

static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);

struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
					      const char *name,
					      unsigned int flags,
					      int limit_active,
					      int thresh)
{
	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);

	if (!ret)
		return NULL;

	ret->normal = __btrfs_alloc_workqueue(fs_info, name,
					      flags & ~WQ_HIGHPRI,
					      limit_active, thresh);
	if (!ret->normal) {
		kfree(ret);
		return NULL;
	}

	if (flags & WQ_HIGHPRI) {
		ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
						    limit_active, thresh);
		if (!ret->high) {
			__btrfs_destroy_workqueue(ret->normal);
			kfree(ret);
			return NULL;
		}
	}
	trace_btrfs_workqueue_alloc(ret, name);
	return ret;
}

@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 * This hook WILL be called in IRQ handler context,
 * so workqueue_set_max_active MUST NOT be called in this hook
 */
static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
	if (wq->thresh == NO_THRESHOLD)
		return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
 * This hook is called in kthread context.
 * So workqueue_set_max_active is called here.
 */
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
	int new_current_active;
	long pending;
@@ -217,7 +173,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
	}
}

static void run_ordered_work(struct __btrfs_workqueue *wq,
static void run_ordered_work(struct btrfs_workqueue *wq,
			     struct btrfs_work *self)
{
	struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
	struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
					       normal_work);
	struct __btrfs_workqueue *wq;
	struct btrfs_workqueue *wq = work->wq;
	int need_order = 0;

	/*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
	 */
	if (work->ordered_func)
		need_order = 1;
	wq = work->wq;

	trace_btrfs_work_sched(work);
	thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
	work->flags = 0;
}

static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
				      struct btrfs_work *work)
void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
	unsigned long flags;

@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
	queue_work(wq->normal_wq, &work->normal_work);
}

void btrfs_queue_work(struct btrfs_workqueue *wq,
		      struct btrfs_work *work)
{
	struct __btrfs_workqueue *dest_wq;

	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
		dest_wq = wq->high;
	else
		dest_wq = wq->normal;
	__btrfs_queue_work(dest_wq, work);
}

static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
	destroy_workqueue(wq->normal_wq);
	trace_btrfs_workqueue_destroy(wq);
	kfree(wq);
}

void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
	if (!wq)
		return;
	if (wq->high)
		__btrfs_destroy_workqueue(wq->high);
	__btrfs_destroy_workqueue(wq->normal);
	destroy_workqueue(wq->normal_wq);
	trace_btrfs_workqueue_destroy(wq);
	kfree(wq);
}

void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
	if (!wq)
		return;
	wq->normal->limit_active = limit_active;
	if (wq->high)
		wq->high->limit_active = limit_active;
}

void btrfs_set_work_high_priority(struct btrfs_work *work)
{
	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
	if (wq)
		wq->limit_active = limit_active;
}

void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
	if (wq->high)
		flush_workqueue(wq->high->normal_wq);

	flush_workqueue(wq->normal->normal_wq);
	flush_workqueue(wq->normal_wq);
}
fs/btrfs/async-thread.h: +2 −5
@@ -11,8 +11,6 @@

struct btrfs_fs_info;
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_work_func_t)(struct work_struct *arg);
@@ -25,7 +23,7 @@ struct btrfs_work {
	/* Don't touch things below */
	struct work_struct normal_work;
	struct list_head ordered_list;
	struct __btrfs_workqueue *wq;
	struct btrfs_workqueue *wq;
	unsigned long flags;
};

@@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
		      struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);

fs/btrfs/block-group.c: +125 −80
@@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;
	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
@@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->start)
		info->first_logical_byte = block_group->start;
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	spin_unlock(&info->block_group_cache_lock);
	write_unlock(&info->block_group_cache_lock);

	return 0;
}
@@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;
	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
			break;
		}
	}
	if (ret) {
	if (ret)
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->start)
			info->first_logical_byte = ret->start;
	}
	spin_unlock(&info->block_group_cache_lock);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}
@@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	spin_lock(&fs_info->block_group_cache_lock);
	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		spin_unlock(&fs_info->block_group_cache_lock);
		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
		return cache;
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
@@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	spin_unlock(&fs_info->block_group_cache_lock);
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
/**
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:       The filesystem information object.
 * @bytenr:        Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increment the
 * number of NOCOW writers in the block group that contains the extent, as
 * long as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write; the
 *          caller is responsible for calling btrfs_dec_nocow_writers() later.
 *
 *          Or NULL if we cannot do a NOCOW write.
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool ret = true;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return false;
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	/* No put on block group, done by btrfs_dec_nocow_writers */
	if (!ret)
	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	return ret;
	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}

void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
/**
 * Decrement the number of NOCOW writers in a block group.
 *
 * @bg:       The block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);
	/*
	 * Once for our lookup and once for the lookup done by a previous call
	 * to btrfs_inc_nocow_writers()
	 */
	btrfs_put_block_group(bg);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}

@@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
	cache->has_caching_ctl = 1;
	spin_unlock(&cache->lock);

	spin_lock(&fs_info->block_group_cache_lock);
	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	spin_unlock(&fs_info->block_group_cache_lock);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

@@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
	if (ret)
		goto out;

	spin_lock(&fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	if (fs_info->first_logical_byte == block_group->start)
		fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&fs_info->block_group_cache_lock);
	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
@@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		spin_lock(&fs_info->block_group_cache_lock);
		write_lock(&fs_info->block_group_cache_lock);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

@@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		spin_unlock(&fs_info->block_group_cache_lock);
		write_unlock(&fs_info->block_group_cache_lock);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			btrfs_put_caching_control(caching_ctl);
@@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
@@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
	return bg1->used > bg2->used;
}

static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
	if (btrfs_is_zoned(fs_info))
		return btrfs_zoned_should_reclaim(fs_info);
	return true;
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
@@ -1522,6 +1555,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (!btrfs_should_reclaim(fs_info))
		return;

	sb_start_write(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@@ -1692,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	int ret;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

	btrfs_for_each_slot(root, key, &found_key, path, ret) {
		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = read_bg_from_eb(fs_info, &found_key, path);
			break;
			return read_bg_from_eb(fs_info, &found_key, path);
		}

		path->slots[0]++;
	}
out:
	return ret;
}

@@ -3220,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
	return ret;
}

static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
					      u64 bytes_freed)
{
	const struct btrfs_space_info *space_info = bg->space_info;
	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
	const u64 new_val = bg->used;
	const u64 old_val = new_val + bytes_freed;
	u64 thresh;

	if (reclaim_thresh == 0)
		return false;

	thresh = div_factor_fine(bg->length, reclaim_thresh);

	/*
	 * If we were below the threshold before don't reclaim, we are likely a
	 * brand new block group and we don't want to relocate new block groups.
	 */
	if (old_val < thresh)
		return false;
	if (new_val >= thresh)
		return false;
	return true;
}

int btrfs_update_block_group(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3242,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
	spin_unlock(&info->delalloc_root_lock);

	while (total) {
		bool reclaim;

		cache = btrfs_lookup_block_group(info, bytenr);
		if (!cache) {
			ret = -ENOENT;
@@ -3287,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
					cache->space_info, num_bytes);
			cache->space_info->bytes_used -= num_bytes;
			cache->space_info->disk_used -= num_bytes * factor;

			reclaim = should_reclaim_block_group(cache, num_bytes);
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);

@@ -3313,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
		if (!alloc && old_val == 0) {
			if (!btrfs_test_opt(info, DISCARD_ASYNC))
				btrfs_mark_bg_unused(cache);
		} else if (!alloc && reclaim) {
			btrfs_mark_bg_to_reclaim(cache);
		}

		btrfs_put_block_group(cache);
@@ -3957,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	spin_lock(&info->block_group_cache_lock);
	write_lock(&info->block_group_cache_lock);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		btrfs_put_caching_control(caching_ctl);
	}
	spin_unlock(&info->block_group_cache_lock);
	write_unlock(&info->block_group_cache_lock);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
@@ -3994,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
	}
	spin_unlock(&info->zone_active_bgs_lock);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
	write_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group,
				       cache_node);
		rb_erase(&block_group->cache_node,
		rb_erase_cached(&block_group->cache_node,
				&info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		spin_unlock(&info->block_group_cache_lock);
		write_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
@@ -4024,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
		ASSERT(block_group->swap_extents == 0);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
		write_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);
	write_unlock(&info->block_group_cache_lock);

	btrfs_release_global_block_rsv(info);

fs/btrfs/block-group.h: +5 −2
@@ -212,6 +212,8 @@ struct btrfs_block_group {
	u64 meta_write_pointer;
	struct map_lookup *physical_map;
	struct list_head active_bg_list;
	struct work_struct zone_finish_work;
	struct extent_buffer *last_eb;
};

static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -254,8 +256,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
				           u64 num_bytes);