Commit 55ba0fe0 authored by Linus Torvalds
Pull btrfs updates from David Sterba:
 "The updates this time are mostly stabilization, preparation and minor
  improvements.

  User visible improvements:

   - readahead for send, improving the run time of a full send by 10%
     and of an incremental send by 25%

   - make reflinks respect O_SYNC, O_DSYNC and S_SYNC flags (a usage
     sketch follows this list)

   - export supported sectorsize values in sysfs (currently only page
     size, more once full subpage support lands); see the sysfs sketch
     after this list

   - more graceful errors and warnings on 32bit systems when logical
     addresses for metadata reach the limit imposed by the unsigned
     long type of page::index
      - error: fail mount if there's a metadata block beyond the limit
      - error: new metadata block would be at unreachable address
      - warn when 5/8th of the limit is reached; for 4K page systems
        it's 10T, for 64K pages it's 160T (the arithmetic is sketched
        after this list)

   - zoned mode
      - relocated zones get reset at the end instead of being discarded
      - automatic background reclaim of zones that have 75%+ of unusable
        space, the threshold is tunable in sysfs (see the sysfs sketch
        after this list)
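
   A minimal userspace sketch of the reflink change (paths here are
   illustrative, not from the patches): with this update a clone into a
   descriptor opened with O_DSYNC is persisted before the ioctl
   returns, as a regular write to that descriptor would be.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FICLONE */

	int main(void)
	{
		int src = open("/mnt/btrfs/src", O_RDONLY);
		int dst = open("/mnt/btrfs/dst",
			       O_WRONLY | O_CREAT | O_DSYNC, 0644);

		if (src < 0 || dst < 0)
			return 1;
		/* share src's extents with dst (reflink) */
		if (ioctl(dst, FICLONE, src) < 0) {
			perror("FICLONE");
			return 1;
		}
		return 0;
	}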
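
   For reference, the 10T/160T warning thresholds follow directly from
   page::index being an unsigned long, which allows at most 2^32 page
   indexes on a 32bit system; a self-contained check of the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long pages = 1ULL << 32;	 /* 2^32 page indexes */
		unsigned long long lim4k = pages << 12;	 /* 4K pages */
		unsigned long long lim64k = pages << 16; /* 64K pages */

		printf("4K:  limit %lluT, warn at %lluT\n",
		       lim4k >> 40, lim4k / 8 * 5 >> 40);   /* 16T, 10T */
		printf("64K: limit %lluT, warn at %lluT\n",
		       lim64k >> 40, lim64k / 8 * 5 >> 40); /* 256T, 160T */
		return 0;
	}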
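
   The two sysfs attributes mentioned above read like any other sysfs
   file.  A sketch, assuming the attribute names from this series
   (supported_sectorsizes under the global features directory and a
   per-filesystem bg_reclaim_threshold; <FSID> stands for the
   filesystem UUID):

	#include <stdio.h>

	static void show(const char *path)
	{
		char buf[64];
		FILE *f = fopen(path, "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("%s: %s", path, buf);
		if (f)
			fclose(f);
	}

	int main(void)
	{
		show("/sys/fs/btrfs/features/supported_sectorsizes");
		/* zone reclaim threshold in percent; writable to tune it */
		show("/sys/fs/btrfs/<FSID>/bg_reclaim_threshold");
		return 0;
	}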

  Fixes:

   - fsync and tree mod log fixes

   - fix inefficient preemptive reclaim calculations

   - fix exhaustion of the system chunk array due to concurrent
     allocations

   - fix fallback to no compression when racing with remount

   - preemptive fix for dm-crypt on a zoned device that does not
     properly advertise zoned support

  Core changes:

   - add inode lock to synchronize mmap and other block updates (e.g.
     deduplication, fallocate, fsync); a userspace analogy of the
     locking scheme is sketched below

   - kmap conversions to the new kmap_local API (a representative
     before/after is sketched below)

   - subpage support (continued)
      - new helpers for page state/extent buffer tracking
      - metadata changes now support read and write

   - error handling throughout relocation call paths

   - many other cleanups and code simplifications"
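
The i_mmap_lock added in this series can be modeled in userspace with a
rwlock (an analogy only, not kernel code; the thread names are
illustrative): page faults take the lock shared, while operations that
rewrite a file range out of band, such as deduplication or fallocate,
take it exclusive and therefore never observe a half-updated range.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t i_mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

	static void *fault_path(void *arg)
	{
		(void)arg;
		pthread_rwlock_rdlock(&i_mmap_lock);	/* faults may run concurrently */
		puts("fault: page dirtied");
		pthread_rwlock_unlock(&i_mmap_lock);
		return NULL;
	}

	static void *dedupe_path(void *arg)
	{
		(void)arg;
		pthread_rwlock_wrlock(&i_mmap_lock);	/* excludes all faults */
		puts("dedupe: extents swapped");
		pthread_rwlock_unlock(&i_mmap_lock);
		return NULL;
	}

	int main(void)	/* build with -lpthread */
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, fault_path, NULL);
		pthread_create(&t2, NULL, dedupe_path, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		return 0;
	}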
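
The kmap conversions follow this pattern (a representative sketch in
diff form, not one of the hunks from this pull; kmap_local_page()
returns a mapping private to the current context, and kunmap_local()
takes the returned address rather than the page):

	-	kaddr = kmap(page);
	+	kaddr = kmap_local_page(page);
	 	memcpy(buf, kaddr + offset, len);
	-	kunmap(page);
	+	kunmap_local(kaddr);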

* tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (112 commits)
  btrfs: zoned: automatically reclaim zones
  btrfs: rename delete_unused_bgs_mutex to reclaim_bgs_lock
  btrfs: zoned: reset zones of relocated block groups
  btrfs: more graceful errors/warnings on 32bit systems when reaching limits
  btrfs: zoned: fix unpaired block group unfreeze during device replace
  btrfs: fix race when picking most recent mod log operation for an old root
  btrfs: fix metadata extent leak after failure to create subvolume
  btrfs: handle remount to no compress during compression
  btrfs: zoned: fail mount if the device does not support zone append
  btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
  btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
  btrfs: make lock_extent_buffer_for_io() to be subpage compatible
  btrfs: introduce write_one_subpage_eb() function
  btrfs: introduce end_bio_subpage_eb_writepage() function
  btrfs: check return value of btrfs_commit_transaction in relocation
  btrfs: do proper error handling in merge_reloc_roots
  btrfs: handle extent corruption with select_one_root properly
  btrfs: cleanup error handling in prepare_to_merge
  btrfs: do not panic in __add_reloc_root
  btrfs: handle __add_reloc_root failures in btrfs_recover_relocation
  ...
parents 2a19866b 18bb8bbf
fs/btrfs/Makefile (+1 −1)
@@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
-	   subpage.o
+	   subpage.o tree-mod-log.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
fs/btrfs/backref.c (+17 −16)
@@ -14,6 +14,7 @@
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"

/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
	if (path->slots[0] >= btrfs_header_nritems(eb) ||
	    is_shared_data_backref(preftrees, eb->start) ||
	    ref->root_id != btrfs_header_owner(eb)) {
-		if (time_seq == SEQ_LAST)
+		if (time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_leaf(root, path);
		else
			ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
		if (slot == 0 &&
		    (is_shared_data_backref(preftrees, eb->start) ||
		     ref->root_id != btrfs_header_owner(eb))) {
-			if (time_seq == SEQ_LAST)
+			if (time_seq == BTRFS_SEQ_LAST)
				ret = btrfs_next_leaf(root, path);
			else
				ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
			eie = NULL;
		}
next:
-		if (time_seq == SEQ_LAST)
+		if (time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_item(root, path);
		else
			ret = btrfs_next_old_item(root, path, time_seq);
@@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,

	if (path->search_commit_root)
		root_level = btrfs_header_level(root->commit_root);
-	else if (time_seq == SEQ_LAST)
+	else if (time_seq == BTRFS_SEQ_LAST)
		root_level = btrfs_header_level(root->node);
	else
		root_level = btrfs_old_root_level(root, time_seq);
@@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
	    search_key.offset >= LLONG_MAX)
		search_key.offset = 0;
	path->lowest_level = level;
-	if (time_seq == SEQ_LAST)
+	if (time_seq == BTRFS_SEQ_LAST)
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	else
		ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 * indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list
 *
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
 * commit root.
 * The special case is for qgroup to search roots in commit_transaction().
 *
@@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
		path->skip_locking = 1;
	}

-	if (time_seq == SEQ_LAST)
+	if (time_seq == BTRFS_SEQ_LAST)
		path->skip_locking = 1;

	/*
@@ -1217,9 +1218,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (trans && likely(trans->type != __TRANS_DUMMY) &&
-	    time_seq != SEQ_LAST) {
+	    time_seq != BTRFS_SEQ_LAST) {
#else
-	if (trans && time_seq != SEQ_LAST) {
+	if (trans && time_seq != BTRFS_SEQ_LAST) {
#endif
		/*
		 * look if there are updates for this ref queued and lock the
@@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
	struct btrfs_trans_handle *trans;
	struct ulist_iterator uiter;
	struct ulist_node *node;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
+	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	int ret = 0;
	struct share_check shared = {
		.root_objectid = root->root_key.objectid,
@@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
	struct ulist *roots = NULL;
	struct ulist_node *ref_node = NULL;
	struct ulist_node *root_node = NULL;
-	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+	struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
	struct ulist_iterator ref_uiter;
	struct ulist_iterator root_uiter;

@@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
	}

	if (trans)
-		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_get_tree_mod_seq(fs_info, &seq_elem);
	else
		down_read(&fs_info->commit_root_sem);

	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   tree_mod_seq_elem.seq, &refs,
+				   seq_elem.seq, &refs,
				   &extent_item_pos, ignore_offset);
	if (ret)
		goto out;
@@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
	ULIST_ITER_INIT(&ref_uiter);
	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
		ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
-						tree_mod_seq_elem.seq, &roots,
+						seq_elem.seq, &roots,
						ignore_offset);
		if (ret)
			break;
@@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
	free_leaf_list(refs);
out:
	if (trans) {
-		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_put_tree_mod_seq(fs_info, &seq_elem);
		btrfs_end_transaction(trans);
	} else {
		up_read(&fs_info->commit_root_sem);
fs/btrfs/block-group.c (+184 −23)
@@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
-	if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
@@ -1462,12 +1462,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
-	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
-	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
	spin_unlock(&fs_info->unused_bgs_lock);
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
	struct btrfs_block_group *bg;
	struct btrfs_space_info *space_info;
	int ret;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return;

	mutex_lock(&fs_info->reclaim_bgs_lock);
	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		bg = list_first_entry(&fs_info->reclaim_bgs,
				      struct btrfs_block_group,
				      bg_list);
		list_del_init(&bg->bg_list);

		space_info = bg->space_info;
		spin_unlock(&fs_info->unused_bgs_lock);

		/* Don't race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		spin_lock(&bg->lock);
		if (bg->reserved || bg->pinned || bg->ro) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			spin_unlock(&bg->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&bg->lock);

		/* Get out fast, in case we're unmounting the filesystem */
		if (btrfs_fs_closing(fs_info)) {
			up_write(&space_info->groups_sem);
			goto next;
		}

		ret = inc_block_group_ro(bg, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0)
			goto next;

		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
				bg->start, div_u64(bg->used * 100, bg->length));
		trace_btrfs_reclaim_block_group(bg);
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret)
			btrfs_err(fs_info, "error relocating chunk %llu",
				  bg->start);

next:
		btrfs_put_block_group(bg);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
}

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->unused_bgs_lock);
	if (!list_empty(&fs_info->reclaim_bgs))
		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
	spin_unlock(&fs_info->unused_bgs_lock);
}

void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_reclaim_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			   struct btrfs_path *path)
{
@@ -2267,16 +2358,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
	struct btrfs_trans_handle *trans;
	u64 alloc_flags;
	int ret;
	bool dirty_bg_running;

-again:
+	do {
		trans = btrfs_join_transaction(fs_info->extent_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		dirty_bg_running = false;

		/*
-	 * we're not allowed to set block groups readonly after the dirty
-	 * block groups cache has started writing.  If it already started,
-	 * back off and let this transaction commit
+		 * We're not allowed to set block groups readonly after the dirty
+		 * block group cache has started writing.  If it already started,
+		 * back off and let this transaction commit.
		 */
		mutex_lock(&fs_info->ro_block_group_mutex);
		if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
@@ -2288,8 +2382,9 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			ret = btrfs_wait_for_commit(fs_info, transid);
			if (ret)
				return ret;
-		goto again;
+			dirty_bg_running = true;
		}
	} while (dirty_bg_running);

	if (do_chunk_alloc) {
		/*
@@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *info;
	u64 left;
@@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
	lockdep_assert_held(&fs_info->chunk_mutex);

	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
again:
	spin_lock(&info->lock);
	left = info->total_bytes - btrfs_space_info_used(info, true);
	spin_unlock(&info->lock);
@@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)

	if (left < thresh) {
		u64 flags = btrfs_system_alloc_profile(fs_info);
		u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);

		/*
		 * If there's not available space for the chunk tree (system
		 * space) and there are other tasks that reserved space for
		 * creating a new system block group, wait for them to complete
		 * the creation of their system block group and release excess
		 * reserved space. We do this because:
		 *
		 * *) We can end up allocating more system chunks than necessary
		 *    when there are multiple tasks that are concurrently
		 *    allocating block groups, which can lead to exhaustion of
		 *    the system array in the superblock;
		 *
		 * *) If we allocate extra and unnecessary system block groups,
		 *    despite being empty for a long time, and possibly forever,
		 *    they end up not being added to the list of unused block groups
		 *    because that typically happens only when deallocating the
		 *    last extent from a block group - which never happens since
		 *    we never allocate from them in the first place. The few
		 *    exceptions are when mounting a filesystem or running scrub,
		 *    which add unused block groups to the list of unused block
		 *    groups, to be deleted by the cleaner kthread.
		 *    And even when they are added to the list of unused block
		 *    groups, it can take a long time until they get deleted,
		 *    since the cleaner kthread might be sleeping or busy with
		 *    other work (deleting subvolumes, running delayed iputs,
		 *    defrag scheduling, etc);
		 *
		 * This is rare in practice, but can happen when too many tasks
		 * are allocating block groups in parallel (via fallocate())
		 * and before the one that reserved space for a new system block
		 * group finishes the block group creation and releases the space
		 * reserved in excess (at btrfs_create_pending_block_groups()),
		 * other tasks end up here and see free system space temporarily
		 * not enough for updating the chunk tree.
		 *
		 * We unlock the chunk mutex before waiting for such tasks and
		 * lock it again after the wait, otherwise we would deadlock.
		 * It is safe to do so because allocating a system chunk is the
		 * first thing done while allocating a new block group.
		 */
		if (reserved > trans->chunk_bytes_reserved) {
			const u64 min_needed = reserved - thresh;

			mutex_unlock(&fs_info->chunk_mutex);
			wait_event(cur_trans->chunk_reserve_wait,
			   atomic64_read(&cur_trans->chunk_bytes_reserved) <=
			   min_needed);
			mutex_lock(&fs_info->chunk_mutex);
			goto again;
		}

		/*
		 * Ignore failure to create system chunk. We might end up not
@@ -3315,10 +3464,12 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
		ret = btrfs_block_rsv_add(fs_info->chunk_root,
					  &fs_info->chunk_block_rsv,
					  thresh, BTRFS_RESERVE_NO_FLUSH);
-		if (!ret)
+		if (!ret) {
			atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
			trans->chunk_bytes_reserved += thresh;
		}
	}
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
@@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->reclaim_bgs)) {
		block_group = list_first_entry(&info->reclaim_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group,
fs/btrfs/block-group.h (+3 −0)
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size);
fs/btrfs/btrfs_inode.h (+20 −13)
@@ -220,6 +220,7 @@ struct btrfs_inode {
	/* Hook into fs_info->delayed_iputs */
	struct list_head delayed_iput;

	struct rw_semaphore i_mmap_lock;
	struct inode vfs_inode;
};

@@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
						  mod);
}

-static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+	spin_lock(&inode->lock);
+	inode->last_sub_trans = inode->root->log_transid;
+	spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
-	int ret = 0;
+	bool ret = false;

	spin_lock(&inode->lock);
	if (inode->logged_trans == generation &&
	    inode->last_sub_trans <= inode->last_log_commit &&
-	    inode->last_sub_trans <= inode->root->last_log_commit) {
-		/*
-		 * After a ranged fsync we might have left some extent maps
-		 * (that fall outside the fsync's range). So return false
-		 * here if the list isn't empty, to make sure btrfs_log_inode()
-		 * will be called and process those extent maps.
-		 */
-		smp_mb();
-		if (list_empty(&inode->extent_tree.modified_extents))
-			ret = 1;
-	}
+	    inode->last_sub_trans <= inode->root->last_log_commit)
+		ret = true;
	spin_unlock(&inode->lock);
	return ret;
}