Commit cc423f63 authored by Linus Torvalds
Pull btrfs updates from David Sterba:
 "Mainly core changes, refactoring and optimizations.

  Performance is improved in some areas; overall there may be a
  cumulative improvement due to refactoring that removed lookups in the
  IO path or simplified IO submission tracking.

  Core:

   - submit IO synchronously for fast checksums (crc32c and xxhash),
     remove high priority worker kthread

   - read extent buffer in one go, simplify IO tracking, bio submission
     and locking

   - remove additional tracking of redirtied extent buffers, originally
     added for zoned mode but actually not needed

   - track ordered extent pointer in bio to avoid rbtree lookups during
     IO

   - in scrub, use recovered data stripes as cache to avoid unnecessary
     reads

   - in zoned mode, optimize logical to physical mappings of extents

   - remove PageError handling, not set by VFS nor writeback

   - cleanups, refactoring, better structure packing

   - lots of error handling improvements

   - more assertions, lockdep annotations

   - print assertion failure with the exact line where it happens

   - tracepoint updates

   - more debugging prints

  Performance:

   - speedup in fsync(), better tracking of inode logged status can
     avoid transaction commit

   - IO path structures track logical offsets in data structures and
     do not need to look them up

  User visible changes:

   - don't commit transaction for every created subvolume, this can
     reduce time when many subvolumes are created in a batch

   - print affected files when relocation fails

   - trigger orphan file cleanup during START_SYNC ioctl

  Notable fixes:

   - fix crash when disabling quota and relocation

   - fix crashes when removing roots from dirty list

   - fix transaction abort during relocation when converting from newer
     profiles not covered by fallback

   - in zoned mode, stop reclaiming block groups if filesystem becomes
     read-only

   - fix rare race condition in tree mod log rewind that can miss some
     btree node slots

   - with enabled fsverity, drop up-to-date page bit in case the
     verification fails"

* tag 'for-6.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (194 commits)
  btrfs: fix race between quota disable and relocation
  btrfs: add comment to struct btrfs_fs_info::dirty_cowonly_roots
  btrfs: fix race when deleting free space root from the dirty cow roots list
  btrfs: fix race when deleting quota root from the dirty cow roots list
  btrfs: tracepoints: also show actual number of the outstanding extents
  btrfs: update i_version in update_dev_time
  btrfs: make btrfs_compressed_bioset static
  btrfs: add handling for RAID1C23/DUP to btrfs_reduce_alloc_profile
  btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers
  btrfs: scrub: remove scrub_ctx::csum_list member
  btrfs: do not BUG_ON after failure to migrate space during truncation
  btrfs: do not BUG_ON on failure to get dir index for new snapshot
  btrfs: send: do not BUG_ON() on unexpected symlink data extent
  btrfs: do not BUG_ON() when dropping inode items from log root
  btrfs: replace BUG_ON() at split_item() with proper error handling
  btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr()
  btrfs: do not BUG_ON() on tree mod log failures at insert_ptr()
  btrfs: do not BUG_ON() on tree mod log failure at insert_new_root()
  btrfs: do not BUG_ON() on tree mod log failures at push_nodes_for_insert()
  btrfs: abort transaction at update_ref_for_cow() when ref count is zero
  ...
parents e940efa9 8a4a0b2a
fs/btrfs/async-thread.c  +39 −5
@@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 	return atomic_read(&wq->pending) > wq->thresh * 2;
 }
 
+static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
+				 struct btrfs_fs_info *fs_info)
+{
+	wq->fs_info = fs_info;
+	atomic_set(&wq->pending, 0);
+	INIT_LIST_HEAD(&wq->ordered_list);
+	spin_lock_init(&wq->list_lock);
+	spin_lock_init(&wq->thres_lock);
+}
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      const char *name, unsigned int flags,
 					      int limit_active, int thresh)
@@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 	if (!ret)
 		return NULL;
 
-	ret->fs_info = fs_info;
+	btrfs_init_workqueue(ret, fs_info);
+
 	ret->limit_active = limit_active;
-	atomic_set(&ret->pending, 0);
 	if (thresh == 0)
 		thresh = DFT_THRESHOLD;
 	/* For low threshold, disabling threshold is a better choice */
@@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 		return NULL;
 	}
 
-	INIT_LIST_HEAD(&ret->ordered_list);
-	spin_lock_init(&ret->list_lock);
-	spin_lock_init(&ret->thres_lock);
 	trace_btrfs_workqueue_alloc(ret, name);
 	return ret;
 }
 
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags)
+{
+	struct btrfs_workqueue *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	btrfs_init_workqueue(ret, fs_info);
+
+	/* Ordered workqueues don't allow @max_active adjustments. */
+	ret->limit_active = 1;
+	ret->current_active = 1;
+	ret->thresh = NO_THRESHOLD;
+
+	ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
+	if (!ret->normal_wq) {
+		kfree(ret);
+		return NULL;
+	}
+
+	trace_btrfs_workqueue_alloc(ret, name);
+	return ret;
+}
fs/btrfs/async-thread.h  +3 −0
@@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      unsigned int flags,
 					      int limit_active,
 					      int thresh);
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
 		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
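
As a usage illustration (a hedged sketch, not code from this series): a caller that previously passed limit_active=1 to btrfs_alloc_workqueue() could switch to the new helper. The "fixup" queue name and the field it is stored in are assumptions for illustration only.

	/* Hypothetical caller; "fixup" and the target field are illustrative. */
	fs_info->fixup_workers = btrfs_alloc_ordered_workqueue(fs_info, "fixup", 0);
	if (!fs_info->fixup_workers)
		return -ENOMEM;

Ordered queues execute one item at a time, which is why the helper pins limit_active/current_active to 1 and disables thresholding.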
fs/btrfs/bio.c  +68 −54
@@ -27,6 +27,17 @@ struct btrfs_failed_bio {
 	atomic_t repair_count;
 };
 
+/* Is this a data path I/O that needs storage layer checksum and repair? */
+static inline bool is_data_bbio(struct btrfs_bio *bbio)
+{
+	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+}
+
+static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+{
+	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
+}
+
 /*
  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
  * is already initialized by the block layer.
@@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 	return bbio;
 }
 
-static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
-{
-	struct btrfs_ordered_extent *ordered;
-	int ret;
-
-	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
-	if (WARN_ON_ONCE(!ordered))
-		return BLK_STS_IOERR;
-	ret = btrfs_extract_ordered_extent(bbio, ordered);
-	btrfs_put_ordered_extent(ordered);
-
-	return errno_to_blk_status(ret);
-}
-
 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 					 struct btrfs_bio *orig_bbio,
 					 u64 map_length, bool use_append)
@@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
 	bbio->inode = orig_bbio->inode;
 	bbio->file_offset = orig_bbio->file_offset;
-	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
-		orig_bbio->file_offset += map_length;
-
+	orig_bbio->file_offset += map_length;
+	if (bbio_has_ordered_extent(bbio)) {
+		refcount_inc(&orig_bbio->ordered->refs);
+		bbio->ordered = orig_bbio->ordered;
+	}
 	atomic_inc(&orig_bbio->pending_ios);
 	return bbio;
 }
 
+/* Free a bio that was never submitted to the underlying device. */
+static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio))
+		btrfs_put_ordered_extent(bbio->ordered);
+	bio_put(&bbio->bio);
+}
+
+static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio)) {
+		struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+		bbio->end_io(bbio);
+		btrfs_put_ordered_extent(ordered);
+	} else {
+		bbio->end_io(bbio);
+	}
+}
+
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+	bbio->bio.bi_status = status;
+	__btrfs_bio_end_io(bbio);
+}
+
 static void btrfs_orig_write_end_io(struct bio *bio);
 
 static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
 
 		if (bbio->bio.bi_status)
 			btrfs_bbio_propagate_error(bbio, orig_bbio);
-		bio_put(&bbio->bio);
+		btrfs_cleanup_bio(bbio);
 		bbio = orig_bbio;
 	}
 
 	if (atomic_dec_and_test(&bbio->pending_ios))
-		bbio->end_io(bbio);
+		__btrfs_bio_end_io(bbio);
 }
 
 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
+	if (is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio)
 		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
 		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
 	} else {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
 			btrfs_record_physical_zoned(bbio);
 		btrfs_orig_bbio_end_io(bbio);
 	}
@@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
-	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
-	    !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
 static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
 			       struct btrfs_io_stripe *smap, int mirror_num)
 {
-	/* Do not leak our private flag into the block layer. */
-	bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
-
 	if (!bioc) {
 		/* Single mirror read/write fast path. */
 		btrfs_bio(bio)->mirror_num = mirror_num;
 		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+		if (bio_op(bio) != REQ_OP_READ)
+			btrfs_bio(bio)->orig_physical = smap->physical;
 		bio->bi_private = smap->dev;
 		bio->bi_end_io = btrfs_simple_end_io;
 		btrfs_submit_dev_bio(smap->dev, bio);
@@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work)
 
 static bool should_async_write(struct btrfs_bio *bbio)
 {
-	/*
-	 * If the I/O is not issued by fsync and friends, (->sync_writers != 0),
-	 * then try to defer the submission to a workqueue to parallelize the
-	 * checksum calculation.
-	 */
-	if (atomic_read(&bbio->inode->sync_writers))
+	/* Submit synchronously if the checksum implementation is fast. */
+	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
 		return false;
 
 	/*
-	 * Submit metadata writes synchronously if the checksum implementation
-	 * is fast, or we are on a zoned device that wants I/O to be submitted
-	 * in order.
+	 * Try to defer the submission to a workqueue to parallelize the
+	 * checksum calculation unless the I/O is issued synchronously.
 	 */
-	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->fs_info;
-
-		if (btrfs_is_zoned(fs_info))
-			return false;
-		if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
-			return false;
-	}
+	if (op_is_sync(bbio->bio.bi_opf))
+		return false;
+
+	/* Zoned devices require I/O to be submitted in order. */
+	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+		return false;
 
 	return true;
 }
@@ -622,9 +638,6 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 
 	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
 			run_one_async_free);
-	if (op_is_sync(bbio->bio.bi_opf))
-		btrfs_queue_work(fs_info->hipri_workers, &async->work);
-	else
-		btrfs_queue_work(fs_info->workers, &async->work);
+	btrfs_queue_work(fs_info->workers, &async->work);
 	return true;
 }
@@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
 	struct bio *bio = &bbio->bio;
-	u64 logical = bio->bi_iter.bi_sector << 9;
+	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
 	bool use_append = btrfs_use_zone_append(bbio);
@@ -645,7 +658,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	int error;
 
 	btrfs_bio_counter_inc_blocked(fs_info);
-	error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
 				&bioc, &smap, &mirror_num, 1);
 	if (error) {
 		ret = errno_to_blk_status(error);
@@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_bio_extract_ordered_extent(bbio);
-			if (ret)
-				goto fail_put_bio;
 		}
 
 		/*
@@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 			ret = btrfs_bio_csum(bbio);
 			if (ret)
 				goto fail_put_bio;
+		} else if (use_append) {
+			ret = btrfs_alloc_dummy_sum(bbio);
+			if (ret)
+				goto fail_put_bio;
 		}
 	}
 
@@ -704,7 +718,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 
 fail_put_bio:
 	if (map_length < length)
-		bio_put(bio);
+		btrfs_cleanup_bio(bbio);
 fail:
 	btrfs_bio_counter_dec(fs_info);
 	btrfs_bio_end_io(orig_bbio, ret);
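
The bio.c changes above keep one ownership rule: every split bbio that can reach the ordered extent holds its own reference, released either by btrfs_cleanup_bio() when the bio was never submitted, or after ->end_io() runs (the local `ordered` in __btrfs_bio_end_io() is needed because the end_io callback may free the bbio). A standalone sketch of that clone-takes-a-reference pattern, with generic names that are not btrfs APIs:

	#include <stdatomic.h>
	#include <stdlib.h>

	/* Stand-in for a refcounted btrfs_ordered_extent. */
	struct ordered {
		atomic_int refs;
	};

	/* A clone takes its own reference, as btrfs_split_bio() does. */
	static void clone_takes_ref(struct ordered *o)
	{
		atomic_fetch_add_explicit(&o->refs, 1, memory_order_relaxed);
	}

	/* Dropping the last reference frees the object. */
	static void ordered_put(struct ordered *o)
	{
		if (atomic_fetch_sub_explicit(&o->refs, 1, memory_order_acq_rel) == 1)
			free(o);
	}

	int main(void)
	{
		struct ordered *o = malloc(sizeof(*o));

		atomic_init(&o->refs, 1);	/* submitter's reference */
		clone_takes_ref(o);		/* split clone */
		ordered_put(o);			/* clone completes */
		ordered_put(o);			/* submitter completes, frees */
		return 0;
	}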
fs/btrfs/bio.h  +17 −12
@@ -39,8 +39,8 @@ struct btrfs_bio {
 
 	union {
 		/*
-		 * Data checksumming and original I/O information for internal
-		 * use in the btrfs_submit_bio machinery.
+		 * For data reads: checksumming and original I/O information.
+		 * (for internal use in the btrfs_submit_bio machinery only)
 		 */
 		struct {
 			u8 *csum;
@@ -48,7 +48,20 @@ struct btrfs_bio {
 			struct bvec_iter saved_iter;
 		};
 
-		/* For metadata parentness verification. */
+		/*
+		 * For data writes:
+		 * - ordered extent covering the bio
+		 * - pointer to the checksums for this bio
+		 * - original physical address from the allocator
+		 *   (for zone append only)
+		 */
+		struct {
+			struct btrfs_ordered_extent *ordered;
+			struct btrfs_ordered_sum *sums;
+			u64 orig_physical;
+		};
+
+		/* For metadata reads: parentness verification. */
 		struct btrfs_tree_parent_check parent_check;
 	};
 
@@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 				  struct btrfs_fs_info *fs_info,
 				  btrfs_bio_end_io_t end_io, void *private);
-
-static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
-	bbio->bio.bi_status = status;
-	bbio->end_io(bbio);
-}
-
-/* Bio only refers to one ordered extent. */
-#define REQ_BTRFS_ONE_ORDERED			REQ_DRV
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
 
 /* Submit using blkcg_punt_bio_submit. */
 #define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
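
A minimal allocation/completion sketch under the reworked interface (demo_end_io and the failure status are assumptions; only btrfs_bio_alloc() and btrfs_bio_end_io() come from the header above):

	/* Hypothetical end_io callback; real callers release the bio here. */
	static void demo_end_io(struct btrfs_bio *bbio)
	{
		bio_put(&bbio->bio);
	}

	/* ...then, in a submission path with fs_info in scope: */
	struct btrfs_bio *bbio = btrfs_bio_alloc(1, REQ_OP_WRITE, fs_info,
						 demo_end_io, NULL);

	/* Completing through the out-of-line helper also drops bbio->ordered
	 * for data writes, which the old inline version could not do. */
	btrfs_bio_end_io(bbio, BLK_STS_IOERR);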
fs/btrfs/block-group.c  +40 −7
@@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 	allowed &= flags;
 
-	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+	/* Select the highest-redundancy RAID level. */
+	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 		allowed = BTRFS_BLOCK_GROUP_RAID5;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 		allowed = BTRFS_BLOCK_GROUP_RAID10;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+		allowed = BTRFS_BLOCK_GROUP_DUP;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 		allowed = BTRFS_BLOCK_GROUP_RAID0;
 
@@ -1633,11 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
 
-	trace_btrfs_add_unused_block_group(bg);
 	spin_lock(&fs_info->unused_bgs_lock);
 	if (list_empty(&bg->bg_list)) {
 		btrfs_get_block_group(bg);
+		trace_btrfs_add_unused_block_group(bg);
 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+	} else {
+		/* Pull out the block group from the reclaim_bgs list. */
+		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
@@ -1791,8 +1801,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		}
 		spin_unlock(&bg->lock);
 
-		/* Get out fast, in case we're unmounting the filesystem */
-		if (btrfs_fs_closing(fs_info)) {
+		/*
+		 * Get out fast, in case we're read-only or unmounting the
+		 * filesystem. It is OK to drop block groups from the list even
+		 * for the read-only case. As we did sb_start_write(),
+		 * "mount -o remount,ro" won't happen and read-only filesystem
+		 * means it is forced read-only due to a fatal error. So, it
+		 * never gets back to read-write to let us reclaim again.
+		 */
+		if (btrfs_need_cleaner_sleep(fs_info)) {
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
@@ -1823,11 +1840,27 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		}
 
 next:
+		if (ret)
+			btrfs_mark_bg_to_reclaim(bg);
 		btrfs_put_block_group(bg);
+
+		mutex_unlock(&fs_info->reclaim_bgs_lock);
+		/*
+		 * Reclaiming all the block groups in the list can take really
+		 * long.  Prioritize cleaning up unused block groups.
+		 */
+		btrfs_delete_unused_bgs(fs_info);
+		/*
+		 * If we are interrupted by a balance, we can just bail out. The
+		 * cleaner thread restart again if necessary.
+		 */
+		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+			goto end;
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
+end:
 	btrfs_exclop_finish(fs_info);
 	sb_end_write(fs_info->sb);
 }
@@ -3521,9 +3554,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->lock);
 			spin_unlock(&space_info->lock);
 
-			set_extent_dirty(&trans->transaction->pinned_extents,
-					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
+			set_extent_bit(&trans->transaction->pinned_extents,
+				       bytenr, bytenr + num_bytes - 1,
+				       EXTENT_DIRTY, NULL);
 		}
 
 		spin_lock(&trans->transaction->dirty_bgs_lock);
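
The reclaim-loop rework above drops reclaim_bgs_lock around the unused-block-group cleanup and re-enters it with a trylock, so a concurrent balance never waits on the worker. A generic userspace sketch of that drop-then-trylock pattern (all names illustrative, not kernel code):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
	static int remaining = 3;

	static bool have_work(void) { return remaining > 0; }
	static void reclaim_one(void) { remaining--; }
	static void cleanup_unused(void) { /* e.g. delete unused groups */ }

	static void reclaim_worker(void)
	{
		pthread_mutex_lock(&reclaim_lock);
		while (have_work()) {
			reclaim_one();
			/* Drop the lock so competing work can run first. */
			pthread_mutex_unlock(&reclaim_lock);
			cleanup_unused();
			/* If a competitor (a "balance") holds the lock, bail
			 * out; the worker is simply restarted later. */
			if (pthread_mutex_trylock(&reclaim_lock) != 0)
				return;
		}
		pthread_mutex_unlock(&reclaim_lock);
	}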