Commit 51912904 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "This contains feature updates, performance improvements, preparatory
  and core work and some related VFS updates:

  Features:

   - encoded read/write ioctls, allows user space to read or write raw
     data directly to extents (now compressed, encrypted in the future),
     will be used by send/receive v2 where it saves processing time

   - zoned mode now works with metadata DUP (the mkfs.btrfs default)

   - error message header updates:
      - print error state: transaction abort, other error, log tree
        errors
      - print transient filesystem state: remount, device replace,
        ignored checksum verifications

   - tree-checker: verify the transaction id of the to-be-written dirty
     extent buffer

  Performance improvements for fsync:

   - directory logging speedups (up to -90% run time)

   - avoid logging all directory changes during renames (up to -60% run
     time)

   - avoid inode logging during rename and link when possible (up to
     -60% run time)

   - prepare extents to be logged before locking a log tree path
     (throughput +7%)

   - stop copying old file extents when doing a full fsync()

   - improved logging of old extents after truncate

  Core, fixes:

   - improved stale device identification by dev_t and not just path
     (for devices that are behind other layers like device mapper)

   - continued extent tree v2 preparatory work
      - disable features that won't work yet
      - add wrappers and abstractions for new tree roots

   - improved error handling

   - add super block write annotations around background block group
     reclaim

   - fix device scanning messages potentially accessing stale pointer

   - cleanups and refactoring

  VFS:

   - allow reflinks/deduplication from two different mounts of the same
     filesystem

   - export and add helpers for read/write range verification, for the
     encoded ioctls"

* tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (98 commits)
  btrfs: zoned: put block group after final usage
  btrfs: don't access possibly stale fs_info data in device_list_add
  btrfs: add lockdep_assert_held to need_preemptive_reclaim
  btrfs: verify the tranisd of the to-be-written dirty extent buffer
  btrfs: unify the error handling of btrfs_read_buffer()
  btrfs: unify the error handling pattern for read_tree_block()
  btrfs: factor out do_free_extent_accounting helper
  btrfs: remove last_ref from the extent freeing code
  btrfs: add a alloc_reserved_extent helper
  btrfs: remove BUG_ON(ret) in alloc_reserved_tree_block
  btrfs: add and use helper for unlinking inode during log replay
  btrfs: extend locking to all space_info members accesses
  btrfs: zoned: mark relocation as writing
  fs: allow cross-vfsmount reflink/dedupe
  btrfs: remove the cross file system checks from remap
  btrfs: pass btrfs_fs_info to btrfs_recover_relocation
  btrfs: pass btrfs_fs_info for deleting snapshots and cleaner
  btrfs: add filesystems state details to error messages
  btrfs: deal with unexpected extent type during reflinking
  btrfs: fix unexpected error path when reflinking an inline extent
  ...
parents 9b03992f d3e29967
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
		if (IS_ERR(eb)) {
			free_pref(ref);
			return PTR_ERR(eb);
		} else if (!extent_buffer_uptodate(eb)) {
		}
		if (!extent_buffer_uptodate(eb)) {
			free_pref(ref);
			free_extent_buffer(eb);
			return -EIO;
		}

		if (lock)
			btrfs_tree_read_lock(eb);
		if (btrfs_header_level(eb) == 0)
@@ -1335,7 +1337,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
				if (IS_ERR(eb)) {
					ret = PTR_ERR(eb);
					goto out;
				} else if (!extent_buffer_uptodate(eb)) {
				}
				if (!extent_buffer_uptodate(eb)) {
					free_extent_buffer(eb);
					ret = -EIO;
					goto out;
+33 −3
Original line number Diff line number Diff line
@@ -1522,8 +1522,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
	sb_start_write(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		sb_end_write(fs_info->sb);
		return;
	}

	/*
	 * Long running balances can keep us blocked here for eternity, so
@@ -1531,6 +1535,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		sb_end_write(fs_info->sb);
		return;
	}

@@ -1605,6 +1610,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
	sb_end_write(fs_info->sb);
}

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -2006,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
	cache->length = key->offset;
	cache->used = btrfs_stack_block_group_used(bgi);
	cache->flags = btrfs_stack_block_group_flags(bgi);
	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);

	set_free_space_tree_thresholds(cache);

@@ -2288,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
	spin_lock(&block_group->lock);
	btrfs_set_stack_block_group_used(&bgi, block_group->used);
	btrfs_set_stack_block_group_chunk_objectid(&bgi,
				BTRFS_FIRST_CHUNK_TREE_OBJECTID);
						   block_group->global_root_id);
	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2444,6 +2451,27 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
	btrfs_trans_release_chunk_metadata(trans);
}

/*
 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
 * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
 */
static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
{
	u64 div = SZ_1G;
	u64 index;

	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
		return BTRFS_FIRST_CHUNK_TREE_OBJECTID;

	/* If we have a smaller fs index based on 128MiB. */
	if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
		div = SZ_128M;

	offset = div64_u64(offset, div);
	div64_u64_rem(offset, fs_info->nr_global_roots, &index);
	return index;
}

struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 bytes_used, u64 type,
						 u64 chunk_offset, u64 size)
@@ -2464,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
	cache->flags = type;
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		cache->needs_free_space = 1;

@@ -2693,7 +2723,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
	btrfs_set_stack_block_group_used(&bgi, cache->used);
	btrfs_set_stack_block_group_chunk_objectid(&bgi,
			BTRFS_FIRST_CHUNK_TREE_OBJECTID);
						   cache->global_root_id);
	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
	btrfs_mark_buffer_dirty(leaf);
+1 −0
Original line number Diff line number Diff line
@@ -68,6 +68,7 @@ struct btrfs_block_group {
	u64 bytes_super;
	u64 flags;
	u64 cache_generation;
	u64 global_root_id;

	/*
	 * If the free space extent count exceeds this number, convert the block
+40 −2
Original line number Diff line number Diff line
@@ -13,6 +13,13 @@
#include "ordered-data.h"
#include "delayed-inode.h"

/*
 * Since we search a directory based on f_pos (struct dir_context::pos) we have
 * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
 * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
 */
#define BTRFS_DIR_START_INDEX 2

/*
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero.  When it is set
@@ -173,8 +180,9 @@ struct btrfs_inode {
	u64 disk_i_size;

	/*
	 * if this is a directory then index_cnt is the counter for the index
	 * number for new files that are created
	 * If this is a directory then index_cnt is the counter for the index
	 * number for new files that are created. For an empty directory, this
	 * must be initialized to BTRFS_DIR_START_INDEX.
	 */
	u64 index_cnt;

@@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
	spin_unlock(&inode->lock);
}

/*
 * Should be called while holding the inode's VFS lock in exclusive mode or in a
 * context where no one else can access the inode concurrently (during inode
 * creation or when loading an inode from disk).
 */
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
	/*
	 * The inode may have been part of a reflink operation in the last
	 * transaction that modified it, and then a fsync has reset the
	 * last_reflink_trans to avoid subsequent fsyncs in the same
	 * transaction to do unnecessary work. So update last_reflink_trans
	 * to the last_trans value (we have to be pessimistic and assume a
	 * reflink happened).
	 *
	 * The ->last_trans is protected by the inode's spinlock and we can
	 * have a concurrent ordered extent completion update it. Also set
	 * last_reflink_trans to ->last_trans only if the former is less than
	 * the later, because we can be called in a context where
	 * last_reflink_trans was set to the current transaction generation
	 * while ->last_trans was not yet updated in the current transaction,
	 * and therefore has a lower value.
	 */
	spin_lock(&inode->lock);
	if (inode->last_reflink_trans < inode->last_trans)
		inode->last_reflink_trans = inode->last_trans;
	spin_unlock(&inode->lock);
}

static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
	bool ret = false;
+37 −26
Original line number Diff line number Diff line
@@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
		bi_size += bvec->bv_len;

	if (bio->bi_status)
		cb->errors = 1;
		cb->status = bio->bi_status;

	ASSERT(bi_size && bi_size <= cb->compressed_len);
	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
@@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
	return last_io;
}

static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
static void finish_compressed_bio_read(struct compressed_bio *cb)
{
	unsigned int index;
	struct page *page;
@@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi
	}

	/* Do io completion on the original bio */
	if (cb->errors) {
		bio_io_error(cb->orig_bio);
	if (cb->status != BLK_STS_OK) {
		cb->orig_bio->bi_status = cb->status;
		bio_endio(cb->orig_bio);
	} else {
		struct bio_vec *bvec;
		struct bvec_iter_all iter_all;

		ASSERT(bio);
		ASSERT(!bio->bi_status);
		/*
		 * We have verified the checksum already, set page checked so
		 * the end_io handlers know about it
		 */
		ASSERT(!bio_flagged(bio, BIO_CLONED));
		ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
			u64 bvec_start = page_offset(bvec->bv_page) +
					 bvec->bv_offset;
@@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio)
	 * Some IO in this cb have failed, just skip checksum as there
	 * is no way it could be correct.
	 */
	if (cb->errors == 1)
	if (cb->status != BLK_STS_OK)
		goto csum_failed;

	inode = cb->inode;
@@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)

csum_failed:
	if (ret)
		cb->errors = 1;
	finish_compressed_bio_read(cb, bio);
		cb->status = errno_to_blk_status(ret);
	finish_compressed_bio_read(cb);
out:
	bio_put(bio);
}
@@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode,
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct page *pages[16];
	unsigned long nr_pages = end_index - index + 1;
	const int errno = blk_status_to_errno(cb->status);
	int i;
	int ret;

	if (cb->errors)
		mapping_set_error(inode->i_mapping, -EIO);
	if (errno)
		mapping_set_error(inode->i_mapping, errno);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
@@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
			continue;
		}
		for (i = 0; i < ret; i++) {
			if (cb->errors)
			if (errno)
				SetPageError(pages[i]);
			btrfs_page_clamp_clear_writeback(fs_info, pages[i],
							 cb->start, cb->len);
@@ -381,8 +381,9 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
	 */
	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
			cb->start, cb->start + cb->len - 1,
			!cb->errors);
			cb->status == BLK_STS_OK);

	if (cb->writeback)
		end_compressed_writeback(inode, cb);
	/* Note, our inode could be gone now */

@@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 struct page **compressed_pages,
				 unsigned int nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
				 struct cgroup_subsys_state *blkcg_css,
				 bool writeback)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
@@ -524,13 +526,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
	if (!cb)
		return BLK_STS_RESOURCE;
	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
	cb->errors = 0;
	cb->status = BLK_STS_OK;
	cb->inode = &inode->vfs_inode;
	cb->start = start;
	cb->len = len;
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->writeback = writeback;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

@@ -591,7 +594,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,

		if (submit) {
			if (!skip_sum) {
				ret = btrfs_csum_one_bio(inode, bio, start, 1);
				ret = btrfs_csum_one_bio(inode, bio, start, true);
				if (ret)
					goto finish_cb;
			}
@@ -808,7 +811,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	blk_status_t ret = BLK_STS_RESOURCE;
	blk_status_t ret;
	int faili = 0;
	u8 *sums;

@@ -821,17 +824,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em)
		return BLK_STS_IOERR;
	if (!em) {
		ret = BLK_STS_IOERR;
		goto out;
	}

	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
	compressed_len = em->block_len;
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
	if (!cb) {
		ret = BLK_STS_RESOURCE;
		goto out;
	}

	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
	cb->errors = 0;
	cb->status = BLK_STS_OK;
	cb->inode = inode;
	cb->mirror_num = mirror_num;
	sums = cb->sums;
@@ -851,8 +858,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
				       GFP_NOFS);
	if (!cb->compressed_pages)
	if (!cb->compressed_pages) {
		ret = BLK_STS_RESOURCE;
		goto fail1;
	}

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
@@ -938,7 +947,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
			comp_bio = NULL;
		}
	}
	return 0;
	return BLK_STS_OK;

fail2:
	while (faili >= 0) {
@@ -951,6 +960,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	kfree(cb);
out:
	free_extent_map(em);
	bio->bi_status = ret;
	bio_endio(bio);
	return ret;
finish_cb:
	if (comp_bio) {
@@ -970,7 +981,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	 */
	ASSERT(refcount_read(&cb->pending_sectors));
	/* Now we are the only one referring @cb, can finish it safely. */
	finish_compressed_bio_read(cb, NULL);
	finish_compressed_bio_read(cb);
	return ret;
}

Loading