Commit 11e3235b authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "Mostly core updates with a few user visible bits and fixes.

  Hilights:

   - fsync performance improvements
      - less contention of log mutex (throughput +4%, latency -14%,
        dbench with 32 clients)
      - skip unnecessary commits for link and rename (throughput +6%,
        latency -30%, rename latency -75%, dbench with 16 clients)
      - make fast fsync wait only for writeback (throughput +10..40%,
        runtime -1..-20%, dbench with 1 to 64 clients on various
        file/block sizes)

   - direct io is now implemented using the iomap infrastructure, that's
     the main part, we still have a workaround that requires an iomap
     API update, coming in 5.10

   - new sysfs exports:
      - information about the exclusive filesystem operation status
        (balance, device add/remove/replace, ...)
      - supported send stream version

  Core:

   - use ticket space reservations for data, fair policy using the same
     infrastructure as metadata

   - preparatory work to switch locking from our custom tree locks to
     standard rwsem, now the locking context is propagated to all
     callers, actual switch is expected to happen in the next dev cycle

   - seed device structures are now using list API

   - extent tracepoints print proper tree id

   - unified range checks for extent buffer helpers

   - send: avoid using temporary buffer for copying data

   - remove unnecessary RCU protection from space infos

   - remove unused readpage callback for metadata, enabling several
     cleanups

   - replace indirect function calls for end io hooks and remove
     extent_io_ops completely

  Fixes:

   - more lockdep warning fixes

   - fix qgroup reservation for delayed inode and an occasional
     reservation leak for preallocated files

   - fix device replace of a seed device

   - fix metadata reservation for fallocate that leads to transaction
     aborts

   - reschedule if necessary when logging directory items or when
     cloning lots of extents

   - tree-checker: fix false alert caused by legacy btrfs root item

   - send: fix rename/link conflicts for orphanized inodes

   - properly initialize device stats for seed devices

   - skip devices without magic signature when mounting

  Other:

   - error handling improvements, BUG_ONs replaced by proper handling,
     fuzz fixes

   - various function parameter cleanups

   - various W=1 cleanups

   - error/info messages improved

  Mishaps:

   - commit 62cf5391 ("btrfs: move btrfs_rm_dev_replace_free_srcdev
     outside of all locks") is a rebase leftover after the patch got
     merged to 5.9-rc8 as a466c85e ("btrfs: move
     btrfs_rm_dev_replace_free_srcdev outside of all locks"), the
     remaining part is trivial and the patch is in the middle of the
     series so I'm keeping it there instead of rebasing"

* tag 'for-5.10-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (161 commits)
  btrfs: rename BTRFS_INODE_ORDERED_DATA_CLOSE flag
  btrfs: annotate device name rcu_string with __rcu
  btrfs: skip devices without magic signature when mounting
  btrfs: cleanup cow block on error
  btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK
  fs: remove no longer used dio_end_io()
  btrfs: return error if we're unable to read device stats
  btrfs: init device stats for seed devices
  btrfs: remove struct extent_io_ops
  btrfs: call submit_bio_hook directly for metadata pages
  btrfs: stop calling submit_bio_hook for data inodes
  btrfs: don't opencode is_data_inode in end_bio_extent_readpage
  btrfs: call submit_bio_hook directly in submit_one_bio
  btrfs: remove extent_io_ops::readpage_end_io_hook
  btrfs: replace readpage_end_io_hook with direct calls
  btrfs: send, recompute reference path after orphanization of a directory
  btrfs: send, orphanize first all conflicting inodes when processing references
  btrfs: tree-checker: fix false alert caused by legacy btrfs root item
  btrfs: use unaligned helpers for stack and header set/get helpers
  btrfs: free-space-cache: use unaligned helpers to access data
  ...
parents c024a811 1fd4033d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ config BTRFS_FS
	select LZO_DECOMPRESS
	select ZSTD_COMPRESS
	select ZSTD_DECOMPRESS
	select FS_IOMAP
	select RAID6_PQ
	select XOR_BLOCKS
	select SRCU
+0 −1
Original line number Diff line number Diff line
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
	while (!list_empty(&pending_edge)) {
		struct btrfs_backref_node *upper;
		struct btrfs_backref_node *lower;
		struct rb_node *rb_node;

		edge = list_first_entry(&pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
+39 −27
Original line number Diff line number Diff line
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int index = btrfs_bg_flags_to_raid_index(cache->flags);
	bool first = false;

	down_write(&space_info->groups_sem);
	if (list_empty(&space_info->block_groups[index]))
		first = true;
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);

	if (first)
		btrfs_sysfs_add_block_group_type(cache);
}

static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1873,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
	return ret;
}

static int read_block_group_item(struct btrfs_block_group *cache,
static void read_block_group_item(struct btrfs_block_group *cache,
				 struct btrfs_path *path,
				 const struct btrfs_key *key)
{
@@ -1887,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
			   sizeof(bgi));
	cache->used = btrfs_stack_block_group_used(&bgi);
	cache->flags = btrfs_stack_block_group_flags(&bgi);

	return 0;
}

static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1907,9 +1899,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
	if (!cache)
		return -ENOMEM;

	ret = read_block_group_item(cache, path, key);
	if (ret < 0)
		goto error;
	read_block_group_item(cache, path, key);

	set_free_space_tree_thresholds(cache);

@@ -2035,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
		btrfs_release_path(path);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(space_info, &info->space_info, list) {
	list_for_each_entry(space_info, &info->space_info, list) {
		int i;

		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
			if (list_empty(&space_info->block_groups[i]))
				continue;
			cache = list_first_entry(&space_info->block_groups[i],
						 struct btrfs_block_group,
						 list);
			btrfs_sysfs_add_block_group_type(cache);
		}

		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2056,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
				list)
			inc_block_group_ro(cache, 1);
	}
	rcu_read_unlock();

	btrfs_init_global_block_rsv(info);
	ret = check_chunk_block_group_mappings(info);
@@ -2097,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		return;

	while (!list_empty(&trans->new_bgs)) {
		int index;

		block_group = list_first_entry(&trans->new_bgs,
					       struct btrfs_block_group,
					       bg_list);
		if (ret)
			goto next;

		index = btrfs_bg_flags_to_raid_index(block_group->flags);

		ret = insert_block_group_item(trans, block_group);
		if (ret)
			btrfs_abort_transaction(trans, ret);
@@ -2111,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		if (ret)
			btrfs_abort_transaction(trans, ret);
		add_block_group_free_space(trans, block_group);

		/*
		 * If we restriped during balance, we may have added a new raid
		 * type, so now add the sysfs entries when it is safe to do so.
		 * We don't have to worry about locking here as it's handled in
		 * btrfs_sysfs_add_block_group_type.
		 */
		if (block_group->space_info->block_group_kobjs[index] == NULL)
			btrfs_sysfs_add_block_group_type(block_group);

		/* Already aborted the transaction if it failed. */
next:
		btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2785,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * a very rare case so no need for a more efficient and
			 * very rare case so no need for a more efficient and
			 * complex approach.
			 */
			if (ret == -ENOENT) {
@@ -2961,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
						      space_info, -ram_bytes);
		if (delalloc)
			cache->delalloc_bytes += num_bytes;

		/*
		 * Compression can use less space than we reserved, so wake
		 * tickets if that happens
		 */
		if (num_bytes < ram_bytes)
			btrfs_try_granting_tickets(cache->fs_info, space_info);
	}
	spin_unlock(&cache->lock);
	spin_unlock(&space_info->lock);
@@ -2994,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
	if (delalloc)
		cache->delalloc_bytes -= num_bytes;
	spin_unlock(&cache->lock);

	btrfs_try_granting_tickets(cache->fs_info, space_info);
	spin_unlock(&space_info->lock);
}

@@ -3002,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
	list_for_each_entry(found, head, list) {
		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
			found->force_alloc = CHUNK_ALLOC_FORCE;
	}
	rcu_read_unlock();
}

static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3338,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
	}
	spin_unlock(&info->block_group_cache_lock);

	/*
	 * Now that all the block groups are freed, go through and free all the
	 * space_info structs.  This is only called during the final stages of
	 * unmount, and so we know nobody is using them.  We call
	 * synchronize_rcu() once before we start, just to be on the safe side.
	 */
	synchronize_rcu();

	btrfs_release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
+11 −19
Original line number Diff line number Diff line
@@ -21,14 +21,18 @@
 * new data the application may have written before commit.
 */
enum {
	BTRFS_INODE_ORDERED_DATA_CLOSE,
	BTRFS_INODE_FLUSH_ON_CLOSE,
	BTRFS_INODE_DUMMY,
	BTRFS_INODE_IN_DEFRAG,
	BTRFS_INODE_HAS_ASYNC_EXTENT,
	 /*
	  * Always set under the VFS' inode lock, otherwise it can cause races
	  * during fsync (we start as a fast fsync and then end up in a full
	  * fsync racing with ordered extent completion).
	  */
	BTRFS_INODE_NEEDS_FULL_SYNC,
	BTRFS_INODE_COPY_EVERYTHING,
	BTRFS_INODE_IN_DELALLOC_LIST,
	BTRFS_INODE_READDIO_NEED_LOCK,
	BTRFS_INODE_HAS_PROPS,
	BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
	struct inode vfs_inode;
};

static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
{
	return inode->root->fs_info->sectorsize;
}

static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
	return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
	u8 csums[];
};

/*
 * Disable DIO read nolock optimization, so new dio readers will be forced
 * to grab i_mutex. It is used to avoid the endless truncate due to
 * nonlocked dio read.
 */
static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
{
	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
	smp_mb();
}

static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
{
	smp_mb__before_atomic();
	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
}

/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT				"0x%*phN"
#define CSUM_FMT_VALUE(size, bytes)		size, bytes
+0 −35
Original line number Diff line number Diff line
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"

int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
		u64 start, struct page **pages, unsigned long *out_pages,
		unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, unsigned char *data_in,
		struct page *dest_page, unsigned long start_byte, size_t srclen,
		size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
struct list_head *zlib_get_workspace(unsigned int level);

int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
		u64 start, struct page **pages, unsigned long *out_pages,
		unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, unsigned char *data_in,
		struct page *dest_page, unsigned long start_byte, size_t srclen,
		size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);

int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
		u64 start, struct page **pages, unsigned long *out_pages,
		unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, unsigned char *data_in,
		struct page *dest_page, unsigned long start_byte, size_t srclen,
		size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
struct list_head *zstd_alloc_workspace(unsigned int level);
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_get_workspace(unsigned int level);
void zstd_put_workspace(struct list_head *ws);

static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char* btrfs_compress_type2str(enum btrfs_compression_type type)
Loading