Commit 81aa0968 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:
 "There are still regressions being found and fixed in the zoned mode
  and subpage code, the rest are fixes for bugs reported by users.

  Regressions:

   - subpage block support:
      - readahead works on the proper block size
      - fix last page zeroing

   - zoned mode:
      - linked list corruption for tree log

  Fixes:

   - qgroup leak after falloc failure

   - tree mod log and backref resolving:
      - extent buffer cloning race when resolving backrefs
      - pin deleted leaves with active tree mod log users

   - drop debugging flag from slab cache"

* tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: always pin deleted leaves when there are active tree mod log users
  btrfs: fix race when cloning extent buffer during rewind of an old root
  btrfs: fix slab cache flags for free space tree bitmap
  btrfs: subpage: make readahead work properly
  btrfs: subpage: fix wild pointer access during metadata read failure
  btrfs: zoned: fix linked list corruption after log root tree allocation failure
  btrfs: fix qgroup data rsv leak caused by falloc failure
  btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
  btrfs: fix wrong offset to zero out range beyond i_size
parents dc033799 485df755
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
+22 −1
Original line number Diff line number Diff line
@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,

	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
		struct btrfs_block_group *cache;
		bool must_pin = false;

		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
			ret = check_ref_cleanup(trans, buf->start);
@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			goto out;
		}

		if (btrfs_is_zoned(fs_info)) {
		/*
		 * If this is a leaf and there are tree mod log users, we may
		 * have recorded mod log operations that point to this leaf.
		 * So we must make sure no one reuses this leaf's extent before
		 * mod log operations are applied to a node, otherwise after
		 * rewinding a node using the mod log operations we get an
		 * inconsistent btree, as the leaf's extent may now be used as
		 * a node or leaf for another different btree.
		 * We are safe from races here because at this point no other
		 * node or root points to this extent buffer, so if after this
		 * check a new tree mod log user joins, it will not be able to
		 * find a node pointing to this leaf and record operations that
		 * point to this leaf.
		 */
		if (btrfs_header_level(buf) == 0) {
			read_lock(&fs_info->tree_mod_log_lock);
			must_pin = !list_empty(&fs_info->tree_mod_seq_list);
			read_unlock(&fs_info->tree_mod_log_lock);
		}

		if (must_pin || btrfs_is_zoned(fs_info)) {
			btrfs_redirty_list_add(trans->transaction, buf);
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
+31 −2
Original line number Diff line number Diff line
@@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
		btrfs_subpage_end_reader(fs_info, page, start, len);
}

/*
 * Find extent buffer for a givne bytenr.
 *
 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
 * in endio context.
 */
static struct extent_buffer *find_extent_buffer_readpage(
		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
	struct extent_buffer *eb;

	/*
	 * For regular sectorsize, we can use page->private to grab extent
	 * buffer
	 */
	if (fs_info->sectorsize == PAGE_SIZE) {
		ASSERT(PagePrivate(page) && page->private);
		return (struct extent_buffer *)page->private;
	}

	/* For subpage case, we need to lookup buffer radix tree */
	rcu_read_lock();
	eb = radix_tree_lookup(&fs_info->buffer_radix,
			       bytenr >> fs_info->sectorsize_bits);
	rcu_read_unlock();
	ASSERT(eb);
	return eb;
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
		} else {
			struct extent_buffer *eb;

			eb = (struct extent_buffer *)page->private;
			eb = find_extent_buffer_readpage(fs_info, page, start);
			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
			eb->read_mirror = mirror;
			atomic_dec(&eb->io_pages);
@@ -3020,7 +3049,7 @@ static void end_bio_extent_readpage(struct bio *bio)
			 */
			if (page->index == end_index && i_size <= end) {
				u32 zero_start = max(offset_in_page(i_size),
						     offset_in_page(end));
						     offset_in_page(start));

				zero_user_segment(page, zero_start,
						  offset_in_page(end) + 1);
+26 −11
Original line number Diff line number Diff line
@@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void)

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							SLAB_RED_ZONE, NULL);
							SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep)
		goto fail;

@@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
	struct btrfs_path *path;
	u64 start = ins->objectid;
	u64 len = ins->offset;
	int qgroup_released;
	int ret;

	memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
	/* Encryption and other encoding is reserved and all 0 */

	ret = btrfs_qgroup_release_data(inode, file_offset, len);
	if (ret < 0)
		return ERR_PTR(ret);
	qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
	if (qgroup_released < 0)
		return ERR_PTR(qgroup_released);

	if (trans) {
		ret = insert_reserved_file_extent(trans, inode,
						  file_offset, &stack_fi,
						  true, ret);
						  true, qgroup_released);
		if (ret)
			return ERR_PTR(ret);
			goto free_qgroup;
		return trans;
	}

@@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
	extent_info.file_offset = file_offset;
	extent_info.extent_buf = (char *)&stack_fi;
	extent_info.is_new_extent = true;
	extent_info.qgroup_reserved = ret;
	extent_info.qgroup_reserved = qgroup_released;
	extent_info.insertions = 0;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);
	if (!path) {
		ret = -ENOMEM;
		goto free_qgroup;
	}

	ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
				     file_offset + len - 1, &extent_info,
				     &trans);
	btrfs_free_path(path);
	if (ret)
		return ERR_PTR(ret);

		goto free_qgroup;
	return trans;

free_qgroup:
	/*
	 * We have released qgroup data range at the beginning of the function,
	 * and normally qgroup_released bytes will be freed when committing
	 * transaction.
	 * But if we error out early, we have to free what we have released
	 * or we leak qgroup data reservation.
	 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
			inode->root->root_key.objectid, qgroup_released,
			BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+18 −17
Original line number Diff line number Diff line
@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
	/* find extent */
	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree,
			       eb->start >> PAGE_SHIFT);
			       eb->start >> fs_info->sectorsize_bits);
	if (re)
		re->refcnt++;
	spin_unlock(&fs_info->reada_lock);
@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
	zone = NULL;
	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
				     logical >> PAGE_SHIFT, 1);
				     logical >> fs_info->sectorsize_bits, 1);
	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
		kref_get(&zone->refcnt);
		spin_unlock(&fs_info->reada_lock);
@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&dev->reada_zones,
				(unsigned long)(zone->end >> PAGE_SHIFT),
			(unsigned long)(zone->end >> fs_info->sectorsize_bits),
			zone);

	if (ret == -EEXIST) {
		kfree(zone);
		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
					     logical >> PAGE_SHIFT, 1);
					logical >> fs_info->sectorsize_bits, 1);
		if (ret == 1 && logical >= zone->start && logical <= zone->end)
			kref_get(&zone->refcnt);
		else
@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
	u64 length;
	int real_stripes;
	int nzones = 0;
	unsigned long index = logical >> PAGE_SHIFT;
	unsigned long index = logical >> fs_info->sectorsize_bits;
	int dev_replace_is_ongoing;
	int have_zone = 0;

@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
			     struct reada_extent *re)
{
	int i;
	unsigned long index = re->logical >> PAGE_SHIFT;
	unsigned long index = re->logical >> fs_info->sectorsize_bits;

	spin_lock(&fs_info->reada_lock);
	if (--re->refcnt) {
@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
static void reada_zone_release(struct kref *kref)
{
	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
	struct btrfs_fs_info *fs_info = zone->device->fs_info;

	lockdep_assert_held(&zone->device->fs_info->reada_lock);
	lockdep_assert_held(&fs_info->reada_lock);

	radix_tree_delete(&zone->device->reada_zones,
			  zone->end >> PAGE_SHIFT);
			  zone->end >> fs_info->sectorsize_bits);

	kfree(zone);
}
@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
	int i;
	unsigned long index = zone->end >> PAGE_SHIFT;
	unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;

	for (i = 0; i < zone->ndevs; ++i) {
		struct reada_zone *peer;
@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
					     (void **)&zone, index, 1);
		if (ret == 0)
			break;
		index = (zone->end >> PAGE_SHIFT) + 1;
		index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
		if (zone->locked) {
			if (zone->elems > top_locked_elems) {
				top_locked_elems = zone->elems;
@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
	 * plugging to speed things up
	 */
	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
				     dev->reada_next >> PAGE_SHIFT, 1);
				dev->reada_next >> fs_info->sectorsize_bits, 1);
	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
		ret = reada_pick_zone(dev);
		if (!ret) {
@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
		}
		re = NULL;
		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
					dev->reada_next >> PAGE_SHIFT, 1);
				dev->reada_next >> fs_info->sectorsize_bits, 1);
	}
	if (ret == 0) {
		spin_unlock(&fs_info->reada_lock);
@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
				pr_cont(" curr off %llu",
					device->reada_next - zone->start);
			pr_cont("\n");
			index = (zone->end >> PAGE_SHIFT) + 1;
			index = (zone->end >> fs_info->sectorsize_bits) + 1;
		}
		cnt = 0;
		index = 0;
@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
				}
			}
			pr_cont("\n");
			index = (re->logical >> PAGE_SHIFT) + 1;
			index = (re->logical >> fs_info->sectorsize_bits) + 1;
			if (++cnt > 15)
				break;
		}
@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
		if (ret == 0)
			break;
		if (!re->scheduled) {
			index = (re->logical >> PAGE_SHIFT) + 1;
			index = (re->logical >> fs_info->sectorsize_bits) + 1;
			continue;
		}
		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
			}
		}
		pr_cont("\n");
		index = (re->logical >> PAGE_SHIFT) + 1;
		index = (re->logical >> fs_info->sectorsize_bits) + 1;
	}
	spin_unlock(&fs_info->reada_lock);
}
Loading