fs/btrfs/extent-tree.c  +17 −2

@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	/*
-	 * We don't need the lock here since we are protected by the transaction
-	 * commit. We want to do the cache_save_setup first and then run the
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
 	 * delayed refs to make sure we have the best chance at doing this all
 	 * in one shot.
 	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
 	while (!list_empty(&cur_trans->dirty_bgs)) {
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * finish and then do it all again
 		 */
 		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
 			btrfs_wait_cache_io(root, trans, cache,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
 
 		/*
@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * on any pending IO
 		 */
 		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
 		should_put = 1;
 
 		cache_save_setup(cache, trans, path);
@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
 	while (!list_empty(io)) {
 		cache = list_first_entry(io, struct btrfs_block_group_cache,
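Note: the change above makes the loop safe against concurrent list insertions by holding dirty_bgs_lock only around the list manipulation, dropping it across every call that can block or re-enter the allocator, and retaking it before the next iteration. As a rough illustration of that lock-drop-relock drain pattern, here is a minimal userspace sketch (pthreads and a hand-rolled list, not the kernel's types or API):

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int payload;
};

static struct node *dirty_list;       /* list other threads may append to */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void process(struct node *n)   /* may block; must run unlocked */
{
	(void)n;
}

/* Drain the list while concurrent producers keep adding entries. */
static void drain_dirty_list(void)
{
	pthread_mutex_lock(&list_lock);
	while (dirty_list) {
		struct node *n = dirty_list;

		dirty_list = n->next;              /* detach under the lock */
		pthread_mutex_unlock(&list_lock);  /* never block while locked */
		process(n);
		free(n);
		pthread_mutex_lock(&list_lock);    /* re-check list state */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->payload = i;
		n->next = dirty_list;
		dirty_list = n;
	}
	drain_dirty_list();
	return 0;
}

The empty-check at the top of each iteration is what keeps this correct: because the lock was dropped, the list head must be re-read after relocking rather than cached across the blocking call.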
fs/btrfs/inode.c  +78 −49

@@ -66,6 +66,13 @@ struct btrfs_iget_args {
 	struct btrfs_root *root;
 };
 
+struct btrfs_dio_data {
+	u64 outstanding_extents;
+	u64 reserve;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+};
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_dir_ro_inode_operations;

@@ -7408,24 +7415,20 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 		} else {
-			/* Screw you mmap */
-			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-			if (ret)
-				break;
-			ret = filemap_fdatawait_range(inode->i_mapping,
-						      lockstart,
-						      lockend);
-			if (ret)
-				break;
-
 			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
+			 * We could trigger writeback for this range (and wait
+			 * for it to complete) and then invalidate the pages for
+			 * this range (through invalidate_inode_pages2_range()),
+			 * but that can lead us to a deadlock with a concurrent
+			 * call to readpages() (a buffered read or a defrag call
+			 * triggered a readahead) on a page lock due to an
+			 * ordered dio extent we created before but did not have
+			 * yet a corresponding bio submitted (whence it can not
+			 * complete), which makes readpages() wait for that
+			 * ordered extent to complete while holding a lock on
+			 * that page.
 			 */
 			ret = invalidate_inode_pages2_range(inode->i_mapping,
 					lockstart >> PAGE_CACHE_SHIFT,
 					lockend >> PAGE_CACHE_SHIFT);
 			if (ret)
 				ret = -ENOTBLK;
 			break;
 		}

@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	return em;
 }
 
-struct btrfs_dio_data {
-	u64 outstanding_extents;
-	u64 reserve;
-};
-
 static void adjust_dio_outstanding_extents(struct inode *inode,
 					   struct btrfs_dio_data *dio_data,
 					   const u64 len)

@@ -7670,6 +7668,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
+		dio_data->unsubmitted_oe_range_end = start + len;
 		current->journal_info = dio_data;
 	}

@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+						    const u64 offset,
+						    const u64 bytes,
+						    const int uptodate)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered = NULL;
-	u64 ordered_offset = dip->logical_offset;
-	u64 ordered_bytes = dip->bytes;
-	struct bio *dio_bio;
+	u64 ordered_offset = offset;
+	u64 ordered_bytes = bytes;
 	int ret;
 
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
 						   ordered_bytes,
-						   !bio->bi_error);
+						   uptodate);
 	if (!ret)
 		goto out_test;

@@ -8020,13 +8019,22 @@ static void btrfs_endio_direct_write(struct bio *bio)
 	 * our bio might span multiple ordered extents. If we haven't
 	 * completed the accounting for the whole dio, go back and try again
 	 */
-	if (ordered_offset < dip->logical_offset + dip->bytes) {
-		ordered_bytes = dip->logical_offset + dip->bytes -
-			ordered_offset;
+	if (ordered_offset < offset + bytes) {
+		ordered_bytes = offset + bytes - ordered_offset;
 		ordered = NULL;
 		goto again;
 	}
-
-	dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct bio *dio_bio = dip->dio_bio;
+
+	btrfs_endio_direct_write_update_ordered(dip->inode,
+						dip->logical_offset,
+						dip->bytes,
+						!bio->bi_error);
 
 	kfree(dip);

@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
+	/*
+	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
+	 * even if we fail to submit a bio, because in such case we do the
+	 * corresponding error handling below and it must not be done a second
+	 * time by btrfs_direct_IO().
+	 */
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
+
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;

@@ -8362,24 +8385,15 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip = NULL;
 		io_bio = NULL;
 	} else {
-		if (write) {
-			struct btrfs_ordered_extent *ordered;
-
-			ordered = btrfs_lookup_ordered_extent(inode,
-							      file_offset);
-			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-			/*
-			 * Decrements our ref on the ordered extent and removes
-			 * the ordered extent from the inode's ordered tree,
-			 * doing all the proper resource cleanup such as for the
-			 * reserved space and waking up any waiters for this
-			 * ordered extent (through btrfs_remove_ordered_extent).
-			 */
-			btrfs_finish_ordered_io(ordered);
-		} else {
+		if (write)
+			btrfs_endio_direct_write_update_ordered(inode,
+						file_offset,
+						dio_bio->bi_iter.bi_size,
+						0);
+		else
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 				      file_offset + dio_bio->bi_iter.bi_size - 1);
-		}
+
 		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()

@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * originally calculated. Abuse current->journal_info for this.
 		 */
 		dio_data.reserve = round_up(count, root->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 			    &BTRFS_I(inode)->runtime_flags)) {

@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, offset,
 							     dio_data.reserve);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * cleanup them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				btrfs_endio_direct_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+					dio_data.unsubmitted_oe_range_start,
+					0);
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, offset,
 						     count - (size_t)ret);
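Note: the leak fix is bookkeeping over a half-open byte range. unsubmitted_oe_range_end advances whenever an ordered extent is created, unsubmitted_oe_range_start catches up once the bios are handed off (or once btrfs_submit_direct() has already done its own error handling), and btrfs_direct_IO() fails whatever is left in between. A minimal standalone sketch of that invariant (illustrative names, not the btrfs API):

#include <stdint.h>
#include <stdio.h>

/* Tracks ordered extents created but with no bios submitted yet. */
struct dio_range {
	uint64_t unsubmitted_start;
	uint64_t unsubmitted_end;
};

/* Called when an ordered extent for [start, start + len) is created. */
static void on_ordered_extent_created(struct dio_range *r,
				      uint64_t start, uint64_t len)
{
	r->unsubmitted_end = start + len;
}

/* Called once bios covering the whole dio were submitted (or their
 * error handling was already performed): nothing left to clean up. */
static void on_bios_submitted(struct dio_range *r)
{
	r->unsubmitted_start = r->unsubmitted_end;
}

/* Error path: complete any leftover range with an error status so no
 * other task blocks forever waiting on those ordered extents. */
static void cleanup_unsubmitted(const struct dio_range *r)
{
	if (r->unsubmitted_start < r->unsubmitted_end)
		printf("failing ordered extents in [%llu, %llu)\n",
		       (unsigned long long)r->unsubmitted_start,
		       (unsigned long long)r->unsubmitted_end);
}

int main(void)
{
	struct dio_range r = { .unsubmitted_start = 0, .unsubmitted_end = 0 };

	on_ordered_extent_created(&r, 0, 4096);	/* extent created */
	/* bio submission fails here: on_bios_submitted() never runs */
	cleanup_unsubmitted(&r);		/* range [0, 4096) is failed */
	return 0;
}

Initializing both ends to the same offset (as btrfs_direct_IO() does) makes "nothing pending" and "everything handled" the same state, so the cleanup test stays a single comparison.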
fs/btrfs/transaction.c  +17 −0

@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			list_del_init(&em->list);
 			free_extent_map(em);
 		}
+		/*
+		 * If any block groups are found in ->deleted_bgs then it's
+		 * because the transaction was aborted and a commit did not
+		 * happen (things failed before writing the new superblock
+		 * and calling btrfs_finish_extent_commit()), so we can not
+		 * discard the physical locations of the block groups.
+		 */
+		while (!list_empty(&transaction->deleted_bgs)) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = list_first_entry(&transaction->deleted_bgs,
+						 struct btrfs_block_group_cache,
+						 bg_list);
+			list_del_init(&cache->bg_list);
+			btrfs_put_block_group_trimming(cache);
+			btrfs_put_block_group(cache);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
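Note: this is the usual destructor-side drain: any entry still queued on ->deleted_bgs means the commit never consumed it, so btrfs_put_transaction() must drop the references the list holds or the objects leak. A minimal sketch of the pattern with a toy refcount (hypothetical names, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* A minimal refcounted object queued on a singly linked list. */
struct bg {
	struct bg *next;
	int refcount;
};

static void put_bg(struct bg *b)
{
	if (--b->refcount == 0) {
		printf("freeing block group %p\n", (void *)b);
		free(b);
	}
}

/*
 * Destructor-side drain: any element still queued means the normal
 * completion path (the commit) never ran, so the reference the list
 * holds must be dropped here or the object leaks.
 */
static void drain_deleted_bgs(struct bg **head)
{
	while (*head) {
		struct bg *b = *head;

		*head = b->next;	/* unlink the first entry */
		b->next = NULL;
		put_bg(b);		/* drop the list's reference */
	}
}

int main(void)
{
	struct bg *head = NULL;

	for (int i = 0; i < 2; i++) {
		struct bg *b = malloc(sizeof(*b));

		b->refcount = 1;	/* the list's reference */
		b->next = head;
		head = b;
	}
	drain_deleted_bgs(&head);	/* nothing leaks on the abort path */
	return 0;
}

The kernel version drops two references per entry (trimming plus the list's own); the sketch collapses that to one for brevity.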
fs/btrfs/tree-defrag.c  +24 −3

@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	btrfs_release_path(path);
+	/*
+	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
+	 * a deadlock (attempting to write lock an already write locked leaf).
+	 */
+	path->lowest_level = 1;
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = 0;
 		goto out;
 	}
-	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
-					   min_trans);
+	/*
+	 * The node at level 1 must always be locked when our path has
+	 * keep_locks set and lowest_level is 1, regardless of the value of
+	 * path->slots[1].
+	 */
+	BUG_ON(path->locks[1] == 0);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 &last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		WARN_ON(ret == -EAGAIN);
 		goto out;
 	}
+	/*
+	 * Now that we reallocated the node we can find the next key. Note that
+	 * btrfs_find_next_key() can release our path and do another search
+	 * without COWing, this is because even with path->keep_locks = 1,
+	 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
+	 * node when path->slots[node_level - 1] does not point to the last
+	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+	 * we search for the next key after reallocating our node.
+	 */
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;
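Note: the lowest_level = 1 trick exists because btrfs tree locks are not recursive: if the search left the leaf write-locked, btrfs_realloc_node() would try to write lock that same leaf again and self-deadlock. A minimal userspace sketch of that deadlock class (pthread rwlocks standing in for the kernel's tree locks; names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t leaf_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Models btrfs_realloc_node(): takes the write lock on each leaf. */
static void realloc_node(void)
{
	/* Hangs (or fails with EDEADLK) if the caller already holds it,
	 * because the write lock is not recursive. */
	pthread_rwlock_wrlock(&leaf_lock);
	/* ... rewrite the leaf ... */
	pthread_rwlock_unlock(&leaf_lock);
}

int main(void)
{
	/*
	 * Wrong: searching down to the leaf (level 0) and keeping its
	 * write lock before calling realloc_node() would self-deadlock.
	 * The fix mirrors path->lowest_level = 1: stop the search one
	 * level up, so the callee is free to lock the leaves itself.
	 */
	realloc_node();		/* caller holds no leaf lock: safe */
	printf("done\n");
	return 0;
}

The same reasoning explains why the next-key search moved after btrfs_realloc_node(): the earlier search could drop the level 1 lock, and the key must be sampled from the node as it exists after reallocation.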
fs/btrfs/volumes.c  +15 −2

@@ -4825,19 +4825,31 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase of the device replace operation
+	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 */
+	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out;
+			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)
-			goto out;
+			break;
+	}
+	if (ret) {
+		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		goto out;
 	}

@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
+	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
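Note: the mutex gives the stripe loop a consistent snapshot. Device replace can swap the device objects behind map->stripes at any moment, so every stripe must be read under one device_list_mutex critical section (which is also why the error paths switch from goto to break: the lock must be released before leaving). A standalone sketch of the race being closed (pthreads, illustrative names):

#include <pthread.h>
#include <stdio.h>

struct device {
	unsigned long long devid;
};

static struct device dev_a = { .devid = 1 };
static struct device dev_b = { .devid = 100 };

/* Stripe -> device mapping that a "replace" operation may rewrite. */
static struct device *stripes[2] = { &dev_a, &dev_a };
static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Models btrfs_dev_replace_finishing(): swaps the device objects. */
static void *replace_device(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&device_list_mutex);
	stripes[0] = &dev_b;
	stripes[1] = &dev_b;
	pthread_mutex_unlock(&device_list_mutex);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, replace_device, NULL);

	/*
	 * Hold the mutex across the whole walk so every stripe is read
	 * against the same device objects; without it, stripe 0 could be
	 * recorded with the old devid and stripe 1 with the new one.
	 */
	pthread_mutex_lock(&device_list_mutex);
	for (int i = 0; i < 2; i++)
		printf("stripe %d -> devid %llu\n", i, stripes[i]->devid);
	pthread_mutex_unlock(&device_list_mutex);

	pthread_join(t, NULL);
	return 0;
}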