Commit 7c0c7269 authored by Omar Sandoval's avatar Omar Sandoval Committed by David Sterba
Browse files

btrfs: add BTRFS_IOC_ENCODED_WRITE



The implementation resembles direct I/O: we have to flush any ordered
extents, invalidate the page cache, and do the io tree/delalloc/extent
map/ordered extent dance. From there, we can reuse the compression code
with a minor modification to distinguish the write from writeback. This
also creates inline extents when possible.

Signed-off-by: default avatarOmar Sandoval <osandov@fb.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 1881fba8
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -383,6 +383,7 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
			cb->start, cb->start + cb->len - 1,
			!cb->errors);

	if (cb->writeback)
		end_compressed_writeback(inode, cb);
	/* Note, our inode could be gone now */

@@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 struct page **compressed_pages,
				 unsigned int nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
				 struct cgroup_subsys_state *blkcg_css,
				 bool writeback)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
@@ -531,6 +533,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->writeback = writeback;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

+5 −1
Original line number Diff line number Diff line
@@ -54,6 +54,9 @@ struct compressed_bio {
	/* The compression algorithm for this bio */
	u8 compress_type;

	/* Whether this is a write for writeback. */
	bool writeback;

	/* IO errors */
	u8 errors;
	int mirror_num;
@@ -97,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				  struct page **compressed_pages,
				  unsigned int nr_pages,
				  unsigned int write_flags,
				  struct cgroup_subsys_state *blkcg_css);
				  struct cgroup_subsys_state *blkcg_css,
				  bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
				 int mirror_num, unsigned long bio_flags);

+4 −0
Original line number Diff line number Diff line
@@ -3308,6 +3308,8 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
					  u64 end, bool uptodate);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
			   struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			     const struct btrfs_ioctl_encoded_io_args *encoded);

extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
@@ -3371,6 +3373,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
			   struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end);
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
+53 −11
Original line number Diff line number Diff line
@@ -2037,12 +2037,43 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
	return err < 0 ? err : written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
				    struct iov_iter *from)
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t count;
	ssize_t ret;

	btrfs_inode_lock(inode, 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
		ret = -EFBIG;
	}
	if (ret || encoded->len == 0)
		goto out;

	ret = btrfs_write_check(iocb, from, encoded->len);
	if (ret < 0)
		goto out;

	ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
	btrfs_inode_unlock(inode, 0);
	return ret;
}

ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written = 0;
	ssize_t num_written, num_sync;
	const bool sync = iocb->ki_flags & IOCB_DSYNC;

	/*
@@ -2053,22 +2084,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
	if (BTRFS_FS_ERROR(inode->root->fs_info))
		return -EROFS;

	if (!(iocb->ki_flags & IOCB_DIRECT) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EOPNOTSUPP;

	if (sync)
		atomic_inc(&inode->sync_writers);

	if (iocb->ki_flags & IOCB_DIRECT)
		num_written = btrfs_direct_write(iocb, from);
	else
		num_written = btrfs_buffered_write(iocb, from);
	if (encoded) {
		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = num_sync = btrfs_direct_write(iocb, from);
	} else {
		num_written = num_sync = btrfs_buffered_write(iocb, from);
	}

	btrfs_set_inode_last_sub_trans(inode);

	if (num_written > 0)
		num_written = generic_write_sync(iocb, num_written);
	if (num_sync > 0) {
		num_sync = generic_write_sync(iocb, num_sync);
		if (num_sync < 0)
			num_written = num_sync;
	}

	if (sync)
		atomic_dec(&inode->sync_writers);
@@ -2077,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
	return num_written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;
+247 −2
Original line number Diff line number Diff line
@@ -1001,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
			    async_extent->pages,	/* compressed_pages */
			    async_extent->nr_pages,
			    async_chunk->write_flags,
			    async_chunk->blkcg_css)) {
			    async_chunk->blkcg_css, true)) {
		const u64 start = async_extent->start;
		const u64 end = start + async_extent->ram_size - 1;

@@ -3001,6 +3001,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
	 * except if the ordered extent was truncated.
	 */
	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);

	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
@@ -3035,7 +3036,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
		clear_bits |= EXTENT_DELALLOC_NEW;

	freespace_inode = btrfs_is_free_space_inode(inode);
@@ -10654,6 +10656,249 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
	return ret;
}

ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			       const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_changeset *data_reserved = NULL;
	struct extent_state *cached_state = NULL;
	int compression;
	size_t orig_count;
	u64 start, end;
	u64 num_bytes, ram_bytes, disk_num_bytes;
	unsigned long nr_pages, i;
	struct page **pages;
	struct btrfs_key ins;
	bool extent_reserved = false;
	struct extent_map *em;
	ssize_t ret;

	switch (encoded->compression) {
	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
		compression = BTRFS_COMPRESS_ZLIB;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
		compression = BTRFS_COMPRESS_ZSTD;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
		/* The sector size must match for LZO. */
		if (encoded->compression -
		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
		    fs_info->sectorsize_bits)
			return -EINVAL;
		compression = BTRFS_COMPRESS_LZO;
		break;
	default:
		return -EINVAL;
	}
	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
		return -EINVAL;

	orig_count = iov_iter_count(from);

	/* The extent size must be sane. */
	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
		return -EINVAL;

	/*
	 * The compressed data must be smaller than the decompressed data.
	 *
	 * It's of course possible for data to compress to larger or the same
	 * size, but the buffered I/O path falls back to no compression for such
	 * data, and we don't want to break any assumptions by creating these
	 * extents.
	 *
	 * Note that this is less strict than the current check we have that the
	 * compressed data must be at least one sector smaller than the
	 * decompressed data. We only want to enforce the weaker requirement
	 * from old kernels that it is at least one byte smaller.
	 */
	if (orig_count >= encoded->unencoded_len)
		return -EINVAL;

	/* The extent must start on a sector boundary. */
	start = iocb->ki_pos;
	if (!IS_ALIGNED(start, fs_info->sectorsize))
		return -EINVAL;

	/*
	 * The extent must end on a sector boundary. However, we allow a write
	 * which ends at or extends i_size to have an unaligned length; we round
	 * up the extent size and set i_size to the unaligned end.
	 */
	if (start + encoded->len < inode->vfs_inode.i_size &&
	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
		return -EINVAL;

	/* Finally, the offset in the unencoded data must be sector-aligned. */
	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
		return -EINVAL;

	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
	end = start + num_bytes - 1;

	/*
	 * If the extent cannot be inline, the compressed data on disk must be
	 * sector-aligned. For convenience, we extend it with zeroes if it
	 * isn't.
	 */
	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
	if (!pages)
		return -ENOMEM;
	for (i = 0; i < nr_pages; i++) {
		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
		char *kaddr;

		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
		if (!pages[i]) {
			ret = -ENOMEM;
			goto out_pages;
		}
		kaddr = kmap(pages[i]);
		if (copy_from_iter(kaddr, bytes, from) != bytes) {
			kunmap(pages[i]);
			ret = -EFAULT;
			goto out_pages;
		}
		if (bytes < PAGE_SIZE)
			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
		kunmap(pages[i]);
	}

	for (;;) {
		struct btrfs_ordered_extent *ordered;

		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
		if (ret)
			goto out_pages;
		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
						    start >> PAGE_SHIFT,
						    end >> PAGE_SHIFT);
		if (ret)
			goto out_pages;
		lock_extent_bits(io_tree, start, end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
		if (!ordered &&
		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
			break;
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		unlock_extent_cached(io_tree, start, end, &cached_state);
		cond_resched();
	}

	/*
	 * We don't use the higher-level delalloc space functions because our
	 * num_bytes and disk_num_bytes are different.
	 */
	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
	if (ret)
		goto out_unlock;
	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
	if (ret)
		goto out_free_data_space;
	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
	if (ret)
		goto out_qgroup_free_data;

	/* Try an inline extent first. */
	if (start == 0 && encoded->unencoded_len == encoded->len &&
	    encoded->unencoded_offset == 0) {
		ret = cow_file_range_inline(inode, encoded->len, orig_count,
					    compression, pages, true);
		if (ret <= 0) {
			if (ret == 0)
				ret = orig_count;
			goto out_delalloc_release;
		}
	}

	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
				   disk_num_bytes, 0, 0, &ins, 1, 1);
	if (ret)
		goto out_delalloc_release;
	extent_reserved = true;

	em = create_io_em(inode, start, num_bytes,
			  start - encoded->unencoded_offset, ins.objectid,
			  ins.offset, ins.offset, ram_bytes, compression,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserved;
	}
	free_extent_map(em);

	ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
				       ins.objectid, ins.offset,
				       encoded->unencoded_offset,
				       (1 << BTRFS_ORDERED_ENCODED) |
				       (1 << BTRFS_ORDERED_COMPRESSED),
				       compression);
	if (ret) {
		btrfs_drop_extent_cache(inode, start, end, 0);
		goto out_free_reserved;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	if (start + encoded->len > inode->vfs_inode.i_size)
		i_size_write(&inode->vfs_inode, start + encoded->len);

	unlock_extent_cached(io_tree, start, end, &cached_state);

	btrfs_delalloc_release_extents(inode, num_bytes);

	if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
					  ins.offset, pages, nr_pages, 0, NULL,
					  false)) {
		btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
		ret = -EIO;
		goto out_pages;
	}
	ret = orig_count;
	goto out;

out_free_reserved:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_delalloc_release:
	btrfs_delalloc_release_extents(inode, num_bytes);
	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
	if (ret < 0)
		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
out_free_data_space:
	/*
	 * If btrfs_reserve_extent() succeeded, then we already decremented
	 * bytes_may_use.
	 */
	if (!extent_reserved)
		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
	unlock_extent_cached(io_tree, start, end, &cached_state);
out_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kvfree(pages);
out:
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
	return ret;
}

#ifdef CONFIG_SWAP
/*
 * Add an entry indicating a block group or device which is pinned by a
Loading