Commit 61dc131c authored by Gao Xiang

Merge tag 'iomap-5.15-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git



Pull 'iomap-5.15-merge-2' to support EROFS iomap tail-packing inline:
 - Simplify the bio_end_page usage in the buffered IO code.
 - Support reading inline data at nonzero offsets for erofs (see the sketch after this list).
 - Fix some typos and bad grammar.
 - Convert kmap_atomic usage in the inline data read path.
 - Add some extra inline data input checking.
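For the tail-packing item above, the key change is that an IOMAP_INLINE mapping may now start at a nonzero file offset: only the tail of a file is inline, and the blocks before it map normally. A minimal sketch of what a filesystem's ->iomap_begin could return for such a file; the function name, the 4096/5000-byte geometry, and the use of inode->i_private to find the tail bytes are made-up illustrations, not EROFS's actual implementation:

/* Hypothetical: a 5000-byte file on 4096-byte blocks whose last 904
 * bytes are stored inline next to the inode. */
static int example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
	loff_t tail_start = 4096;		/* file offset of the tail */

	iomap->bdev = inode->i_sb->s_bdev;
	if (pos >= tail_start) {		/* the inline tail extent */
		iomap->type = IOMAP_INLINE;
		iomap->offset = tail_start;	/* nonzero offset: new here */
		iomap->length = i_size_read(inode) - tail_start;
		iomap->inline_data = inode->i_private;	/* fs-private tail buffer */
	} else {				/* leading blocks map normally */
		iomap->type = IOMAP_MAPPED;
		iomap->offset = pos;
		iomap->length = tail_start - pos;
		iomap->addr = 0;		/* byte address of block 0 on disk */
	}
	return 0;
}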

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
parents 06252e9c ae44f9c2
fs/iomap/buffered-io.c  +86 −83
@@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page)
{
	/*
	 * per-block data is stored in the head page.  Callers should
-	 * not be dealing with tail pages (and if they are, they can
+	 * not be dealing with tail pages, and if they are, they can
	 * call thp_head() first.
	 */
	VM_BUG_ON_PGFLAGS(PageTail(page), page);
@@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
	unsigned last = (poff + plen - 1) >> block_bits;

	/*
-	 * If the block size is smaller than the page size we need to check the
+	 * If the block size is smaller than the page size, we need to check the
	 * per-block uptodate status and adjust the offset and length if needed
	 * to avoid reading in already uptodate ranges.
	 */
@@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
	}

	/*
-	 * If the extent spans the block that contains the i_size we need to
+	 * If the extent spans the block that contains the i_size, we need to
	 * handle both halves separately so that we properly zero data in the
	 * page cache for blocks that are entirely outside of i_size.
	 */
@@ -205,25 +205,32 @@ struct iomap_readpage_ctx {
	struct readahead_control *rac;
};

-static void
-iomap_read_inline_data(struct inode *inode, struct page *page,
+static int iomap_read_inline_data(struct inode *inode, struct page *page,
		struct iomap *iomap)
{
-	size_t size = i_size_read(inode);
+	size_t size = i_size_read(inode) - iomap->offset;
+	size_t poff = offset_in_page(iomap->offset);
	void *addr;

	if (PageUptodate(page))
-		return;
+		return PAGE_SIZE - poff;

-	BUG_ON(page_has_private(page));
-	BUG_ON(page->index);
-	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
+		return -EIO;
+	if (WARN_ON_ONCE(size > PAGE_SIZE -
+			 offset_in_page(iomap->inline_data)))
+		return -EIO;
+	if (WARN_ON_ONCE(size > iomap->length))
+		return -EIO;
+	if (poff > 0)
+		iomap_page_create(inode, page);

-	addr = kmap_atomic(page);
+	addr = kmap_local_page(page) + poff;
	memcpy(addr, iomap->inline_data, size);
-	memset(addr + size, 0, PAGE_SIZE - size);
-	kunmap_atomic(addr);
-	SetPageUptodate(page);
+	memset(addr + size, 0, PAGE_SIZE - poff - size);
+	kunmap_local(addr);
+	iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
+	return PAGE_SIZE - poff;
}

static inline bool iomap_block_needs_zeroing(struct inode *inode,
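To make the new offset arithmetic concrete, here is a worked example with made-up numbers (a userspace sketch assuming 16384-byte pages and a tail-packed extent at file offset 12288, i.e. the block-size-smaller-than-page-size case that the added iomap_page_create() call exists for):

#include <assert.h>
#include <stddef.h>

int main(void)
{
	size_t page_size = 16384;		/* hypothetical 16K pages */
	size_t i_size = 13000, map_offset = 12288;

	size_t size = i_size - map_offset;	/* 712 tail bytes to copy */
	size_t poff = map_offset % page_size;	/* 12288: starts mid-page */

	assert(size <= page_size - poff);	/* first WARN_ON_ONCE check */
	assert(page_size - poff - size == 3384);	/* bytes zeroed after tail */
	assert(page_size - poff == 4096);	/* return value: caller advances 4K */
	return 0;
}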
@@ -241,16 +248,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
	struct iomap_readpage_ctx *ctx = data;
	struct page *page = ctx->cur_page;
	struct iomap_page *iop;
-	bool same_page = false, is_contig = false;
	loff_t orig_pos = pos;
	unsigned poff, plen;
	sector_t sector;

-	if (iomap->type == IOMAP_INLINE) {
-		WARN_ON_ONCE(pos);
-		iomap_read_inline_data(inode, page, iomap);
-		return PAGE_SIZE;
-	}
+	if (iomap->type == IOMAP_INLINE)
+		return iomap_read_inline_data(inode, page, iomap);

	/* zero post-eof blocks as the page may be mapped */
	iop = iomap_page_create(inode, page);
@@ -268,16 +271,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
	if (iop)
		atomic_add(plen, &iop->read_bytes_pending);

-	/* Try to merge into a previous segment if we can */
	sector = iomap_sector(iomap, pos);
-	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff,
-				&same_page))
-			goto done;
-		is_contig = true;
-	}
-
-	if (!is_contig || bio_full(ctx->bio, plen)) {
+	if (!ctx->bio ||
+	    bio_end_sector(ctx->bio) != sector ||
+	    bio_add_page(ctx->bio, page, plen, poff) != plen) {
		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
		gfp_t orig_gfp = gfp;
		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
@@ -301,13 +298,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		ctx->bio->bi_iter.bi_sector = sector;
		bio_set_dev(ctx->bio, iomap->bdev);
		ctx->bio->bi_end_io = iomap_read_end_io;
+		__bio_add_page(ctx->bio, page, plen, poff);
	}

-	bio_add_page(ctx->bio, page, plen, poff);
-done:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
-	 * For that we have to include any leading non-uptodate ranges, but
+	 * For that, we have to include any leading non-uptodate ranges, but
	 * we can skip trailing ones as they will be handled in the next
	 * iteration.
	 */
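The rewritten read path above folds __bio_try_merge_page()/bio_full() into a single bio_add_page() attempt: if the cached bio is absent, not physically contiguous, or already full, a new bio is allocated and the page is added with __bio_add_page(), which cannot fail on a fresh bio. The pattern in isolation (a sketch against the 5.14-era bio API; the name sketch_add_to_read_bio is invented, and the readahead plumbing, gfp constraints, and bio_set_dev() call of the real code are omitted):

static void sketch_add_to_read_bio(struct iomap_readpage_ctx *ctx,
		struct page *page, unsigned int len, unsigned int poff,
		sector_t sector)
{
	if (ctx->bio &&
	    bio_end_sector(ctx->bio) == sector &&	/* contiguous? */
	    bio_add_page(ctx->bio, page, len, poff) == len)
		return;					/* appended: done */

	if (ctx->bio)
		submit_bio(ctx->bio);			/* flush the old bio */
	ctx->bio = bio_alloc(GFP_KERNEL, BIO_MAX_VECS);
	ctx->bio->bi_iter.bi_sector = sector;
	ctx->bio->bi_end_io = iomap_read_end_io;	/* handler from this file */
	/* a freshly allocated bio always has room for one more page */
	__bio_add_page(ctx->bio, page, len, poff);
}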
@@ -344,9 +340,9 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
	}

	/*
-	 * Just like mpage_readahead and block_read_full_page we always
+	 * Just like mpage_readahead and block_read_full_page, we always
	 * return 0 and just mark the page as PageError on errors.  This
-	 * should be cleaned up all through the stack eventually.
+	 * should be cleaned up throughout the stack eventually.
	 */
	return 0;
}
@@ -467,7 +463,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask)
	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
-	 * ->releasepage() via shrink_active_list(), skip those here.
+	 * ->releasepage() via shrink_active_list(); skip those here.
	 */
	if (PageDirty(page) || PageWriteback(page))
		return 0;
@@ -482,7 +478,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
	trace_iomap_invalidatepage(page->mapping->host, offset, len);

	/*
-	 * If we are invalidating the entire page, clear the dirty state from it
+	 * If we're invalidating the entire page, clear the dirty state from it
	 * and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == PAGE_SIZE) {
@@ -589,6 +585,20 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
	return 0;
}

+static int iomap_write_begin_inline(struct inode *inode,
+		struct page *page, struct iomap *srcmap)
+{
+	int ret;
+
+	/* needs more work for the tailpacking case; disable for now */
+	if (WARN_ON_ONCE(srcmap->offset != 0))
+		return -EIO;
+	ret = iomap_read_inline_data(inode, page, srcmap);
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
@@ -618,7 +628,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
	}

	if (srcmap->type == IOMAP_INLINE)
-		iomap_read_inline_data(inode, page, srcmap);
+		status = iomap_write_begin_inline(inode, page, srcmap);
	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
	else
@@ -650,13 +660,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a readpage reading them and overwriting a
-	 * partial write.  However if we have encountered a short write and only
+	 * partial write.  However, if we've encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * readpage might come in and destroy our partial write.
	 *
-	 * Do the simplest thing, and just treat any short write to a non
-	 * uptodate page as a zero-length write, and force the caller to redo
-	 * the whole thing.
+	 * Do the simplest thing and just treat any short write to a
+	 * non-uptodate page as a zero-length write, and force the caller to
+	 * redo the whole thing.
	 */
	if (unlikely(copied < len && !PageUptodate(page)))
		return 0;
@@ -671,12 +681,12 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
	void *addr;

	WARN_ON_ONCE(!PageUptodate(page));
-	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	BUG_ON(!iomap_inline_data_valid(iomap));

	flush_dcache_page(page);
-	addr = kmap_atomic(page);
-	memcpy(iomap->inline_data + pos, addr + pos, copied);
-	kunmap_atomic(addr);
+	addr = kmap_local_page(page) + pos;
+	memcpy(iomap_inline_data(iomap, pos), addr, copied);
+	kunmap_local(addr);

	mark_inode_dirty(inode);
	return copied;
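The kmap conversion above (and the matching one in iomap_read_inline_data()) leans on two properties of the kmap_local API: unlike kmap_atomic(), the mapping survives preemption, and kunmap_local() accepts any address inside the mapped page, so the pos/poff offset can stay folded into the pointer. Reduced to essentials (a sketch; sketch_copy_from_page is an invented name):

static void sketch_copy_from_page(struct page *page, size_t off,
		void *dst, size_t len)
{
	void *addr = kmap_local_page(page) + off;	/* off < PAGE_SIZE */

	memcpy(dst, addr, len);		/* may be preempted while mapped */
	kunmap_local(addr);		/* the in-page offset is masked off */
}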
@@ -744,7 +754,7 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
			bytes = length;

		/*
-		 * Bring in the user page that we will copy from _first_.
+		 * Bring in the user page that we'll copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
@@ -1153,7 +1163,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
+ * the submission process has failed after we've marked pages for writeback
 * and unlocked them.  In this situation, we need to fail the bio instead of
 * submitting it.  This typically only happens on a filesystem shutdown.
 */
@@ -1168,7 +1178,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
		error = wpc->ops->prepare_ioend(ioend, error);
	if (error) {
		/*
-		 * If we are failing the IO now, just mark the ioend with an
+		 * If we're failing the IO now, just mark the ioend with an
		 * error and finish it.  This will run IO completion immediately
		 * as there is only one reference to the ioend at this point in
		 * time.
@@ -1210,7 +1220,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
- * Note that we have to do perform the chaining in this unintuitive order
+ * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in iomap_finish_ioend().
 */
@@ -1249,7 +1259,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 * first; otherwise finish off the current ioend and start another.
 */
static void
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
@@ -1259,7 +1269,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
	sector_t sector = iomap_sector(&wpc->iomap, offset);
	unsigned len = i_blocksize(inode);
	unsigned poff = offset & (PAGE_SIZE - 1);
-	bool merged, same_page = false;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
		if (wpc->ioend)
@@ -1267,19 +1276,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
		wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
	}

-	merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
-			&same_page);
-	if (iop)
-		atomic_add(len, &iop->write_bytes_pending);
-
-	if (!merged) {
-		if (bio_full(wpc->ioend->io_bio, len)) {
-			wpc->ioend->io_bio =
-				iomap_chain_bio(wpc->ioend->io_bio);
-		}
-		bio_add_page(wpc->ioend->io_bio, page, len, poff);
+	if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) {
+		wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
+		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
	}

+	if (iop)
+		atomic_add(len, &iop->write_bytes_pending);
	wpc->ioend->io_size += len;
	wbc_account_cgroup_owner(wbc, page, len);
}
@@ -1287,9 +1290,9 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
+ * the forward progress guarantees we need to provide. The current ioend we're
+ * adding blocks to is cached in the writepage context, and if the new block
+ * doesn't append to the cached ioend, it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
@@ -1351,7 +1354,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
	if (unlikely(error)) {
		/*
		 * Let the filesystem know what portion of the current page
-		 * failed to map. If the page wasn't been added to ioend, it
+		 * failed to map. If the page hasn't been added to ioend, it
		 * won't be affected by I/O completion and we must unlock it
		 * now.
		 */
@@ -1368,7 +1371,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
	unlock_page(page);

	/*
-	 * Preserve the original error if there was one, otherwise catch
+	 * Preserve the original error if there was one; catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
@@ -1395,8 +1398,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
/*
 * Write out a dirty page.
 *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
+ * For delalloc space on the page, we need to allocate space and flush it.
+ * For unwritten space on the page, we need to start the conversion to
 * regular allocated space.
 */
static int
@@ -1411,7 +1414,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
	trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);

	/*
-	 * Refuse to write the page out if we are called from reclaim context.
+	 * Refuse to write the page out if we're called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
@@ -1456,20 +1459,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
-		 * Skip the page if it is fully outside i_size, e.g. due to a
-		 * truncate operation that is in progress. We must redirty the
+		 * Skip the page if it's fully outside i_size, e.g. due to a
+		 * truncate operation that's in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * iomap_vm_releasepage() is called on it and gets confused.
		 *
-		 * Note that the end_index is unsigned long, it would overflow
-		 * if the given offset is greater than 16TB on 32-bit system
-		 * and if we do check the page is fully outside i_size or not
-		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
-		 * will be evaluated to 0.  Hence this page will be redirtied
-		 * and be written out repeatedly which would result in an
-		 * infinite loop, the user program that perform this operation
-		 * will hang.  Instead, we can verify this situation by checking
-		 * if the page to write is totally beyond the i_size or if it's
+		 * Note that the end_index is unsigned long.  If the given
+		 * offset is greater than 16TB on a 32-bit system then if we
+		 * checked if the page is fully outside i_size with
+		 * "if (page->index >= end_index + 1)", "end_index + 1" would
+		 * overflow and evaluate to 0.  Hence this page would be
+		 * redirtied and written out repeatedly, which would result in
+		 * an infinite loop; the user program performing this operation
+		 * would hang.  Instead, we can detect this situation by
+		 * checking if the page is totally beyond i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
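The rewritten comment is easier to verify with concrete numbers. On a 32-bit system pgoff_t is a 32-bit unsigned long, so with 4096-byte pages an i_size just below 16TB makes end_index 0xffffffff; a userspace illustration of the wraparound (values are hypothetical):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* (16TB - 1) >> 12 on a 32-bit pgoff_t: */
	uint32_t end_index = ((1ULL << 44) - 1) >> 12;	/* 0xffffffff */
	uint32_t page_index = 0x12345;			/* any page at all */

	/* "end_index + 1" wraps to 0, so a naive "fully outside i_size?"
	 * test "page->index >= end_index + 1" is true for every page:
	 * the redirty-and-rewrite-forever loop described above. */
	assert(end_index + 1 == 0);
	assert(page_index >= end_index + 1);
	return 0;
}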
fs/iomap/direct-io.c  +6 −4
@@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
+	void *inline_data = iomap_inline_data(iomap, pos);
	size_t copied;

-	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
+		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
-			memset(iomap->inline_data + size, 0, pos - size);
-		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+			memset(iomap_inline_data(iomap, size), 0, pos - size);
+		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
-		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	return copied;
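One subtlety in the write branch above: when a write starts beyond EOF, the slice of the inline buffer between the old size and the write position is zeroed first, and the new iomap_inline_data() helper converts both file positions into buffer offsets. A worked example with made-up numbers:

#include <assert.h>

int main(void)
{
	/* Hypothetical inline file: i_size = 100, dio write at pos = 120
	 * that copies 30 bytes. */
	long long size = 100, pos = 120, copied = 30;

	assert(pos > size);		/* hole between EOF and write start */
	assert(pos - size == 20);	/* bytes memset() to zero first */
	assert(pos + copied == 150);	/* i_size_write() extends EOF to here */
	return 0;
}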
include/linux/iomap.h  +18 −0
@@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos)
	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}

+/*
+ * Returns the inline data pointer for logical offset @pos.
+ */
+static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos)
+{
+	return iomap->inline_data + pos - iomap->offset;
+}
+
+/*
+ * Check if the mapping's length is within the valid range for inline data.
+ * This is used to guard against accessing data beyond the page inline_data
+ * points at.
+ */
+static inline bool iomap_inline_data_valid(struct iomap *iomap)
+{
+	return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
+}
+
/*
 * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
 * and page_done will be called for each page written to.  This only applies to
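A worked example of the new validity check (hypothetical layout): if inline_data points at byte 3584 of a 4096-byte metadata page, at most 512 bytes can be addressed before running off that page, so iomap_inline_data_valid() accepts a length of 500 but would reject 513:

#include <assert.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long inline_off = 3584;	/* offset_in_page(inline_data) */

	assert(500 <= page_size - inline_off);		/* length 500: valid */
	assert(!(513 <= page_size - inline_off));	/* length 513: invalid */
	return 0;
}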