Unverified Commit 8bcd04af authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!6632 ext4: use iomap for regular file's buffered IO path and enable large folio

Merge Pull Request from: @ci-robot 
 
PR sync from: Zhang Yi <yi.zhang@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/J43RSGILPWRUKC2GUDSHWUFV3L2D62UT/ 
From: Zhang Yi <yi.zhang@huaweicloud.com>

This series converts the ext4 buffered IO path from buffer_head to iomap,
and enables large folios by default.

01-14: iomap: map multiple blocks per ->map_blocks call, by Christoph;
       backported from [1].
15:    A small debug improvement for the previous series in iomap
       map_blocks [2].
16-24: fix a stale zero data issue in xfs and make iomap_zero_range()
       not increase i_size [3].
25-29: the first part of the preparatory changes, which have been merged
       upstream [4].
30-38: the second part of the preparatory changes: fix an issue and
       support adding multiple delalloc blocks [5].
39-54: convert buffer_head to iomap; these are picked up from my
       v3 series [6].

[1] https://lore.kernel.org/linux-fsdevel/20231207072710.176093-1-hch@lst.de/
[2] https://lore.kernel.org/linux-fsdevel/20240220115759.3445025-1-yi.zhang@huaweicloud.com/
[3] https://lore.kernel.org/linux-xfs/20240320110548.2200662-1-yi.zhang@huaweicloud.com/
[4] https://lore.kernel.org/linux-ext4/20240105033018.1665752-1-yi.zhang@huaweicloud.com/
[5] https://lore.kernel.org/linux-ext4/20240410034203.2188357-1-yi.zhang@huaweicloud.com/
[6] https://lore.kernel.org/linux-ext4/20240127015825.1608160-1-yi.zhang@huaweicloud.com/

Thanks,
Yi.

Christoph Hellwig (14):
  iomap: clear the per-folio dirty bits on all writeback failures
  iomap: treat inline data in iomap_writepage_map as an I/O error
  iomap: move the io_folios field out of struct iomap_ioend
  iomap: move the PF_MEMALLOC check to iomap_writepages
  iomap: factor out a iomap_writepage_handle_eof helper
  iomap: move all remaining per-folio logic into iomap_writepage_map
  iomap: clean up the iomap_alloc_ioend calling convention
  iomap: move the iomap_sector sector calculation out of
    iomap_add_to_ioend
  iomap: don't chain bios
  iomap: only call mapping_set_error once for each failed bio
  iomap: factor out a iomap_writepage_map_block helper
  iomap: submit ioends immediately
  iomap: map multiple blocks at a time
  iomap: pass the length of the dirty region to ->map_blocks

Zhang Yi (40):
  iomap: add pos and dirty_len into trace_iomap_writepage_map
  xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional
  xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset
  xfs: convert delayed extents to unwritten when zeroing post eof blocks
  iomap: drop the write failure handles when unsharing and zeroing
  iomap: don't increase i_size if it's not a write operation
  iomap: use a new variable to handle the written bytes in
    iomap_write_iter()
  iomap: make iomap_write_end() return a boolean
  iomap: do some small logical cleanup in buffered write
  ext4: refactor ext4_da_map_blocks()
  ext4: convert to exclusive lock while inserting delalloc extents
  ext4: add a hole extent entry in cache after punch
  ext4: make ext4_map_blocks() distinguish delalloc only extent
  ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC map type
  ext4: factor out a common helper to query extent map
  ext4: check the extent status again before inserting delalloc block
  ext4: trim delalloc extent
  ext4: drop iblock parameter
  ext4: make ext4_es_insert_delayed_block() insert multi-blocks
  ext4: make ext4_da_reserve_space() reserve multi-clusters
  ext4: factor out check for whether a cluster is allocated
  ext4: make ext4_insert_delayed_block() insert multi-blocks
  ext4: make ext4_da_map_blocks() buffer_head unaware
  ext4: use reserved metadata blocks when splitting extent on endio
  ext4: factor out ext4_map_create_blocks() to allocate new blocks
  ext4: introduce seq counter for the extent status entry
  ext4: add a new iomap aops for regular file's buffered IO path
  ext4: implement buffered read iomap path
  ext4: implement buffered write iomap path
  ext4: implement writeback iomap path
  ext4: implement mmap iomap path
  ext4: implement zero_range iomap path
  ext4: writeback partial blocks before zeroing out range
  ext4: fall back to buffer_head path for defrag
  ext4: partial enable iomap for regular file's buffered IO path
  filemap: support disable large folios on active inode
  ext4: enable large folio for regular file with iomap buffered IO path
  ext4: don't mark IOMAP_F_DIRTY for buffer write
  ext4: add mount option for buffered IO iomap path


-- 
2.39.2
 
https://gitee.com/openeuler/kernel/issues/I9DN5Z 
 
Link:https://gitee.com/openeuler/kernel/pulls/6632

 

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
parents 918c8391 c5073cc5
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -467,7 +467,7 @@ static void blkdev_readahead(struct readahead_control *rac)
}

static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset)
		struct inode *inode, loff_t offset, unsigned int len)
{
	loff_t isize = i_size_read(inode);

+14 −1
Original line number Diff line number Diff line
@@ -263,8 +263,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED		BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN	BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY	BIT(BH_Boundary)
#define EXT4_MAP_DELAYED	BIT(BH_Delay)
#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
				 EXT4_MAP_DELAYED)

struct ext4_map_blocks {
	ext4_fsblk_t m_pblk;
@@ -1114,6 +1116,7 @@ struct ext4_inode_info {
	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
					   extents to shrink. Protected by
					   i_es_lock  */
	unsigned int i_es_seq;		/* modify counter for extents */

	/* ialloc */
	ext4_group_t	i_last_alloc_group;
@@ -1145,6 +1148,8 @@ struct ext4_inode_info {
	 */
	struct list_head i_rsv_conversion_list;
	struct work_struct i_rsv_conversion_work;
	struct list_head i_iomap_ioend_list;
	struct work_struct i_iomap_ioend_work;
	atomic_t i_unwritten; /* Nr. of inflight conversions pending */

	spinlock_t i_block_reservation_lock;
@@ -1250,6 +1255,7 @@ struct ext4_inode_info {
						    * scanning in mballoc
						    */
#define EXT4_MOUNT2_ABORT		0x00000100 /* Abort filesystem */
#define EXT4_MOUNT2_BUFFERED_IOMAP	0x00000200 /* Use iomap for buffered IO */

#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
						~EXT4_MOUNT_##opt
@@ -1910,6 +1916,7 @@ enum {
	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
	EXT4_STATE_BUFFERED_IOMAP,	/* Inode use iomap for buffered IO */
};

#define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -2966,6 +2973,8 @@ int ext4_walk_page_buffers(handle_t *handle,
				     struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
				struct buffer_head *bh);
bool ext4_should_use_buffered_iomap(struct inode *inode);
int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA	 2

@@ -3750,6 +3759,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
		size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
extern void ext4_iomap_end_io(struct work_struct *work);
extern void ext4_iomap_end_bio(struct bio *bio);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -3823,6 +3834,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
extern const struct iomap_ops ext4_iomap_buffered_write_ops;
extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;

static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
+6 −0
Original line number Diff line number Diff line
@@ -11,6 +11,12 @@ int ext4_inode_journal_mode(struct inode *inode)
{
	if (EXT4_JOURNAL(inode) == NULL)
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
	/*
	 * Ordered mode is no longer needed for the inode that use the
	 * iomap path, always use writeback mode.
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
	/* We do not support data journalling with delayed allocation */
	if (!S_ISREG(inode->i_mode) ||
	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+30 −12
Original line number Diff line number Diff line
@@ -3719,21 +3719,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
	ext_debug(inode, "logical block %llu, max_blocks %u\n",
		  (unsigned long long)ee_block, ee_len);

	/* If extent is larger than requested it is a clear sign that we still
	 * have some extent state machine issues left. So extent_split is still
	 * required.
	 * TODO: Once all related issues will be fixed this situation should be
	 * illegal.
	/*
	 * Inodes that use the buffered iomap path need to split extents
	 * in endio; other inodes do not.
	 *
	 * TODO: Reserve enough space for splitting extents, always split
	 * extents here, and totally remove this warning.
	 */
	if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef CONFIG_EXT4_DEBUG
		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
		if (!ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
			ext4_warning(inode->i_sb,
				     "Inode (%ld) finished: extent logical block %llu, "
				     "len %u; IO logical block %llu, len %u",
			     inode->i_ino, (unsigned long long)ee_block, ee_len,
			     (unsigned long long)map->m_lblk, map->m_len);
				     inode->i_ino, (unsigned long long)ee_block,
				     ee_len, (unsigned long long)map->m_lblk,
				     map->m_len);
		}
#endif
		err = ext4_split_convert_extents(handle, inode, map, ppath,
						 EXT4_GET_BLOCKS_CONVERT);
					EXT4_GET_BLOCKS_CONVERT |
					EXT4_GET_BLOCKS_METADATA_NOFAIL);
		if (err < 0)
			return err;
		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
@@ -4087,8 +4093,11 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
		/*
		 * The delalloc extent containing lblk, it must have been
		 * added after ext4_map_blocks() checked the extent status
		 * tree, adjust the length to the delalloc extent's after
		 * lblk.
		 * tree so we are not holding i_rwsem and delalloc info is
		 * only stabilized by i_data_sem we are going to release
		 * soon. Don't modify the extent status tree and report
		 * extent as a hole, just adjust the length to the delalloc
		 * extent's after lblk.
		 */
		len = es.es_lblk + es.es_len - lblk;
		return len;
@@ -4618,6 +4627,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
		if (ret)
			goto out_mutex;

		ret = filemap_write_and_wait_range(mapping,
				round_down(offset, 1 << blkbits), offset);
		if (ret)
			goto out_mutex;

		ret = filemap_write_and_wait_range(mapping, offset + len,
				round_up((offset + len), 1 << blkbits));
		if (ret)
			goto out_mutex;
	}

	/* Zero range excluding the unaligned edges */
+54 −22
Original line number Diff line number Diff line
@@ -204,6 +204,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
	return es->es_lblk + es->es_len - 1;
}

/*
 * Advance the inode's extent status sequence counter (i_es_seq) by one.
 *
 * The counter is loaded with READ_ONCE() and stored back with WRITE_ONCE().
 * The read-modify-write as a whole is not atomic.
 * NOTE(review): presumably callers serialize updates - confirm against the
 * call sites.
 */
static inline void ext4_es_inc_seq(struct inode *inode)
{
	struct ext4_inode_info *info = EXT4_I(inode);
	unsigned int next = READ_ONCE(info->i_es_seq) + 1;

	WRITE_ONCE(info->i_es_seq, next);
}

/*
 * search through the tree for an delayed extent with a given offset.  If
 * it can't be found, try to find next extent.
@@ -876,6 +883,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
		WARN_ON(1);
	}

	ext4_es_inc_seq(inode);
	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
@@ -1503,13 +1511,15 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return;

	ext4_es_inc_seq(inode);
	trace_ext4_es_remove_extent(inode, lblk, len);

	end = lblk + len - 1;
	BUG_ON(end < lblk);

@@ -2049,34 +2059,43 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}

/*
 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
 *                                tree, adding a pending reservation where
 *                                needed
 * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
 *                                 status tree, adding a pending reservation
 *                                 where needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 * @allocated - indicates whether a physical cluster has been allocated for
 *              the logical cluster that contains the block
 * @lblk - start logical block to be added
 * @len - length of blocks to be added
 * @lclu_allocated/end_allocated - indicates whether a physical cluster has
 *                                 been allocated for the logical cluster
 *                                 that contains the block
 */
void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
				  bool allocated)
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
				   ext4_lblk_t len, bool lclu_allocated,
				   bool end_allocated)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err1 = 0, err2 = 0, err3 = 0;
	struct extent_status *es1 = NULL;
	struct extent_status *es2 = NULL;
	struct pending_reservation *pr = NULL;
	struct pending_reservation *pr1 = NULL;
	struct pending_reservation *pr2 = NULL;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
		 lblk, inode->i_ino);
	es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);
	if (!len)
		return;

	ext4_es_inc_seq(inode);
	newes.es_lblk = lblk;
	newes.es_len = 1;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
					    end_allocated);

	ext4_es_insert_extent_check(inode, &newes);

@@ -2085,11 +2104,15 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
		es1 = __es_alloc_extent(true);
	if ((err1 || err2) && !es2)
		es2 = __es_alloc_extent(true);
	if ((err1 || err2 || err3) && allocated && !pr)
		pr = __alloc_pending(true);
	if (err1 || err2 || err3) {
		if (lclu_allocated && !pr1)
			pr1 = __alloc_pending(true);
		if (end_allocated && !pr2)
			pr2 = __alloc_pending(true);
	}
	write_lock(&EXT4_I(inode)->i_es_lock);

	err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
	err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
	if (err1 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
@@ -2109,13 +2132,22 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
		es2 = NULL;
	}

	if (allocated) {
		err3 = __insert_pending(inode, lblk, &pr);
	if (lclu_allocated) {
		err3 = __insert_pending(inode, lblk, &pr1);
		if (err3 != 0)
			goto error;
		if (pr) {
			__free_pending(pr);
			pr = NULL;
		if (pr1) {
			__free_pending(pr1);
			pr1 = NULL;
		}
	}
	if (end_allocated) {
		err3 = __insert_pending(inode, end, &pr2);
		if (err3 != 0)
			goto error;
		if (pr2) {
			__free_pending(pr2);
			pr2 = NULL;
		}
	}
error:
Loading