Commit a2daf91f authored by Jan Kara, committed by Baokun Li
Browse files

ext4: avoid deadlock in fs reclaim with page writeback

mainline inclusion
from mainline-v6.4-rc2
commit 00d873c1
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9SYGK

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=00d873c17e29cc32d90ca852b82685f1673acaa5



--------------------------------

Ext4 has a filesystem wide lock protecting ext4_writepages() calls to
avoid races with switching of journalled data flag or inode format. This
lock can however cause a deadlock like:

CPU0                            CPU1

ext4_writepages()
  percpu_down_read(sbi->s_writepages_rwsem);
                                ext4_change_inode_journal_flag()
                                  percpu_down_write(sbi->s_writepages_rwsem);
                                    - blocks, all readers block from now on
  ext4_do_writepages()
    ext4_init_io_end()
      kmem_cache_zalloc(io_end_cachep, GFP_KERNEL)
        fs_reclaim frees dentry...
          dentry_unlink_inode()
            iput() - last ref =>
              iput_final() - inode dirty =>
                write_inode_now()...
                  ext4_writepages() tries to acquire sbi->s_writepages_rwsem
                    and blocks forever

Make sure we cannot recurse into filesystem reclaim from writeback code
to avoid the deadlock.

Reported-by: <syzbot+6898da502aef574c5f8a@syzkaller.appspotmail.com>
Link: https://lore.kernel.org/all/0000000000004c66b405fa108e27@google.com


Fixes: c8585c6f ("ext4: fix races between changing inode journal mode and ext4_writepages")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230504124723.20205-1-jack@suse.cz


Signed-off-by: Theodore Ts'o <tytso@mit.edu>

Conflicts:
	fs/ext4/ext4.h
	fs/ext4/super.c
	fs/ext4/inode.c
[Because we have merged in b66c23ec ("ext4: fix race between writepages
and remount") and we don't have ext4_do_writepages() yet and i_mmap_sem
hasn't switched to invalidate_lock yet.]
Signed-off-by: Baokun Li <libaokun1@huawei.com>
parent 9e1339f3
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#include <linux/sched/mm.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -1701,6 +1702,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
	return container_of(inode, struct ext4_inode_info, vfs_inode);
}

static inline int ext4_writepages_down_read(struct super_block *sb)
{
	percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
	return memalloc_nofs_save();
}

static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
{
	memalloc_nofs_restore(ctx);
	percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
}

static inline int ext4_writepages_down_write(struct super_block *sb)
{
	percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
	return memalloc_nofs_save();
}

static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
{
	memalloc_nofs_restore(ctx);
	percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}

static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
	return ino == EXT4_ROOT_INO ||
+10 −8
Original line number Diff line number Diff line
@@ -2709,13 +2709,14 @@ static int ext4_writepages(struct address_space *mapping,
	struct blk_plug plug;
	bool give_up_on_write = false;
	unsigned long retry_warn_ddl = 0;
	int alloc_ctx;

#define RETRY_WARN_TIMEOUT (30 * HZ)

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	percpu_down_read(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_read(inode->i_sb);
	trace_ext4_writepages(inode, wbc);

	/*
@@ -2936,7 +2937,7 @@ static int ext4_writepages(struct address_space *mapping,
out_writepages:
	trace_ext4_writepages_result(inode, wbc, ret,
				     nr_to_write - wbc->nr_to_write);
	percpu_up_read(&sbi->s_writepages_rwsem);
	ext4_writepages_up_read(inode->i_sb, alloc_ctx);
	return ret;
}

@@ -2947,17 +2948,18 @@ static int ext4_dax_writepages(struct address_space *mapping,
	long nr_to_write = wbc->nr_to_write;
	struct inode *inode = mapping->host;
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	int alloc_ctx;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	percpu_down_read(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_read(inode->i_sb);
	trace_ext4_writepages(inode, wbc);

	ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
	trace_ext4_writepages_result(inode, wbc, ret,
				     nr_to_write - wbc->nr_to_write);
	percpu_up_read(&sbi->s_writepages_rwsem);
	ext4_writepages_up_read(inode->i_sb, alloc_ctx);
	return ret;
}

@@ -6107,7 +6109,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
	journal_t *journal;
	handle_t *handle;
	int err;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int alloc_ctx;

	/*
	 * We have to be very careful here: changing a data block's
@@ -6145,7 +6147,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
		}
	}

	percpu_down_write(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
	jbd2_journal_lock_updates(journal);

	/*
@@ -6162,7 +6164,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
		err = jbd2_journal_flush(journal);
		if (err < 0) {
			jbd2_journal_unlock_updates(journal);
			percpu_up_write(&sbi->s_writepages_rwsem);
			ext4_writepages_up_write(inode->i_sb, alloc_ctx);
			return err;
		}
		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -6170,7 +6172,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
	ext4_set_aops(inode);

	jbd2_journal_unlock_updates(journal);
	percpu_up_write(&sbi->s_writepages_rwsem);
	ext4_writepages_up_write(inode->i_sb, alloc_ctx);

	if (val)
		up_write(&EXT4_I(inode)->i_mmap_sem);
+6 −5
Original line number Diff line number Diff line
@@ -409,7 +409,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)

int ext4_ext_migrate(struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	handle_t *handle;
	int retval = 0, i;
	__le32 *i_data;
@@ -419,6 +418,7 @@ int ext4_ext_migrate(struct inode *inode)
	unsigned long max_entries;
	__u32 goal, tmp_csum_seed;
	uid_t owner[2];
	int alloc_ctx;

	/*
	 * If the filesystem does not support extents, or the inode
@@ -435,7 +435,7 @@ int ext4_ext_migrate(struct inode *inode)
		 */
		return retval;

	percpu_down_write(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_write(inode->i_sb);

	/*
	 * Worst case we can touch the allocation bitmaps and a block
@@ -587,7 +587,7 @@ int ext4_ext_migrate(struct inode *inode)
	unlock_new_inode(tmp_inode);
	iput(tmp_inode);
out_unlock:
	percpu_up_write(&sbi->s_writepages_rwsem);
	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
	return retval;
}

@@ -606,6 +606,7 @@ int ext4_ind_migrate(struct inode *inode)
	ext4_fsblk_t			blk;
	handle_t			*handle;
	int				ret, ret2 = 0;
	int				alloc_ctx;

	if (!ext4_has_feature_extents(inode->i_sb) ||
	    (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -622,7 +623,7 @@ int ext4_ind_migrate(struct inode *inode)
	if (test_opt(inode->i_sb, DELALLOC))
		ext4_alloc_da_blocks(inode);

	percpu_down_write(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_write(inode->i_sb);

	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
	if (IS_ERR(handle)) {
@@ -666,6 +667,6 @@ int ext4_ind_migrate(struct inode *inode)
	ext4_journal_stop(handle);
	up_write(&EXT4_I(inode)->i_data_sem);
out_unlock:
	percpu_up_write(&sbi->s_writepages_rwsem);
	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
	return ret;
}
+7 −5
Original line number Diff line number Diff line
@@ -5961,6 +5961,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
	ext4_group_t g;
	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
	int err = 0;
	int alloc_ctx;
#ifdef CONFIG_QUOTA
	int enable_quota = 0;
	int i, j;
@@ -6014,13 +6015,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
	 * here s_writepages_rwsem to avoid race between writepages ops and
	 * remount.
	 */
	percpu_down_write(&sbi->s_writepages_rwsem);
	alloc_ctx = ext4_writepages_down_write(sb);
	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
		err = -EINVAL;
		percpu_up_write(&sbi->s_writepages_rwsem);
		ext4_writepages_up_write(sb, alloc_ctx);
		goto restore_opts;
	}
	percpu_up_write(&sbi->s_writepages_rwsem);
	ext4_writepages_up_write(sb, alloc_ctx);

	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
	    test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6247,7 +6248,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
	if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
	    sb_any_quota_suspended(sb))
		dquot_resume(sb, -1);
	percpu_down_write(&sbi->s_writepages_rwsem);

	alloc_ctx = ext4_writepages_down_write(sb);
	sb->s_flags = old_sb_flags;
	sbi->s_mount_opt = old_opts.s_mount_opt;
	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6256,7 +6258,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
	sbi->s_commit_interval = old_opts.s_commit_interval;
	sbi->s_min_batch_time = old_opts.s_min_batch_time;
	sbi->s_max_batch_time = old_opts.s_max_batch_time;
	percpu_up_write(&sbi->s_writepages_rwsem);
	ext4_writepages_up_write(sb, alloc_ctx);

	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
		ext4_release_system_zone(sb);