Commit d8ad2ce8 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
 "Various bug fixes for ext4 fast commit and inline data handling.

  Also fix regression introduced as part of moving to the new mount API"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  fs/ext4: fix comments mentioning i_mutex
  ext4: fix incorrect type issue during replay_del_range
  jbd2: fix kernel-doc descriptions for jbd2_journal_shrink_{scan,count}()
  ext4: fix potential NULL pointer dereference in ext4_fill_super()
  jbd2: refactor wait logic for transaction updates into a common function
  jbd2: cleanup unused functions declarations from jbd2.h
  ext4: fix error handling in ext4_fc_record_modified_inode()
  ext4: remove redundant max inline_size check in ext4_da_write_inline_data_begin()
  ext4: fix error handling in ext4_restore_inline_data()
  ext4: fast commit may miss file actions
  ext4: fast commit may not fallback for ineligible commit
  ext4: modify the logic of ext4_mb_new_blocks_simple
  ext4: prevent used blocks from being allocated during fast commit replay
parents 18118a42 f340b3d9
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -139,7 +139,7 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
/*
 * Inode operation get_posix_acl().
 *
 * inode->i_mutex: don't care
 * inode->i_rwsem: don't care
 */
struct posix_acl *
ext4_get_acl(struct inode *inode, int type, bool rcu)
@@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu)
/*
 * Set the access or default ACL of an inode.
 *
 * inode->i_mutex: down unless called from ext4_new_inode
 * inode->i_rwsem: down unless called from ext4_new_inode
 */
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -271,8 +271,8 @@ ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
/*
 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
 *
 * dir->i_mutex: down
 * inode->i_mutex: up (access to inode is still exclusive)
 * dir->i_rwsem: down
 * inode->i_rwsem: up (access to inode is still exclusive)
 */
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+9 −8
Original line number Diff line number Diff line
@@ -1028,7 +1028,7 @@ struct ext4_inode_info {

	/*
	 * Extended attributes can be read independently of the main file
	 * data. Taking i_mutex even when reading would cause contention
	 * data. Taking i_rwsem even when reading would cause contention
	 * between readers of EAs and writers of regular file data, so
	 * instead we synchronize on xattr_sem when reading or changing
	 * EAs.
@@ -1750,6 +1750,7 @@ struct ext4_sb_info {
	spinlock_t s_fc_lock;
	struct buffer_head *s_fc_bh;
	struct ext4_fc_stats s_fc_stats;
	tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
	int s_fc_debug_max_replay;
#endif
@@ -1795,10 +1796,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
enum {
	EXT4_MF_MNTDIR_SAMPLED,
	EXT4_MF_FS_ABORTED,	/* Fatal error detected */
	EXT4_MF_FC_INELIGIBLE,	/* Fast commit ineligible */
	EXT4_MF_FC_COMMITTING	/* File system underoing a fast
				 * commit.
				 */
	EXT4_MF_FC_INELIGIBLE	/* Fast commit ineligible */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -2926,7 +2924,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
@@ -2935,6 +2933,9 @@ void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
			   ext4_lblk_t lblk, ext4_fsblk_t pblk,
			   int len, int replay);

/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
@@ -3407,7 +3408,7 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/* Update i_disksize. Requires i_mutex to avoid races with truncate */
/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
@@ -3418,7 +3419,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
	up_write(&EXT4_I(inode)->i_data_sem);
}

/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
	int changed = 0;
+1 −1
Original line number Diff line number Diff line
@@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
/*
 * This function controls whether or not we should try to go down the
 * dioread_nolock code paths, which makes it safe to avoid taking
 * i_mutex for direct I/O reads.  This only works for extent-based
 * i_rwsem for direct I/O reads.  This only works for extent-based
 * files, and it doesn't work if data journaling is enabled, since the
 * dioread_nolock code uses b_private to pass information back to the
 * I/O completion handler, and this conflicts with the jbd's use of
+10 −6
Original line number Diff line number Diff line
@@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
	 * moment, get_block can be called only for blocks inside i_size since
	 * page cache has been already dropped and writes are blocked by
	 * i_mutex. So we can safely drop the i_data_sem here.
	 * i_rwsem. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	ext4_discard_preallocations(inode, 0);
@@ -4572,7 +4572,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,

	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;

	/* Wait all existing dio workers, newcomers will block on i_mutex */
	/* Wait all existing dio workers, newcomers will block on i_rwsem */
	inode_dio_wait(inode);

	/* Preallocate the range including the unaligned edges */
@@ -4738,7 +4738,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
			goto out;
	}

	/* Wait all existing dio workers, newcomers will block on i_mutex */
	/* Wait all existing dio workers, newcomers will block on i_rwsem */
	inode_dio_wait(inode);

	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
@@ -5334,7 +5334,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
		ret = PTR_ERR(handle);
		goto out_mmap;
	}
	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode, 0);
@@ -5474,7 +5474,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
		ret = PTR_ERR(handle);
		goto out_mmap;
	}
	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);

	/* Expand file to avoid data loss if there is error while shifting */
	inode->i_size += len;
@@ -5571,7 +5571,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 * stuff such as page-cache locking consistency, bh mapping consistency or
 * extent's data copying must be performed by caller.
 * Locking:
 * 		i_mutex is held for both inodes
 *		i_rwsem is held for both inodes
 * 		i_data_sem is locked for write for both inodes
 * Assumptions:
 *		All pages from requested range are locked for both inodes
@@ -6091,11 +6091,15 @@ int ext4_ext_clear_bb(struct inode *inode)

					ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 0);
					ext4_fc_record_regions(inode->i_sb, inode->i_ino,
							0, path[j].p_block, 1, 1);
				}
				ext4_ext_drop_refs(path);
				kfree(path);
			}
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			ext4_fc_record_regions(inode->i_sb, inode->i_ino,
					map.m_lblk, map.m_pblk, map.m_len, 1);
		}
		cur = cur + map.m_len;
	}
+78 −55
Original line number Diff line number Diff line
@@ -300,18 +300,32 @@ void ext4_fc_del(struct inode *inode)
}

/*
 * Mark file system as fast commit ineligible. This means that next commit
 * operation would result in a full jbd2 commit.
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -361,7 +375,8 @@ static int ext4_fc_track_template(
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
@@ -387,7 +402,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}
@@ -400,7 +415,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
				EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
@@ -414,7 +429,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
@@ -502,7 +518,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

@@ -879,7 +895,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
@@ -1179,7 +1194,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1197,6 +1212,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
@@ -1226,8 +1242,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
@@ -1392,14 +1410,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
				state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
@@ -1431,7 +1450,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);
	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
@@ -1563,16 +1584,23 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
}

/*
 * Record physical disk regions which are in use as per fast commit area. Our
 * simple replay phase allocator excludes these regions from allocation.
 * Record physical disk regions which are in use as per fast commit area,
 * and used by inodes during replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * during replay phase, the fc_regions_valid may not same as
	 * fc_regions_used, update it when do new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
@@ -1590,6 +1618,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino,
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

@@ -1621,6 +1652,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
@@ -1638,18 +1671,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
@@ -1663,10 +1692,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			if (ret)
				goto out;
			goto next;
		}

@@ -1679,10 +1706,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
@@ -1702,10 +1727,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
@@ -1717,6 +1740,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}
@@ -1746,6 +1770,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
@@ -1755,10 +1781,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
@@ -1770,18 +1794,17 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, lrange.fc_lblk,
				lrange.fc_lblk + lrange.fc_len - 1);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret) {
		iput(inode);
		return 0;
	}
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);

	return 0;
}

@@ -1937,7 +1960,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
Loading