Commit d4f5258e authored by Jan Kara

ext4: Convert to use mapping->invalidate_lock



Convert ext4 to use mapping->invalidate_lock instead of its private
EXT4_I(inode)->i_mmap_sem. This is mostly a search-and-replace conversion.
It also fixes a long-standing race between the hole punching and
read(2)/readahead(2) paths that can lead to stale page cache contents.
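
Since the lock now lives in struct address_space, the generic page cache
fill paths (filemap_fault(), readahead) can take it too, which the private
per-inode semaphore could never cover; that is where the race fix comes
from. The conversion itself is a one-for-one replacement. A minimal sketch
of the correspondence, with mapping == inode->i_mapping (or
file->f_mapping) throughout:

    down_write(&EXT4_I(inode)->i_mmap_sem)  ->  filemap_invalidate_lock(mapping)
    up_write(&EXT4_I(inode)->i_mmap_sem)    ->  filemap_invalidate_unlock(mapping)
    down_read(&EXT4_I(inode)->i_mmap_sem)   ->  filemap_invalidate_lock_shared(mapping)
    up_read(&EXT4_I(inode)->i_mmap_sem)     ->  filemap_invalidate_unlock_shared(mapping)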

CC: <linux-ext4@vger.kernel.org>
CC: Ted Tso <tytso@mit.edu>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
parent 7506ae6a
fs/ext4/ext4.h  +0 −10
@@ -1086,15 +1086,6 @@ struct ext4_inode_info {
	 * by other means, so we have i_data_sem.
	 */
	struct rw_semaphore i_data_sem;
-	/*
-	 * i_mmap_sem is for serializing page faults with truncate / punch hole
-	 * operations. We have to make sure that new page cannot be faulted in
-	 * a section of the inode that is being punched. We cannot easily use
-	 * i_data_sem for this since we need protection for the whole punch
-	 * operation and i_data_sem ranks below transaction start so we have
-	 * to occasionally drop it.
-	 */
-	struct rw_semaphore i_mmap_sem;
	struct inode vfs_inode;
	struct jbd2_inode *jinode;

@@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
			     loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
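
For reference, the helpers used throughout the hunks below are thin
wrappers over the new rw_semaphore embedded in struct address_space, as
added to <linux/fs.h> earlier in this patch series:

    static inline void filemap_invalidate_lock(struct address_space *mapping)
    {
            down_write(&mapping->invalidate_lock);
    }

    static inline void filemap_invalidate_unlock(struct address_space *mapping)
    {
            up_write(&mapping->invalidate_lock);
    }

    static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
    {
            down_read(&mapping->invalidate_lock);
    }

    static inline void filemap_invalidate_unlock_shared(struct address_space *mapping)
    {
            up_read(&mapping->invalidate_lock);
    }

So the ordering the removed comment describes is preserved: i_rwsem ranks
above invalidate_lock, and invalidate_lock above transaction start (and
hence above i_data_sem).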
fs/ext4/extents.c  +14 −11
@@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
			    loff_t len, int mode)
{
	struct inode *inode = file_inode(file);
+	struct address_space *mapping = file->f_mapping;
	handle_t *handle = NULL;
	unsigned int max_blocks;
	loff_t new_size = 0;
@@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
		 * Prevent page faults from reinstantiating pages we have
		 * released from page cache.
		 */
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(mapping);

		ret = ext4_break_layouts(inode);
		if (ret) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
			goto out_mutex;
		}

		ret = ext4_update_disksize_before_punch(inode, offset, len);
		if (ret) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
			goto out_mutex;
		}
		/* Now release the pages and zero block aligned part of pages */
@@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,

		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
					     flags);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
		if (ret)
			goto out_mutex;
	}
@@ -5221,6 +5222,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
	ext4_lblk_t punch_start, punch_stop;
	handle_t *handle;
	unsigned int credits;
@@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
	 * Prevent page faults from reinstantiating pages we have released from
	 * page cache.
	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);

	ret = ext4_break_layouts(inode);
	if (ret)
@@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
	 * Write tail of the last page before removed range since it will get
	 * removed from the page cache below.
	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+	ret = filemap_write_and_wait_range(mapping, ioffset, offset);
	if (ret)
		goto out_mmap;
	/*
	 * Write data that will be shifted to preserve them when discarding
	 * page cache below. We are also protected from pages becoming dirty
-	 * by i_mmap_sem.
+	 * by i_rwsem and invalidate_lock.
	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+	ret = filemap_write_and_wait_range(mapping, offset + len,
					   LLONG_MAX);
	if (ret)
		goto out_mmap;
@@ -5350,7 +5352,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
	ext4_journal_stop(handle);
	ext4_fc_stop_ineligible(sb);
out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
out_mutex:
	inode_unlock(inode);
	return ret;
@@ -5367,6 +5369,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
	handle_t *handle;
	struct ext4_ext_path *path;
	struct ext4_extent *extent;
@@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
	 * Prevent page faults from reinstantiating pages we have released from
	 * page cache.
	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);

	ret = ext4_break_layouts(inode);
	if (ret)
@@ -5526,7 +5529,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
	ext4_journal_stop(handle);
	ext4_fc_stop_ineligible(sb);
out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
out_mutex:
	inode_unlock(inode);
	return ret;
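
All three fallocate-family paths above now share one shape: take i_rwsem,
block page faults via the invalidate lock, wait out busy DAX pages, drop
the page cache, then edit the extent tree. A condensed sketch under those
assumptions (hypothetical function name; most error handling and the
journalling elided):

    static int ext4_remap_range_sketch(struct inode *inode, loff_t offset,
                                       loff_t len)
    {
            struct address_space *mapping = inode->i_mapping;
            int ret;

            inode_lock(inode);
            /* ... argument checks, inode_dio_wait(inode), ... */

            /* Prevent faults from reinstantiating pages we release below. */
            filemap_invalidate_lock(mapping);
            ret = ext4_break_layouts(inode);    /* wait for busy DAX pages */
            if (ret)
                    goto out_mmap;

            truncate_pagecache_range(inode, offset, offset + len - 1);
            /* ... zero/shift extents under a journal handle ... */
            ret = 0;

    out_mmap:
            filemap_invalidate_unlock(mapping);
            inode_unlock(inode);
            return ret;
    }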
fs/ext4/file.c  +7 −6
@@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock_shared(mapping);
retry:
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
		if (IS_ERR(handle)) {
-			up_read(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock_shared(mapping);
			sb_end_pagefault(sb);
			return VM_FAULT_SIGBUS;
		}
	} else {
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock_shared(mapping);
	}
	result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
	if (write) {
@@ -731,10 +732,10 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
		/* Handling synchronous page fault? */
		if (result & VM_FAULT_NEEDDSYNC)
			result = dax_finish_sync_fault(vmf, pe_size, pfn);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(mapping);
		sb_end_pagefault(sb);
	} else {
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(mapping);
	}

	return result;
@@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
-	.fault		= ext4_filemap_fault,
+	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = ext4_page_mkwrite,
};
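
The .fault change is the visible payoff: ext4_filemap_fault() existed only
to wrap filemap_fault() in i_mmap_sem (its removal appears in the inode.c
hunks below), and the generic handler can now take the lock itself.
Heavily condensed, and only on the slow paths that actually add pages to
the cache, mm/filemap.c now does roughly:

    vm_fault_t filemap_fault(struct vm_fault *vmf)
    {
            /* ... fast path: page already cached and uptodate, no lock ... */

            /* Slow path: serialize reading the page in against truncation. */
            filemap_invalidate_lock_shared(mapping);
            /* ... allocate the page, ->readpage(), add to page cache ... */
            filemap_invalidate_unlock_shared(mapping);
            /* ... */
    }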
fs/ext4/inode.c  +17 −30
@@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
	return ret;
}

-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
{
-	up_write(&ei->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
	schedule();
-	down_write(&ei->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
}

int ext4_break_layouts(struct inode *inode)
{
-	struct ext4_inode_info *ei = EXT4_I(inode);
	struct page *page;
	int error;

-	if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
		return -EINVAL;

	do {
@@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode)
		error = ___wait_var_event(&page->_refcount,
				atomic_read(&page->_refcount) == 1,
				TASK_INTERRUPTIBLE, 0, 0,
-				ext4_wait_dax_page(ei));
+				ext4_wait_dax_page(inode));
	} while (error == 0);

	return error;
@@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)

	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	if (ext4_has_inline_data(inode)) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(mapping);
		ret = ext4_convert_inline_data(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
		if (ret)
			return ret;
	}
@@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
	 * Prevent page faults from reinstantiating pages we have released from
	 * page cache.
	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(mapping);

	ret = ext4_break_layouts(inode);
	if (ret)
@@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
out_stop:
	ext4_journal_stop(handle);
out_dio:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(mapping);
out_mutex:
	inode_unlock(inode);
	return ret;
@@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
			inode_dio_wait(inode);
		}

-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);

		rc = ext4_break_layouts(inode);
		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(inode->i_mapping);
			goto err_out;
		}

@@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
				error = rc;
		}
out_mmap_sem:
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);
	}

	if (!error) {
@@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
	 * data (and journalled aops don't know how to handle these cases).
	 */
	if (val) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);
		err = filemap_write_and_wait(inode->i_mapping);
		if (err < 0) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
+			filemap_invalidate_unlock(inode->i_mapping);
			return err;
		}
	}
@@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
	percpu_up_write(&sbi->s_writepages_rwsem);

	if (val)
-		up_write(&EXT4_I(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);

	/* Finally we can mark the inode as dirty. */

@@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);

-	down_read(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock_shared(mapping);

	err = ext4_convert_inline_data(inode);
	if (err)
@@ -6176,7 +6175,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
out_ret:
	ret = block_page_mkwrite_return(err);
out:
-	up_read(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock_shared(mapping);
	sb_end_pagefault(inode->i_sb);
	return ret;
out_error:
@@ -6184,15 +6183,3 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
	ext4_journal_stop(handle);
	goto out;
}

-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	vm_fault_t ret;
-
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	ret = filemap_fault(vmf);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-
-	return ret;
-}
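
ext4_wait_dax_page() is the one spot where the conversion is more than
renaming: the helper cycles the lock around schedule(), so it now needs
the inode (to reach i_mapping) rather than the ext4_inode_info. For
context, the retry loop it serves reads as follows after this patch,
assuming the standard dax_layout_busy_page() check sits in the context
lines the hunk elides:

    int ext4_break_layouts(struct inode *inode)
    {
            struct page *page;
            int error;

            if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                    return -EINVAL;

            do {
                    page = dax_layout_busy_page(inode->i_mapping);
                    if (!page)
                            return 0;

                    /* Sleep until the page's refcount drops to 1 (only the
                     * DAX entry holds it); ext4_wait_dax_page() drops
                     * invalidate_lock while we wait. */
                    error = ___wait_var_event(&page->_refcount,
                                    atomic_read(&page->_refcount) == 1,
                                    TASK_INTERRUPTIBLE, 0, 0,
                                    ext4_wait_dax_page(inode));
            } while (error == 0);

            return error;
    }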
fs/ext4/ioctl.c  +2 −2
@@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
		goto journal_err_out;
	}

-	down_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
	err = filemap_write_and_wait(inode->i_mapping);
	if (err)
		goto err_out;
@@ -256,7 +256,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
	ext4_double_up_write_data_sem(inode, inode_bl);

err_out:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
journal_err_out:
	unlock_two_nondirectories(inode, inode_bl);
	iput(inode_bl);