Commit 8bcbbe9c authored by Jan Kara's avatar Jan Kara
Browse files

fuse: Convert to using invalidate_lock



Use invalidate_lock instead of fuse's private i_mmap_sem. The intended
purpose is exactly the same. By this conversion we fix a long standing
race between hole punching and read(2) / readahead(2) paths that can
lead to stale page cache contents.

CC: Miklos Szeredi <miklos@szeredi.hu>
Reviewed-by: default avatarMiklos Szeredi <mszeredi@redhat.com>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
parent edc6d01b
Loading
Loading
Loading
Loading
+24 −26
Original line number Diff line number Diff line
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
	/*
	 * Can't do inline reclaim in fault path. We call
	 * dax_layout_busy_page() before we free a range. And
	 * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
	 * In fault path we enter with fi->i_mmap_sem held and can't drop
	 * it. Also in fault path we hold fi->i_mmap_sem shared and not
	 * exclusive, so that creates further issues with fuse_wait_dax_page().
	 * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
	 * range to become free and retry.
	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
	 * In fault path we enter with mapping->invalidate_lock held and can't
	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
	 * and not exclusive, so that creates further issues with
	 * fuse_wait_dax_page().  Hence return -EAGAIN and fuse_dax_fault()
	 * will wait for a memory range to become free and retry.
	 */
	if (flags & IOMAP_FAULT) {
		alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
	down_write(&fi->dax->sem);
	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);

	/* We are holding either inode lock or i_mmap_sem, and that should
	/* We are holding either inode lock or invalidate_lock, and that should
	 * ensure that dmap can't be truncated. We are holding a reference
	 * on dmap and that should make sure it can't be reclaimed. So dmap
	 * should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {

static void fuse_wait_dax_page(struct inode *inode)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	up_write(&fi->i_mmap_sem);
	filemap_invalidate_unlock(inode->i_mapping);
	schedule();
	down_write(&fi->i_mmap_sem);
	filemap_invalidate_lock(inode->i_mapping);
}

/* Should be called with fi->i_mmap_sem lock held exclusively */
/* Should be called with mapping->invalidate_lock held exclusively */
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
				    loff_t start, loff_t end)
{
@@ -813,18 +811,18 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
	 * we do not want any read/write/mmap to make progress and try
	 * to populate page cache or access memory we are trying to free.
	 */
	down_read(&get_fuse_inode(inode)->i_mmap_sem);
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
	if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
		error = 0;
		retry = true;
		up_read(&get_fuse_inode(inode)->i_mmap_sem);
		filemap_invalidate_unlock_shared(inode->i_mapping);
		goto retry;
	}

	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	up_read(&get_fuse_inode(inode)->i_mmap_sem);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	if (write)
		sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
	int ret;
	struct interval_tree_node *node;

	down_write(&fi->i_mmap_sem);
	filemap_invalidate_lock(inode->i_mapping);

	/* Lookup a dmap and corresponding file offset to reclaim. */
	down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
out_write_dmap_sem:
	up_write(&fi->dax->sem);
out_mmap_sem:
	up_write(&fi->i_mmap_sem);
	filemap_invalidate_unlock(inode->i_mapping);
	return dmap;
}

@@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
		 * had a reference or some other temporary failure,
		 * Try again. We want to give up inline reclaim only
		 * if there is no range assigned to this node. Otherwise
		 * if a deadlock is possible if we sleep with fi->i_mmap_sem
		 * held and worker to free memory can't make progress due
		 * to unavailability of fi->i_mmap_sem lock. So sleep
		 * only if fi->dax->nr=0
		 * if a deadlock is possible if we sleep with
		 * mapping->invalidate_lock held and worker to free memory
		 * can't make progress due to unavailability of
		 * mapping->invalidate_lock.  So sleep only if fi->dax->nr=0
		 */
		if (retry)
			continue;
@@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
		 * There are no mappings which can be reclaimed. Wait for one.
		 * We are not holding fi->dax->sem. So it is possible
		 * that range gets added now. But as we are not holding
		 * fi->i_mmap_sem, worker should still be able to free up
		 * a range and wake us up.
		 * mapping->invalidate_lock, worker should still be able to
		 * free up a range and wake us up.
		 */
		if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
			if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
/*
 * Free a range of memory.
 * Locking:
 * 1. Take fi->i_mmap_sem to block dax faults.
 * 1. Take mapping->invalidate_lock to block dax faults.
 * 2. Take fi->dax->sem to protect interval tree and also to make sure
 *    read/write can not reuse a dmap which we might be freeing.
 */
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
	loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
	loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;

	down_write(&fi->i_mmap_sem);
	filemap_invalidate_lock(inode->i_mapping);
	ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
	if (ret) {
		pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
	ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
	up_write(&fi->dax->sem);
out_mmap_sem:
	up_write(&fi->i_mmap_sem);
	filemap_invalidate_unlock(inode->i_mapping);
	return ret;
}

+6 −5
Original line number Diff line number Diff line
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_conn *fc = fm->fc;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct address_space *mapping = inode->i_mapping;
	FUSE_ARGS(args);
	struct fuse_setattr_in inarg;
	struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
	}

	if (FUSE_IS_DAX(inode) && is_truncate) {
		down_write(&fi->i_mmap_sem);
		filemap_invalidate_lock(mapping);
		fault_blocked = true;
		err = fuse_dax_break_layouts(inode, 0, 0);
		if (err) {
			up_write(&fi->i_mmap_sem);
			filemap_invalidate_unlock(mapping);
			return err;
		}
	}
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
	if ((is_truncate || !is_wb) &&
	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
		truncate_pagecache(inode, outarg.attr.size);
		invalidate_inode_pages2(inode->i_mapping);
		invalidate_inode_pages2(mapping);
	}

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
out:
	if (fault_blocked)
		up_write(&fi->i_mmap_sem);
		filemap_invalidate_unlock(mapping);

	return 0;

@@ -1711,7 +1712,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (fault_blocked)
		up_write(&fi->i_mmap_sem);
		filemap_invalidate_unlock(mapping);
	return err;
}

+5 −5
Original line number Diff line number Diff line
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
	}

	if (dax_truncate) {
		down_write(&get_fuse_inode(inode)->i_mmap_sem);
		filemap_invalidate_lock(inode->i_mapping);
		err = fuse_dax_break_layouts(inode, 0, 0);
		if (err)
			goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)

out:
	if (dax_truncate)
		up_write(&get_fuse_inode(inode)->i_mmap_sem);
		filemap_invalidate_unlock(inode->i_mapping);

	if (is_wb_truncate | dax_truncate) {
		fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
	if (lock_inode) {
		inode_lock(inode);
		if (block_faults) {
			down_write(&fi->i_mmap_sem);
			filemap_invalidate_lock(inode->i_mapping);
			err = fuse_dax_break_layouts(inode, 0, 0);
			if (err)
				goto out;
@@ -2976,7 +2976,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (block_faults)
		up_write(&fi->i_mmap_sem);
		filemap_invalidate_unlock(inode->i_mapping);

	if (lock_inode)
		inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
	 * modifications.  Yet this does give less guarantees than if the
	 * copying was performed with write(2).
	 *
	 * To fix this a i_mmap_sem style lock could be used to prevent new
	 * To fix this a mapping->invalidate_lock could be used to prevent new
	 * faults while the copy is ongoing.
	 */
	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+0 −7
Original line number Diff line number Diff line
@@ -149,13 +149,6 @@ struct fuse_inode {
	/** Lock to protect write related fields */
	spinlock_t lock;

	/**
	 * Can't take inode lock in fault path (leads to circular dependency).
	 * Introduce another semaphore which can be taken in fault path and
	 * then other filesystem paths can take this to block faults.
	 */
	struct rw_semaphore i_mmap_sem;

#ifdef CONFIG_FUSE_DAX
	/*
	 * Dax specific inode data
+0 −1
Original line number Diff line number Diff line
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
	fi->orig_ino = 0;
	fi->state = 0;
	mutex_init(&fi->mutex);
	init_rwsem(&fi->i_mmap_sem);
	spin_lock_init(&fi->lock);
	fi->forget = fuse_alloc_forget();
	if (!fi->forget)