Commit aa99f3c2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'hole_punch_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull fs hole punching vs cache filling race fixes from Jan Kara:
 "Fix races leading to possible data corruption or stale data exposure
  in multiple filesystems when hole punching races with operations such
  as readahead.

  This is the series I was sending for the last merge window but with
  your objection fixed - now filemap_fault() has been modified to take
  invalidate_lock only when we need to create new page in the page cache
  and / or bring it uptodate"

* tag 'hole_punch_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  filesystems/locking: fix Malformed table warning
  cifs: Fix race between hole punch and page fault
  ceph: Fix race between hole punch and page fault
  fuse: Convert to using invalidate_lock
  f2fs: Convert to using invalidate_lock
  zonefs: Convert to using invalidate_lock
  xfs: Convert double locking of MMAPLOCK to use VFS helpers
  xfs: Convert to use invalidate_lock
  xfs: Refactor xfs_isilocked()
  ext2: Convert to using invalidate_lock
  ext4: Convert to use mapping->invalidate_lock
  mm: Add functions to lock invalidate_lock for two mappings
  mm: Protect operations adding pages to page cache with invalidate_lock
  documentation: Sync file_operations members with reality
  mm: Fix comments mentioning i_mutex
parents a1ca8e71 7882c55e
Loading
Loading
Loading
Loading
+53 −26
Original line number Diff line number Diff line
@@ -271,19 +271,19 @@ prototypes::
locking rules:
	All except set_page_dirty and freepage may block

======================	======================== =========
ops			PageLocked(page)	 i_rwsem
======================	======================== =========
======================	======================== =========	===============
ops			PageLocked(page)	 i_rwsem	invalidate_lock
======================	======================== =========	===============
writepage:		yes, unlocks (see below)
readpage:		yes, unlocks
readpage:		yes, unlocks				shared
writepages:
set_page_dirty		no
readahead:		yes, unlocks
readpages:		no
readahead:		yes, unlocks				shared
readpages:		no					shared
write_begin:		locks the page		 exclusive
write_end:		yes, unlocks		 exclusive
bmap:
invalidatepage:		yes
invalidatepage:		yes					exclusive
releasepage:		yes
freepage:		yes
direct_IO:
@@ -295,7 +295,7 @@ is_partially_uptodate: yes
error_remove_page:	yes
swap_activate:		no
swap_deactivate:	no
======================	======================== =========
======================	======================== =========	===============

->write_begin(), ->write_end() and ->readpage() may be called from
the request handler (/dev/loop).
@@ -378,7 +378,10 @@ keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
some or all of the buffers from the page when it is being truncated. It
returns zero on success. If ->invalidatepage is zero, the kernel uses
block_invalidatepage() instead.
block_invalidatepage() instead. The filesystem must exclusively acquire
invalidate_lock before invalidating page cache in truncate / hole punch path
(and thus calling into ->invalidatepage) to block races between page cache
invalidation and page cache filling functions (fault, read, ...).

->releasepage() is called when the kernel is about to try to drop the
buffers from the page in preparation for freeing it.  It returns zero to
@@ -506,6 +509,7 @@ prototypes::
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iopoll) (struct kiocb *kiocb, bool spin);
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -518,12 +522,6 @@ prototypes::
	int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
			loff_t *);
	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
			loff_t *);
	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
			void __user *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
			loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
@@ -536,6 +534,14 @@ prototypes::
			size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *, int, loff_t, loff_t);
	void (*show_fdinfo)(struct seq_file *m, struct file *f);
	unsigned (*mmap_capabilities)(struct file *);
	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
			loff_t, size_t, unsigned int);
	loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
			struct file *file_out, loff_t pos_out,
			loff_t len, unsigned int remap_flags);
	int (*fadvise)(struct file *, loff_t, loff_t, int);

locking rules:
	All may block.
@@ -570,6 +576,25 @@ in sys_read() and friends.
the lease within the individual filesystem to record the result of the
operation

->fallocate implementation must be really careful to maintain page cache
consistency when punching holes or performing other operations that invalidate
page cache contents. Usually the filesystem needs to call
truncate_inode_pages_range() to invalidate relevant range of the page cache.
However the filesystem usually also needs to update its internal (and on disk)
view of file offset -> disk block mapping. Until this update is finished, the
filesystem needs to block page faults and reads from reloading now-stale page
cache contents from the disk. Since VFS acquires mapping->invalidate_lock in
shared mode when loading pages from disk (filemap_fault(), filemap_read(),
readahead paths), the fallocate implementation must take the invalidate_lock to
prevent reloading.

->copy_file_range and ->remap_file_range implementations need to serialize
against modifications of file data while the operation is running. For
blocking changes through write(2) and similar operations inode->i_rwsem can be
used. To block changes to file contents via a memory mapping during the
operation, the filesystem must take mapping->invalidate_lock to coordinate
with ->page_mkwrite.

dquot_operations
================

@@ -627,11 +652,11 @@ pfn_mkwrite: yes
access:		yes
=============	=========	===========================

->fault() is called when a previously not present pte is about
to be faulted in. The filesystem must find and return the page associated
with the passed in "pgoff" in the vm_fault structure. If it is possible that
the page may be truncated and/or invalidated, then the filesystem must lock
the page, then ensure it is not already truncated (the page lock will block
->fault() is called when a previously not present pte is about to be faulted
in. The filesystem must find and return the page associated with the passed in
"pgoff" in the vm_fault structure. If it is possible that the page may be
truncated and/or invalidated, then the filesystem must lock invalidate_lock,
then ensure the page is not already truncated (invalidate_lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.

@@ -644,12 +669,14 @@ page table entry. Pointer to entry associated with the page is passed in
"pte" field in vm_fault structure. Pointers to entries for other offsets
should be calculated relative to "pte".

->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
no truncate/invalidate races, and then return with the page locked. If
the page has been truncated, the filesystem should not look up a new page
like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
will cause the VM to retry the fault.
->page_mkwrite() is called when a previously read-only pte is about to become
writeable. The filesystem again must ensure that there are no
truncate/invalidate races or races with operations such as ->remap_file_range
or ->copy_file_range, and then return with the page locked. Usually
mapping->invalidate_lock is suitable for proper serialization. If the page has
been truncated, the filesystem should not look up a new page like the ->fault()
handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to
retry the fault.

->pfn_mkwrite() is the same as page_mkwrite but when the pte is
VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is
+6 −3
Original line number Diff line number Diff line
@@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
		ret = VM_FAULT_SIGBUS;
	} else {
		struct address_space *mapping = inode->i_mapping;
		struct page *page = find_or_create_page(mapping, 0,
						mapping_gfp_constraint(mapping,
						~__GFP_FS));
		struct page *page;

		filemap_invalidate_lock_shared(mapping);
		page = find_or_create_page(mapping, 0,
				mapping_gfp_constraint(mapping, ~__GFP_FS));
		if (!page) {
			ret = VM_FAULT_OOM;
			goto out_inline;
@@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
		vmf->page = page;
		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
		filemap_invalidate_unlock_shared(mapping);
		dout("filemap_fault %p %llu read inline data ret %x\n",
		     inode, off, ret);
	}
+2 −0
Original line number Diff line number Diff line
@@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode,
	if (ret < 0)
		goto unlock;

	filemap_invalidate_lock(inode->i_mapping);
	ceph_zero_pagecache_range(inode, offset, length);
	ret = ceph_zero_objects(inode, offset, length);

@@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode,
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}
	filemap_invalidate_unlock(inode->i_mapping);

	ceph_put_cap_refs(ci, got);
unlock:
+2 −0
Original line number Diff line number Diff line
@@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
		return rc;
	}

	filemap_invalidate_lock(inode->i_mapping);
	/*
	 * We implement the punch hole through ioctl, so we need remove the page
	 * caches first, otherwise the data may be inconsistent with the server.
@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
			sizeof(struct file_zero_data_information),
			CIFSMaxBufSize, NULL, NULL);
	free_xid(xid);
	filemap_invalidate_unlock(inode->i_mapping);
	return rc;
}

+0 −11
Original line number Diff line number Diff line
@@ -667,9 +667,6 @@ struct ext2_inode_info {
	struct rw_semaphore xattr_sem;
#endif
	rwlock_t i_meta_lock;
#ifdef CONFIG_FS_DAX
	struct rw_semaphore dax_sem;
#endif

	/*
	 * truncate_mutex is for serialising ext2_truncate() against
@@ -685,14 +682,6 @@ struct ext2_inode_info {
#endif
};

#ifdef CONFIG_FS_DAX
#define dax_sem_down_write(ext2_inode)	down_write(&(ext2_inode)->dax_sem)
#define dax_sem_up_write(ext2_inode)	up_write(&(ext2_inode)->dax_sem)
#else
#define dax_sem_down_write(ext2_inode)
#define dax_sem_up_write(ext2_inode)
#endif

/*
 * Inode dynamic state flags
 */
Loading