Merge tag 'hole_punch_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs (aa99f3c2) · Commits · EulixOS / Software / Kernel

Documentation/filesystems/locking.rst

+53 −26

Original line number	Diff line number	Diff line
		@@ -271,19 +271,19 @@ prototypes::
		locking rules:
		All except set_page_dirty and freepage may block

		====================== ======================== =========
		ops PageLocked(page) i_rwsem
		====================== ======================== =========
		====================== ======================== ========= ===============
		ops PageLocked(page) i_rwsem invalidate_lock
		====================== ======================== ========= ===============
		writepage: yes, unlocks (see below)
		readpage: yes, unlocks
		readpage: yes, unlocks shared
		writepages:
		set_page_dirty no
		readahead: yes, unlocks
		readpages: no
		readahead: yes, unlocks shared
		readpages: no shared
		write_begin: locks the page exclusive
		write_end: yes, unlocks exclusive
		bmap:
		invalidatepage: yes
		invalidatepage: yes exclusive
		releasepage: yes
		freepage: yes
		direct_IO:
		@@ -295,7 +295,7 @@ is_partially_uptodate: yes
		error_remove_page: yes
		swap_activate: no
		swap_deactivate: no
		====================== ======================== =========
		====================== ======================== ========= ===============

		->write_begin(), ->write_end() and ->readpage() may be called from
		the request handler (/dev/loop).
		@@ -378,7 +378,10 @@ keep it that way and don't breed new callers.
		->invalidatepage() is called when the filesystem must attempt to drop
		some or all of the buffers from the page when it is being truncated. It
		returns zero on success. If ->invalidatepage is zero, the kernel uses
		block_invalidatepage() instead.
		block_invalidatepage() instead. The filesystem must exclusively acquire
		invalidate_lock before invalidating page cache in truncate / hole punch path
		(and thus calling into ->invalidatepage) to block races between page cache
		invalidation and page cache filling functions (fault, read, ...).

		->releasepage() is called when the kernel is about to try to drop the
		buffers from the page in preparation for freeing it. It returns zero to
		@@ -506,6 +509,7 @@ prototypes::
		ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
		ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
		ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
		int (*iopoll) (struct kiocb *kiocb, bool spin);
		int (*iterate) (struct file *, struct dir_context *);
		int (*iterate_shared) (struct file *, struct dir_context *);
		__poll_t (*poll) (struct file *, struct poll_table_struct *);
		@@ -518,12 +522,6 @@ prototypes::
		int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
		int (*fasync) (int, struct file *, int);
		int (*lock) (struct file *, int, struct file_lock *);
		ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
		loff_t *);
		ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
		loff_t *);
		ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
		void __user *);
		ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
		loff_t *, int);
		unsigned long (*get_unmapped_area)(struct file *, unsigned long,
		@@ -536,6 +534,14 @@ prototypes::
		size_t, unsigned int);
		int (*setlease)(struct file *, long, struct file_lock **, void **);
		long (*fallocate)(struct file *, int, loff_t, loff_t);
		void (*show_fdinfo)(struct seq_file *m, struct file *f);
		unsigned (*mmap_capabilities)(struct file *);
		ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
		loff_t, size_t, unsigned int);
		loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
		struct file *file_out, loff_t pos_out,
		loff_t len, unsigned int remap_flags);
		int (*fadvise)(struct file *, loff_t, loff_t, int);

		locking rules:
		All may block.
		@@ -570,6 +576,25 @@ in sys_read() and friends.
		the lease within the individual filesystem to record the result of the
		operation

		->fallocate implementation must be really careful to maintain page cache
		consistency when punching holes or performing other operations that invalidate
		page cache contents. Usually the filesystem needs to call
		truncate_inode_pages_range() to invalidate relevant range of the page cache.
		However the filesystem usually also needs to update its internal (and on disk)
		view of file offset -> disk block mapping. Until this update is finished, the
		filesystem needs to block page faults and reads from reloading now-stale page
		cache contents from the disk. Since VFS acquires mapping->invalidate_lock in
		shared mode when loading pages from disk (filemap_fault(), filemap_read(),
		readahead paths), the fallocate implementation must take the invalidate_lock to
		prevent reloading.

		->copy_file_range and ->remap_file_range implementations need to serialize
		against modifications of file data while the operation is running. For
		blocking changes through write(2) and similar operations inode->i_rwsem can be
		used. To block changes to file contents via a memory mapping during the
		operation, the filesystem must take mapping->invalidate_lock to coordinate
		with ->page_mkwrite.

		dquot_operations
		================

		@@ -627,11 +652,11 @@ pfn_mkwrite: yes
		access: yes
		============= ========= ===========================

		->fault() is called when a previously not present pte is about
		to be faulted in. The filesystem must find and return the page associated
		with the passed in "pgoff" in the vm_fault structure. If it is possible that
		the page may be truncated and/or invalidated, then the filesystem must lock
		the page, then ensure it is not already truncated (the page lock will block
		->fault() is called when a previously not present pte is about to be faulted
		in. The filesystem must find and return the page associated with the passed in
		"pgoff" in the vm_fault structure. If it is possible that the page may be
		truncated and/or invalidated, then the filesystem must lock invalidate_lock,
		then ensure the page is not already truncated (invalidate_lock will block
		subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
		locked. The VM will unlock the page.

		@@ -644,12 +669,14 @@ page table entry. Pointer to entry associated with the page is passed in
		"pte" field in vm_fault structure. Pointers to entries for other offsets
		should be calculated relative to "pte".

		->page_mkwrite() is called when a previously read-only pte is
		about to become writeable. The filesystem again must ensure that there are
		no truncate/invalidate races, and then return with the page locked. If
		the page has been truncated, the filesystem should not look up a new page
		like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
		will cause the VM to retry the fault.
		->page_mkwrite() is called when a previously read-only pte is about to become
		writeable. The filesystem again must ensure that there are no
		truncate/invalidate races or races with operations such as ->remap_file_range
		or ->copy_file_range, and then return with the page locked. Usually
		mapping->invalidate_lock is suitable for proper serialization. If the page has
		been truncated, the filesystem should not look up a new page like the ->fault()
		handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to
		retry the fault.

		->pfn_mkwrite() is the same as page_mkwrite but when the pte is
		VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is

fs/ceph/addr.c

+6 −3

Original line number	Diff line number	Diff line
		@@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
		ret = VM_FAULT_SIGBUS;
		} else {
		struct address_space *mapping = inode->i_mapping;
		struct page *page = find_or_create_page(mapping, 0,
		mapping_gfp_constraint(mapping,
		~__GFP_FS));
		struct page *page;

		filemap_invalidate_lock_shared(mapping);
		page = find_or_create_page(mapping, 0,
		mapping_gfp_constraint(mapping, ~__GFP_FS));
		if (!page) {
		ret = VM_FAULT_OOM;
		goto out_inline;
		@@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
		vmf->page = page;
		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
		out_inline:
		filemap_invalidate_unlock_shared(mapping);
		dout("filemap_fault %p %llu read inline data ret %x\n",
		inode, off, ret);
		}

fs/ceph/file.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode,
		if (ret < 0)
		goto unlock;

		filemap_invalidate_lock(inode->i_mapping);
		ceph_zero_pagecache_range(inode, offset, length);
		ret = ceph_zero_objects(inode, offset, length);

		@@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode,
		if (dirty)
		__mark_inode_dirty(inode, dirty);
		}
		filemap_invalidate_unlock(inode->i_mapping);

		ceph_put_cap_refs(ci, got);
		unlock:

fs/cifs/smb2ops.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
		return rc;
		}

		filemap_invalidate_lock(inode->i_mapping);
		/*
		* We implement the punch hole through ioctl, so we need remove the page
		* caches first, otherwise the data may be inconsistent with the server.
		@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
		sizeof(struct file_zero_data_information),
		CIFSMaxBufSize, NULL, NULL);
		free_xid(xid);
		filemap_invalidate_unlock(inode->i_mapping);
		return rc;
		}

fs/ext2/ext2.h

+0 −11

Original line number	Diff line number	Diff line
		@@ -667,9 +667,6 @@ struct ext2_inode_info {
		struct rw_semaphore xattr_sem;
		#endif
		rwlock_t i_meta_lock;
		#ifdef CONFIG_FS_DAX
		struct rw_semaphore dax_sem;
		#endif

		/*
		* truncate_mutex is for serialising ext2_truncate() against
		@@ -685,14 +682,6 @@ struct ext2_inode_info {
		#endif
		};

		#ifdef CONFIG_FS_DAX
		#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
		#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
		#else
		#define dax_sem_down_write(ext2_inode)
		#define dax_sem_up_write(ext2_inode)
		#endif

		/*
		* Inode dynamic state flags
		*/