Commit 98e24746 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring buffered writes support from Jens Axboe:
 "This contains support for buffered writes, specifically for XFS. btrfs
  is in progress, will be coming in the next release.

  io_uring does support buffered writes on any file type, but since the
  buffered write path just always -EAGAIN (or -EOPNOTSUPP) any attempt
  to do so if IOCB_NOWAIT is set, any buffered write will effectively be
  handled by io-wq offload. This isn't very efficient, and we even have
  specific code in io-wq to serialize buffered writes to the same inode
  to avoid further inefficiencies with thread offload.

  This is particularly sad since most buffered writes don't block, they
  simply copy data to a page and dirty it. With this pull request, we
  can handle buffered writes a lot more effiently.

  If balance_dirty_pages() needs to block, we back off on writes as
  indicated.

  This improves buffered write support by 2-3x.

  Jan Kara helped with the mm bits for this, and Stefan handled the
  fs/iomap/xfs/io_uring parts of it"

* tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block:
  mm: honor FGP_NOWAIT for page cache page allocation
  xfs: Add async buffered write support
  xfs: Specify lockmode when calling xfs_ilock_for_iomap()
  io_uring: Add tracepoint for short writes
  io_uring: fix issue with io_write() not always undoing sb_start_write()
  io_uring: Add support for async buffered writes
  fs: Add async write file modification handling.
  fs: Split off inode_needs_update_time and __file_update_time
  fs: add __remove_file_privs() with flags parameter
  fs: add a FMODE_BUF_WASYNC flags for f_mode
  iomap: Return -EAGAIN from iomap_write_iter()
  iomap: Add async buffered write support
  iomap: Add flags parameter to iomap_page_create()
  mm: Add balance_dirty_pages_ratelimited_flags() function
  mm: Move updates of dirty_exceeded into one place
  mm: Move starting of background writeback into the main balancing loop
parents b349b118 0dd316ba
Loading
Loading
Loading
Loading
+124 −44
Original line number Diff line number Diff line
@@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
	return notify_change(mnt_userns, dentry, &newattrs, NULL);
}

/*
 * Remove special file priviledges (suid, capabilities) when file is written
 * to or truncated.
 */
int file_remove_privs(struct file *file)
static int __file_remove_privs(struct file *file, unsigned int flags)
{
	struct dentry *dentry = file_dentry(file);
	struct inode *inode = file_inode(file);
	int error;
	int kill;
	int error = 0;

	/*
	 * Fast path for nothing security related.
	 * As well for non-regular files, e.g. blkdev inodes.
	 * For example, blkdev_write_iter() might get here
	 * trying to remove privs which it is not allowed to.
	 */
	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
		return 0;

	kill = dentry_needs_remove_privs(dentry);
	if (kill < 0)
	if (kill <= 0)
		return kill;
	if (kill)

	if (flags & IOCB_NOWAIT)
		return -EAGAIN;

	error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
	if (!error)
		inode_has_no_xattr(inode);

	return error;
}
EXPORT_SYMBOL(file_remove_privs);

/**
 *	file_update_time	-	update mtime and ctime time
 *	@file: file accessed
 * file_remove_privs - remove special file privileges (suid, capabilities)
 * @file: file to remove privileges from
 *
 * When file is modified by a write or truncation ensure that special
 * file privileges are removed.
 *
 *	Update the mtime and ctime members of an inode and mark the inode
 *	for writeback.  Note that this function is meant exclusively for
 *	usage in the file write path of filesystems, and filesystems may
 *	choose to explicitly ignore update via this function with the
 *	S_NOCMTIME inode flag, e.g. for network filesystem where these
 *	timestamps are handled by the server.  This can return an error for
 *	file systems who need to allocate space in order to update an inode.
 * Return: 0 on success, negative errno on failure.
 */
int file_remove_privs(struct file *file)
{
	return __file_remove_privs(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);

int file_update_time(struct file *file)
static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
	struct inode *inode = file_inode(file);
	struct timespec64 now;
	int sync_it = 0;
	int ret;

	/* First try to exhaust all avenues to not sync */
	if (IS_NOCMTIME(inode))
		return 0;

	now = current_time(inode);
	if (!timespec64_equal(&inode->i_mtime, &now))
	if (!timespec64_equal(&inode->i_mtime, now))
		sync_it = S_MTIME;

	if (!timespec64_equal(&inode->i_ctime, &now))
	if (!timespec64_equal(&inode->i_ctime, now))
		sync_it |= S_CTIME;

	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2079,37 +2069,127 @@ int file_update_time(struct file *file)
	if (!sync_it)
		return 0;

	/* Finally allowed to write? Takes lock. */
	if (__mnt_want_write_file(file))
		return 0;
	return sync_it;
}

	ret = inode_update_time(inode, &now, sync_it);
static int __file_update_time(struct file *file, struct timespec64 *now,
			int sync_mode)
{
	int ret = 0;
	struct inode *inode = file_inode(file);

	/* try to update time settings */
	if (!__mnt_want_write_file(file)) {
		ret = inode_update_time(inode, now, sync_mode);
		__mnt_drop_write_file(file);
	}

	return ret;
}

/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the _NOCMTIME inode
 * flag, e.g. for network filesystem where these imestamps are handled
 * by the server. This can return an error for file systems who need to
 * allocate space in order to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
	int ret;
	struct inode *inode = file_inode(file);
	struct timespec64 now = current_time(inode);

	ret = inode_needs_update_time(inode, &now);
	if (ret <= 0)
		return ret;

	return __file_update_time(file, &now, ret);
}
EXPORT_SYMBOL(file_update_time);

/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
/**
 * file_modified_flags - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 * @flags: kiocb flags
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * If IOCB_NOWAIT is set, special file privileges will not be removed and
 * time settings will not be updated. It will return -EAGAIN.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int file_modified_flags(struct file *file, int flags)
{
	int err;
	int ret;
	struct inode *inode = file_inode(file);
	struct timespec64 now = current_time(inode);

	/*
	 * Clear the security bits if the process is not being run by root.
	 * This keeps people from modifying setuid and setgid binaries.
	 */
	err = file_remove_privs(file);
	if (err)
		return err;
	ret = __file_remove_privs(file, flags);
	if (ret)
		return ret;

	if (unlikely(file->f_mode & FMODE_NOCMTIME))
		return 0;

	return file_update_time(file);
	ret = inode_needs_update_time(inode, &now);
	if (ret <= 0)
		return ret;
	if (flags & IOCB_NOWAIT)
		return -EAGAIN;

	return __file_update_time(file, &now, ret);
}

/**
 * file_modified - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_modified(struct file *file)
{
	return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);

/**
 * kiocb_modified - handle mandated vfs changes when modifying a file
 * @iocb: iocb that was modified
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int kiocb_modified(struct kiocb *iocb)
{
	return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);

int inode_needs_sync(struct inode *inode)
{
	if (IS_SYNC(inode))
+52 −15
Original line number Diff line number Diff line
@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;

static struct iomap_page *
iomap_page_create(struct inode *inode, struct folio *folio)
iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{
	struct iomap_page *iop = to_iomap_page(folio);
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
	gfp_t gfp;

	if (iop || nr_blocks <= 1)
		return iop;

	if (flags & IOMAP_NOWAIT)
		gfp = GFP_NOWAIT;
	else
		gfp = GFP_NOFS | __GFP_NOFAIL;

	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
			GFP_NOFS | __GFP_NOFAIL);
		      gfp);
	if (iop) {
		spin_lock_init(&iop->uptodate_lock);
		if (folio_test_uptodate(folio))
			bitmap_fill(iop->uptodate, nr_blocks);
		folio_attach_private(folio, iop);
	}
	return iop;
}

@@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
	if (WARN_ON_ONCE(size > iomap->length))
		return -EIO;
	if (offset > 0)
		iop = iomap_page_create(iter->inode, folio);
		iop = iomap_page_create(iter->inode, folio, iter->flags);
	else
		iop = to_iomap_page(folio);

@@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
		return iomap_read_inline_data(iter, folio);

	/* zero post-eof blocks as the page may be mapped */
	iop = iomap_page_create(iter->inode, folio);
	iop = iomap_page_create(iter->inode, folio, iter->flags);
	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
	if (plen == 0)
		goto done;
@@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		size_t len, struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct iomap_page *iop = iomap_page_create(iter->inode, folio);
	struct iomap_page *iop;
	loff_t block_size = i_blocksize(iter->inode);
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
	size_t from = offset_in_folio(folio, pos), to = from + len;
	size_t poff, plen;

@@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		return 0;
	folio_clear_error(folio);

	iop = iomap_page_create(iter->inode, folio, iter->flags);
	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
		return -EAGAIN;

	do {
		iomap_adjust_read_range(iter->inode, folio, &block_start,
				block_end - block_start, &poff, &plen);
@@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
				return -EIO;
			folio_zero_segments(folio, poff, from, to, poff + plen);
		} else {
			int status = iomap_read_folio_sync(block_start, folio,
			int status;

			if (iter->flags & IOMAP_NOWAIT)
				return -EAGAIN;

			status = iomap_read_folio_sync(block_start, folio,
					poff, plen, srcmap);
			if (status)
				return status;
@@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
	int status = 0;

	if (iter->flags & IOMAP_NOWAIT)
		fgp |= FGP_NOWAIT;

	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
			fgp, mapping_gfp_mask(iter->inode->i_mapping));
	if (!folio) {
		status = -ENOMEM;
		status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
		goto out_no_page;
	}
	if (pos + len > folio_pos(folio) + folio_size(folio))
@@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
	loff_t pos = iter->pos;
	ssize_t written = 0;
	long status = 0;
	struct address_space *mapping = iter->inode->i_mapping;
	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;

	do {
		struct folio *folio;
@@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		status = balance_dirty_pages_ratelimited_flags(mapping,
							       bdp_flags);
		if (unlikely(status))
			break;

		if (bytes > length)
			bytes = length;

@@ -760,6 +788,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * For async buffered writes the assumption is that the user
		 * page has already been faulted in. This can be optimized by
		 * faulting the user page.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
@@ -771,7 +803,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
			break;

		page = folio_file_page(folio, pos >> PAGE_SHIFT);
		if (mapping_writably_mapped(iter->inode->i_mapping))
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@@ -796,10 +828,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
	} while (iov_iter_count(i) && length);

	if (status == -EAGAIN) {
		iov_iter_revert(i, written);
		return -EAGAIN;
	}
	return written ? written : status;
}

@@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
	};
	int ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iter.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_write_iter(&iter, i);
	if (iter.pos == iocb->ki_pos)
@@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode,
		struct folio *folio, u64 end_pos)
{
	struct iomap_page *iop = iomap_page_create(inode, folio);
	struct iomap_page *iop = iomap_page_create(inode, folio, 0);
	struct iomap_ioend *ioend, *next;
	unsigned len = i_blocksize(inode);
	unsigned nblocks = i_blocks_per_folio(inode, folio);
+3 −1
Original line number Diff line number Diff line
@@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !((iocb->ki_flags & IOCB_DIRECT) ||
	      (file->f_mode & FMODE_BUF_WASYNC)))
		return -EINVAL;

	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+5 −6
Original line number Diff line number Diff line
@@ -410,7 +410,7 @@ xfs_file_write_checks(
		spin_unlock(&ip->i_flags_lock);

out:
	return file_modified(file);
	return kiocb_modified(iocb);
}

static int
@@ -700,12 +700,11 @@ xfs_file_buffered_write(
	bool			cleared_space = false;
	unsigned int		iolock;

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
@@ -1165,7 +1164,7 @@ xfs_file_open(
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
	return generic_file_open(inode, file);
}

+7 −4
Original line number Diff line number Diff line
@@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
	unsigned		flags,
	unsigned		*lockmode)
{
	unsigned		mode = XFS_ILOCK_SHARED;
	unsigned int		mode = *lockmode;
	bool			is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);

	/*
@@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
	int			nimaps = 1, error = 0;
	bool			shared = false;
	u16			iomap_flags = 0;
	unsigned		lockmode;
	unsigned int		lockmode = XFS_ILOCK_SHARED;

	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));

@@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
	bool			eof = false, cow_eof = false, shared = false;
	int			allocfork = XFS_DATA_FORK;
	int			error = 0;
	unsigned int		lockmode = XFS_ILOCK_EXCL;

	if (xfs_is_shutdown(mp))
		return -EIO;
@@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(

	ASSERT(!XFS_IS_REALTIME_INODE(ip));

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
	if (error)
		return error;

	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
	int			nimaps = 1, error = 0;
	bool			shared = false;
	unsigned		lockmode;
	unsigned int		lockmode = XFS_ILOCK_SHARED;

	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));

Loading