Commit 06c04442 authored by Kent Overstreet's avatar Kent Overstreet Committed by Linus Torvalds
Browse files

mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig

Convert generic_file_buffered_read() to get pages to read from in batches,
and then copy data to userspace from many pages at once - in particular,
we now don't touch any cachelines that might be contended while we're in
the loop to copy data to userspace.

This is is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance improvement
(1%, within the margin of error).

akpm: kernel test robot found a 32% speedup on one test:
https://lkml.kernel.org/r/20201030081456.GY31092@shao2-debian

Link: https://lkml.kernel.org/r/20201025212949.602194-3-kent.overstreet@gmail.com


Signed-off-by: default avatarKent Overstreet <kent.overstreet@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 723ef24b
Loading
Loading
Loading
Loading
+175 −138
Original line number Diff line number Diff line
@@ -2176,67 +2176,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
		return lock_page_killable(page);
}

static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
			struct iov_iter *iter,
			struct page *page)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
	unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
	unsigned int bytes, copied;
	loff_t isize, end_offset;

	BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);

	/*
	 * i_size must be checked after we know the page is Uptodate.
	 *
	 * Checking i_size after the check allows us to calculate
	 * the correct value for "bytes", which means the zero-filled
	 * part of the page is not copied back to userspace (unless
	 * another truncate extends the file - this is desired though).
	 */

	isize = i_size_read(inode);
	if (unlikely(iocb->ki_pos >= isize))
		return 1;

	end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

	bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);

	/* If users can be writing to this page using arbitrary
	 * virtual addresses, take care about potential aliasing
	 * before reading the page on the kernel side.
	 */
	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);

	/*
	 * Ok, we have the page, and it's up-to-date, so
	 * now we can copy it to user space...
	 */

	copied = copy_page_to_iter(page, offset, bytes, iter);

	iocb->ki_pos += copied;

	/*
	 * When a sequential read accesses a page several times,
	 * only mark it as accessed the first time.
	 */
	if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
		mark_page_accessed(page);

	ra->prev_pos = iocb->ki_pos;

	if (copied < bytes)
		return -EFAULT;

	return !iov_iter_count(iter) || iocb->ki_pos == isize;
}

static struct page *
generic_file_buffered_read_readpage(struct kiocb *iocb,
				    struct file *filp,
@@ -2394,6 +2333,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
	return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
}

static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
						struct iov_iter *iter,
						struct page **pages,
						unsigned int nr)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	int i, j, nr_got, err = 0;

	nr = min_t(unsigned long, last_index - index, nr);
find_page:
	if (fatal_signal_pending(current))
		return -EINTR;

	nr_got = find_get_pages_contig(mapping, index, nr, pages);
	if (nr_got)
		goto got_pages;

	if (iocb->ki_flags & IOCB_NOIO)
		return -EAGAIN;

	page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);

	nr_got = find_get_pages_contig(mapping, index, nr, pages);
	if (nr_got)
		goto got_pages;

	pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
	err = PTR_ERR_OR_ZERO(pages[0]);
	if (!IS_ERR_OR_NULL(pages[0]))
		nr_got = 1;
got_pages:
	for (i = 0; i < nr_got; i++) {
		struct page *page = pages[i];
		pgoff_t pg_index = index + i;
		loff_t pg_pos = max(iocb->ki_pos,
				    (loff_t) pg_index << PAGE_SHIFT);
		loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;

		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				for (j = i; j < nr_got; j++)
					put_page(pages[j]);
				nr_got = i;
				err = -EAGAIN;
				break;
			}
			page_cache_async_readahead(mapping, ra, filp, page,
					pg_index, last_index - pg_index);
		}

		if (!PageUptodate(page)) {
			if ((iocb->ki_flags & IOCB_NOWAIT) ||
			    ((iocb->ki_flags & IOCB_WAITQ) && i)) {
				for (j = i; j < nr_got; j++)
					put_page(pages[j]);
				nr_got = i;
				err = -EAGAIN;
				break;
			}

			page = generic_file_buffered_read_pagenotuptodate(iocb,
					filp, iter, page, pg_pos, pg_count);
			if (IS_ERR_OR_NULL(page)) {
				for (j = i + 1; j < nr_got; j++)
					put_page(pages[j]);
				nr_got = i;
				err = PTR_ERR_OR_ZERO(page);
				break;
			}
		}
	}

	if (likely(nr_got))
		return nr_got;
	if (err)
		return err;
	/*
	 * No pages and no error means we raced and should retry:
	 */
	goto find_page;
}

/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
@@ -2414,104 +2439,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	size_t orig_count = iov_iter_count(iter);
	pgoff_t last_index;
	int error = 0;
	struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
	unsigned int nr_pages = min_t(unsigned int, 512,
			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
			(iocb->ki_pos >> PAGE_SHIFT));
	int i, pg_nr, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;

	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	if (nr_pages > ARRAY_SIZE(pages_onstack))
		pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);

	if (!pages) {
		pages = pages_onstack;
		nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
	}

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
		 */
	if (written && (iocb->ki_flags & IOCB_WAITQ))
		if ((iocb->ki_flags & IOCB_WAITQ) && written)
			iocb->ki_flags |= IOCB_NOWAIT;

	for (;;) {
		pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
		struct page *page;

		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		i = 0;
		pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
							     pages, nr_pages);
		if (pg_nr < 0) {
			error = pg_nr;
			break;
		}

		/*
		 * We can't return -EIOCBQUEUED once we've done some work, so
		 * ensure we don't block:
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) &&
		    (written + orig_count - iov_iter_count(iter)))
			iocb->ki_flags |= IOCB_NOWAIT;
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
			goto put_pages;

		page = find_get_page(mapping, index);
		if (!page) {
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL)) {
				page = generic_file_buffered_read_no_cached_page(iocb, iter);
				if (!page)
					goto find_page;
				if (IS_ERR(page)) {
					error = PTR_ERR(page);
					goto out;
				}
			}
		}
		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_NOWAIT) {
				put_page(page);
				error = -EAGAIN;
				goto out;
			}
			page = generic_file_buffered_read_pagenotuptodate(iocb,
					filp, iter, page, iocb->ki_pos, iter->count);
			if (!page)
				goto find_page;
			if (IS_ERR(page)) {
				error = PTR_ERR(page);
				goto out;
			}
		}
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		error = generic_file_buffered_read_page_ok(iocb, iter, page);
		put_page(page);
		while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
		       (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
			put_page(pages[--pg_nr]);

		if (error) {
			if (error > 0)
				error = 0;
			goto out;
		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a sequential read accesses a page several times, only
		 * mark it as accessed the first time.
		 */
		if (iocb->ki_pos >> PAGE_SHIFT !=
		    ra->prev_pos >> PAGE_SHIFT)
			mark_page_accessed(pages[0]);
		for (i = 1; i < pg_nr; i++)
			mark_page_accessed(pages[i]);

		for (i = 0; i < pg_nr; i++) {
			unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
			unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
						   PAGE_SIZE - offset);
			unsigned int copied;

			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_page(pages[i]);

			copied = copy_page_to_iter(pages[i], offset, bytes, iter);

			written += copied;
			iocb->ki_pos += copied;
			ra->prev_pos = iocb->ki_pos;

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_pages:
		for (i = 0; i < pg_nr; i++)
			put_page(pages[i]);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

would_block:
	error = -EAGAIN;
out:
	file_accessed(filp);
	written += orig_count - iov_iter_count(iter);

	if (pages != pages_onstack)
		kfree(pages);

	return written ? written : error;
}