Commit 9ccce092 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull orangefs updates from Mike Marshall:
 "orangefs: implement orangefs_readahead

  mm/readahead.c/read_pages was quite a bit different back when I put my
  open-coded readahead logic into orangefs_readpage. That logic seemed
  to work as designed back then, but it is a trainwreck now.

  This implements orangefs_readahead using the new xarray and
  readahead_expand features and removes all my open-coded readahead
  logic.

  This results in an extreme read performance improvement, these sample
  numbers are from my test VM:

  Here's an example of what's upstream in
  5.11.8-200.fc33.x86_64:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s

  And here's this version of orangefs_readahead on top of 5.12.0-rc4:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s

  There are four xfstest regressions with this patch. David Howells and
  Matthew Wilcox have been helping me work with this code"

* tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux:
  orangefs: leave files in the page cache for a few microseconds at least
  orangefs: implement orangefs_readahead.
parents 27787ba3 211f9f2e
Loading
Loading
Loading
Loading
+6 −28
Original line number Diff line number Diff line
@@ -248,20 +248,6 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
		 *       or it can pointers to struct page's
		 */

		/*
		 * When reading, readahead_size will only be zero when
		 * we're doing O_DIRECT, otherwise we got here from
		 * orangefs_readpage.
		 *
		 * If we got here from orangefs_readpage we want to
		 * copy either a page or the whole file into the io
		 * vector, whichever is smaller.
		 */
		if (readahead_size)
			copy_amount =
				min(new_op->downcall.resp.io.amt_complete,
					(__s64)PAGE_SIZE);
		else
		copy_amount = new_op->downcall.resp.io.amt_complete;

		ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
@@ -283,19 +269,11 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,

out:
	if (buffer_index >= 0) {
		if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
			/* readpage */
			*index_return = buffer_index;
			gossip_debug(GOSSIP_FILE_DEBUG,
				"%s: hold on to buffer_index :%d:\n",
				__func__, buffer_index);
		} else {
			/* O_DIRECT */
		orangefs_bufmap_put(buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): PUT buffer_index %d\n",
			__func__, handle, buffer_index);
		}
		buffer_index = -1;
	}
	op_release(new_op);
	return ret;
+47 −75
Original line number Diff line number Diff line
@@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping,

static int orangefs_launder_page(struct page *);

static void orangefs_readahead(struct readahead_control *rac)
{
	loff_t offset;
	struct iov_iter iter;
	struct file *file = rac->file;
	struct inode *inode = file->f_mapping->host;
	struct xarray *i_pages;
	struct page *page;
	loff_t new_start = readahead_pos(rac);
	int ret;
	size_t new_len = 0;

	loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
	loff_t pages_remaining = bytes_remaining / PAGE_SIZE;

	if (pages_remaining >= 1024)
		new_len = 4194304;
	else if (pages_remaining > readahead_count(rac))
		new_len = bytes_remaining;

	if (new_len)
		readahead_expand(rac, new_start, new_len);

	offset = readahead_pos(rac);
	i_pages = &file->f_mapping->i_pages;

	iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));

	/* read in the pages. */
	if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
			&offset, &iter, readahead_length(rac),
			inode->i_size, NULL, NULL, file)) < 0)
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s: wait_for_direct_io failed. \n", __func__);
	else
		ret = 0;

	/* clean up. */
	while ((page = readahead_page(rac))) {
		page_endio(page, false, ret);
		put_page(page);
	}
}

static int orangefs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
@@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page)
	struct bio_vec bv;
	ssize_t ret;
	loff_t off; /* offset into this page */
	pgoff_t index; /* which page */
	struct page *next_page;
	char *kaddr;
	loff_t read_size;
	int buffer_index = -1; /* orangefs shared memory slot */
	int slot_index;   /* index into slot */
	int remaining;

	/*
	 * Get up to this many bytes from Orangefs at a time and try
	 * to fill them into the page cache at once. Tests with dd made
	 * this seem like a reasonable static number, if there was
	 * interest perhaps this number could be made setable through
	 * sysfs...
	 */
	read_size = 524288;

	if (PageDirty(page))
		orangefs_launder_page(page);

	off = page_offset(page);
	index = off >> PAGE_SHIFT;
	bv.bv_page = page;
	bv.bv_len = PAGE_SIZE;
	bv.bv_offset = 0;
	iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);

	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
	    read_size, inode->i_size, NULL, &buffer_index, file);
	remaining = ret;
	    PAGE_SIZE, inode->i_size, NULL, NULL, file);
	/* this will only zero remaining unread portions of the page data */
	iov_iter_zero(~0U, &iter);
	/* takes care of potential aliasing */
	flush_dcache_page(page);
	if (ret < 0) {
		SetPageError(page);
		unlock_page(page);
		goto out;
	} else {
		SetPageUptodate(page);
		if (PageError(page))
@@ -298,59 +322,6 @@ static int orangefs_readpage(struct file *file, struct page *page)
	}
	/* unlock the page after the ->readpage() routine completes */
	unlock_page(page);

	if (remaining > PAGE_SIZE) {
		slot_index = 0;
		while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
			remaining -= PAGE_SIZE;
			/*
			 * It is an optimization to try and fill more than one
			 * page... by now we've already gotten the single
			 * page we were after, if stuff doesn't seem to
			 * be going our way at this point just return
			 * and hope for the best.
			 *
			 * If we look for pages and they're already there is
			 * one reason to give up, and if they're not there
			 * and we can't create them is another reason.
			 */

			index++;
			slot_index++;
			next_page = find_get_page(inode->i_mapping, index);
			if (next_page) {
				gossip_debug(GOSSIP_FILE_DEBUG,
					"%s: found next page, quitting\n",
					__func__);
				put_page(next_page);
				goto out;
			}
			next_page = find_or_create_page(inode->i_mapping,
							index,
							GFP_KERNEL);
			/*
			 * I've never hit this, leave it as a printk for
			 * now so it will be obvious.
			 */
			if (!next_page) {
				printk("%s: can't create next page, quitting\n",
					__func__);
				goto out;
			}
			kaddr = kmap_atomic(next_page);
			orangefs_bufmap_page_fill(kaddr,
						buffer_index,
						slot_index);
			kunmap_atomic(kaddr);
			SetPageUptodate(next_page);
			unlock_page(next_page);
			put_page(next_page);
		}
	}

out:
	if (buffer_index != -1)
		orangefs_bufmap_put(buffer_index);
        return ret;
}

@@ -660,6 +631,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
	.writepage = orangefs_writepage,
	.readahead = orangefs_readahead,
	.readpage = orangefs_readpage,
	.writepages = orangefs_writepages,
	.set_page_dirty = __set_page_dirty_nobuffers,
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ static ulong module_parm_debug_mask;
__u64 orangefs_gossip_debug_mask;
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
int orangefs_cache_timeout_msecs = 50;
int orangefs_cache_timeout_msecs = 500;
int orangefs_dcache_timeout_msecs = 50;
int orangefs_getattr_timeout_msecs = 50;