Commit 11ef08c9 authored by Theodore Ts'o
Browse files

Merge branch 'delalloc-buffer-write' into dev



Fix a bug in how we update i_disksize, and the error path in
inline_data_end.  Finally, drop the unnecessary creation of a journal
handle that was only needed for inline data; removing it can give us a
large performance gain in delayed allocation writes.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parents 1fd95c05 cc883236
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -3603,9 +3603,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
					   unsigned flags,
					   struct page **pagep,
					   void **fsdata);
extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
					 unsigned len, unsigned copied,
					 struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
				     struct ext4_filename *fname,
				     struct inode *dir, struct inode *inode);
+66 −65
Original line number Diff line number Diff line
@@ -733,35 +733,35 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
			       unsigned copied, struct page *page)
{
	int ret, no_expand;
	handle_t *handle = ext4_journal_current_handle();
	int no_expand;
	void *kaddr;
	struct ext4_iloc iloc;
	int ret = 0, ret2;

	if (unlikely(copied < len)) {
		if (!PageUptodate(page)) {
	if (unlikely(copied < len) && !PageUptodate(page))
		copied = 0;
			goto out;
		}
	}

	if (likely(copied)) {
		ret = ext4_get_inode_loc(inode, &iloc);
		if (ret) {
			unlock_page(page);
			put_page(page);
			ext4_std_error(inode->i_sb, ret);
		copied = 0;
			goto out;
		}

		ext4_write_lock_xattr(inode, &no_expand);
		BUG_ON(!ext4_has_inline_data(inode));

		/*
	 * ei->i_inline_off may have changed since ext4_write_begin()
	 * called ext4_try_to_write_inline_data()
		 * ei->i_inline_off may have changed since
		 * ext4_write_begin() called
		 * ext4_try_to_write_inline_data()
		 */
		(void) ext4_find_inline_data_nolock(inode);

		kaddr = kmap_atomic(page);
	ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
		ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
		kunmap_atomic(kaddr);
		SetPageUptodate(page);
		/* clear page dirty so that writepages wouldn't work for us. */
@@ -769,9 +769,47 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,

		ext4_write_unlock_xattr(inode, &no_expand);
		brelse(iloc.bh);

		/*
		 * It's important to update i_size while still holding page
		 * lock: page writeout could otherwise come in and zero
		 * beyond i_size.
		 */
		ext4_update_inode_size(inode, pos + copied);
	}
	unlock_page(page);
	put_page(page);

	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (likely(copied))
		mark_inode_dirty(inode);
out:
	return copied;
	/*
	 * If we didn't copy as much data as expected, we need to trim back
	 * size of xattr containing inline data.
	 */
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		ext4_orphan_add(handle, inode);

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}
	return ret ? ret : copied;
}

struct buffer_head *
@@ -953,43 +991,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
	return ret;
}

/*
 * Finish a delayed-allocation write that targets an inline-data inode.
 *
 * The copy itself is delegated to ext4_write_inline_data_end().  On
 * success this wrapper then grows i_size to pos + copied if the write
 * extended the file, unlocks and releases @page, and marks the inode
 * dirty.
 *
 * Returns the byte count reported by ext4_write_inline_data_end(), or
 * its negative return value on failure.  @page is unlocked and released
 * on both paths.
 */
int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
				  unsigned len, unsigned copied,
				  struct page *page)
{
	int ret;

	ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
	if (ret < 0) {
		/* Failure: drop the page here and hand the error to the caller. */
		unlock_page(page);
		put_page(page);
		return ret;
	}
	/* From here on, work with the count the helper reported. */
	copied = ret;

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos+copied > inode->i_size)
		i_size_write(inode, pos+copied);
	unlock_page(page);
	put_page(page);

	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	mark_inode_dirty(inode);

	return copied;
}

#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
			  void *inline_start, int inline_size)
+39 −111
Original line number Diff line number Diff line
@@ -1284,22 +1284,14 @@ static int ext4_write_end(struct file *file,
	loff_t old_size = inode->i_size;
	int ret = 0, ret2;
	int i_size_changed = 0;
	int inline_data = ext4_has_inline_data(inode);
	bool verity = ext4_verity_in_progress(inode);

	trace_ext4_write_end(inode, pos, len, copied);
	if (inline_data) {
		ret = ext4_write_inline_data_end(inode, pos, len,
						 copied, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto errout;
		}
		copied = ret;
	} else
		copied = block_write_end(file, mapping, pos,
					 len, copied, page, fsdata);

	if (ext4_has_inline_data(inode))
		return ext4_write_inline_data_end(inode, pos, len, copied, page);

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	/*
	 * it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
@@ -1320,7 +1312,7 @@ static int ext4_write_end(struct file *file,
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed || inline_data)
	if (i_size_changed)
		ret = ext4_mark_inode_dirty(handle, inode);

	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
@@ -1329,7 +1321,7 @@ static int ext4_write_end(struct file *file,
		 * inode->i_size. So truncate them
		 */
		ext4_orphan_add(handle, inode);
errout:

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
@@ -1395,7 +1387,6 @@ static int ext4_journalled_write_end(struct file *file,
	int partial = 0;
	unsigned from, to;
	int size_changed = 0;
	int inline_data = ext4_has_inline_data(inode);
	bool verity = ext4_verity_in_progress(inode);

	trace_ext4_journalled_write_end(inode, pos, len, copied);
@@ -1404,16 +1395,10 @@ static int ext4_journalled_write_end(struct file *file,

	BUG_ON(!ext4_handle_valid(handle));

	if (inline_data) {
		ret = ext4_write_inline_data_end(inode, pos, len,
						 copied, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto errout;
		}
		copied = ret;
	} else if (unlikely(copied < len) && !PageUptodate(page)) {
	if (ext4_has_inline_data(inode))
		return ext4_write_inline_data_end(inode, pos, len, copied, page);

	if (unlikely(copied < len) && !PageUptodate(page)) {
		copied = 0;
		ext4_journalled_zero_new_buffers(handle, inode, page, from, to);
	} else {
@@ -1436,7 +1421,7 @@ static int ext4_journalled_write_end(struct file *file,
	if (old_size < pos && !verity)
		pagecache_isize_extended(inode, old_size, pos);

	if (size_changed || inline_data) {
	if (size_changed) {
		ret2 = ext4_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
@@ -1449,7 +1434,6 @@ static int ext4_journalled_write_end(struct file *file,
		 */
		ext4_orphan_add(handle, inode);

errout:
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
@@ -2932,19 +2916,6 @@ static int ext4_nonda_switch(struct super_block *sb)
	return 0;
}

/*
 * Number of journal credits to reserve for a delalloc write of @len
 * bytes at @pos.
 *
 * We always reserve for an inode update; the superblock could be there too.
 *
 * Returns 1 when the filesystem already has the large_file feature or
 * the write ends at or below 0x7fffffff, and 2 otherwise.
 */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
	if (likely(ext4_has_feature_large_file(inode->i_sb)))
		return 1;

	/* The write does not extend past 0x7fffffff, so one credit suffices. */
	if (pos + len <= 0x7fffffffULL)
		return 1;

	/* We might need to update the superblock to set LARGE_FILE */
	return 2;
}

static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
@@ -2953,7 +2924,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
	struct page *page;
	pgoff_t index;
	struct inode *inode = mapping->host;
	handle_t *handle;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;
@@ -2979,41 +2949,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
			return 0;
	}

	/*
	 * grab_cache_page_write_begin() can take a long time if the
	 * system is thrashing due to memory pressure, or if the page
	 * is being written back.  So grab it first before we start
	 * the transaction handle.  This also allows us to allocate
	 * the page (if needed) without using GFP_NOFS.
	 */
retry_grab:
retry:
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	unlock_page(page);

	/*
	 * With delayed allocation, we don't log the i_disksize update
	 * if there is delayed block allocation. But we still need
	 * to journal the i_disksize update for writes to the end
	 * of a file which has an already mapped buffer.
	 */
retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
				ext4_da_write_credits(inode, pos, len));
	if (IS_ERR(handle)) {
		put_page(page);
		return PTR_ERR(handle);
	}

	lock_page(page);
	if (page->mapping != mapping) {
		/* The page got truncated from under us */
		unlock_page(page);
		put_page(page);
		ext4_journal_stop(handle);
		goto retry_grab;
	}
	/* In case writeback began while the page was unlocked */
	wait_for_stable_page(page);

@@ -3025,20 +2965,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
#endif
	if (ret < 0) {
		unlock_page(page);
		ext4_journal_stop(handle);
		put_page(page);
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 * i_size_read because we hold inode lock.
		 */
		if (pos + len > inode->i_size)
			ext4_truncate_failed_write(inode);

		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;

		put_page(page);
			goto retry;
		return ret;
	}

@@ -3075,8 +3013,6 @@ static int ext4_da_write_end(struct file *file,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	handle_t *handle = ext4_journal_current_handle();
	loff_t new_i_size;
	unsigned long start, end;
	int write_mode = (int)(unsigned long)fsdata;
@@ -3086,44 +3022,36 @@ static int ext4_da_write_end(struct file *file,
				      len, copied, page, fsdata);

	trace_ext4_da_write_end(inode, pos, len, copied);

	if (write_mode != CONVERT_INLINE_DATA &&
	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
	    ext4_has_inline_data(inode))
		return ext4_write_inline_data_end(inode, pos, len, copied, page);

	start = pos & (PAGE_SIZE - 1);
	end = start + copied - 1;

	/*
	 * generic_write_end() will run mark_inode_dirty() if i_size
	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
	 * into that.
	 * Since we are holding inode lock, we are sure i_disksize <=
	 * i_size. We also know that if i_disksize < i_size, there are
	 * delalloc writes pending in the range up to i_size. If the end of
	 * the current write is <= i_size, there's no need to touch
	 * i_disksize since writeback will push i_disksize up to i_size
	 * eventually. If the end of the current write is > i_size and
	 * inside an allocated block (ext4_da_should_update_i_disksize()
	 * check), we need to update i_disksize here as neither
	 * ext4_writepage() nor certain ext4_writepages() paths not
	 * allocating blocks update i_disksize.
	 *
	 * Note that we defer inode dirtying to generic_write_end() /
	 * ext4_da_write_inline_data_end().
	 */
	new_i_size = pos + copied;
	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
		if (ext4_has_inline_data(inode) ||
		    ext4_da_should_update_i_disksize(page, end)) {
	if (copied && new_i_size > inode->i_size &&
	    ext4_da_should_update_i_disksize(page, end))
		ext4_update_i_disksize(inode, new_i_size);
			/* We need to mark inode dirty even if
			 * new_i_size is less than inode->i_size
			 * but greater than i_disksize (hint: delalloc).
			 */
			ret = ext4_mark_inode_dirty(handle, inode);
		}
	}

	if (write_mode != CONVERT_INLINE_DATA &&
	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
	    ext4_has_inline_data(inode))
		ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
						     page);
	else
		ret2 = generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);

	copied = ret2;
	if (ret2 < 0)
		ret = ret2;
	ret2 = ext4_journal_stop(handle);
	if (unlikely(ret2 && !ret))
		ret = ret2;

	return ret ? ret : copied;
	return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}

/*