Loading fs/dax.c +8 −5 Original line number Diff line number Diff line Loading @@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; Loading Loading @@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; struct iomap iomap = { 0 }; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; Loading Loading @@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); if (iomap_errp) *iomap_errp = error; if (error) { Loading Loading @@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; pgoff_t max_pgoff; void *entry; loff_t pos; Loading Loading @@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * to look up our filesystem block. */ pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); if (error) goto unlock_entry; Loading fs/ext2/inode.c +1 −1 Original line number Diff line number Diff line Loading @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) unsigned flags, struct iomap *iomap, struct iomap *srcmap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; Loading fs/ext4/ext4.h +1 −3 Original line number Diff line number Diff line Loading @@ -1584,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ Loading Loading @@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, Loading Loading @@ -3391,6 +3388,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { Loading fs/ext4/extents.c +2 −9 Original line number Diff line number Diff line Loading @@ -1765,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; /* * The check for IO to unwritten extent is somewhat racy as we * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after * dropping i_data_sem. But reserved blocks should save us in that * case. */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) Loading fs/ext4/file.c +338 −74 Original line number Diff line number Diff line Loading @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" static bool ext4_dio_supported(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) return false; if (fsverity_active(inode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; return true; } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_dio_supported(inode)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, is_sync_kiocb(iocb)); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) Loading Loading @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(file_inode(iocb->ki_filp))) if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } Loading Loading @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they Loading Loading @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. Loading @@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } ret = file_modified(iocb->ki_filp); if (ret) return ret; return iov_iter_count(from); } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; return -EOPNOTSUPP; inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = file_remove_privs(iocb->ki_filp); if (ret) goto out; ret = file_update_time(iocb->ki_filp); if (ret) goto out; ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); current->backing_dev_info = NULL; out: inode_unlock(inode); if (ret > 0) if (likely(ret > 0)) { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); } return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, size_t count) { handle_t *handle; bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But, the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. */ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); if (offset + count <= EXT4_I(inode)->i_disksize) { /* * We need to ensure that the inode is removed from the orphan * list if it has been added prematurely, due to writeback of * delalloc blocks. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ext4_orphan_del(NULL, inode); return PTR_ERR(handle); } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } return written; } if (written < 0) goto truncate; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { written = PTR_ERR(handle); goto truncate; } if (ext4_update_inode_size(inode, offset + written)) ext4_mark_inode_dirty(handle, inode); /* * We may need to truncate allocated but not written blocks beyond EOF. */ written_blk = ALIGN(offset + written, 1 << blkbits); end_blk = ALIGN(offset + count, 1 << blkbits); if (written_blk < end_blk && ext4_can_truncate(inode)) truncate = true; /* * Remove the inode from the orphan list if it has been extended and * everything went OK. */ if (!truncate && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); if (truncate) { truncate: ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return written; } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t offset = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; if (error) return error; if (size && flags & IOMAP_DIO_UNWRITTEN) return ext4_convert_unwritten_extents(NULL, inode, offset, size); return 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); bool extend = false, overwrite = false, unaligned_aio = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } if (!ext4_dio_supported(inode)) { inode_unlock(inode); /* * Fallback to buffered I/O if the inode does not support * direct I/O. */ return ext4_buffered_write_iter(iocb, from); } ret = ext4_write_checks(iocb, from); if (ret <= 0) { inode_unlock(inode); return ret; } /* * Unaligned asynchronous direct I/O must be serialized among each * other as the zeroing of partial blocks of two competing unaligned * asynchronous direct I/O writes can result in data corruption. */ offset = iocb->ki_pos; count = iov_iter_count(from); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { unaligned_aio = true; inode_dio_wait(inode); } /* * Determine whether the I/O will overwrite allocated and initialized * blocks. If so, check to see whether it is possible to take the * dioread_nolock path. */ if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && ext4_should_dioread_nolock(inode)) { overwrite = true; downgrade_write(&inode->i_rwsem); } if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, is_sync_kiocb(iocb) || unaligned_aio || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: if (overwrite) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) Loading @@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; /* * Unaligned direct AIO must be serialized among each other as zeroing * of partial blocks of two competing unaligned AIOs can result in data * corruption. */ if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, iocb->ki_pos)) { unaligned_aio = 1; ext4_unwritten_wait(inode); } iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ if (o_direct && !unaligned_aio) { if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { if (ext4_should_dioread_nolock(inode)) overwrite = 1; } else if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } } ret = __generic_file_write_iter(iocb, from); /* * Unaligned direct AIO must be the only IO in flight. Otherwise * overlapping aligned IO after unaligned might result in data * corruption. */ if (ret == -EIOCBQUEUED && unaligned_aio) ext4_unwritten_wait(inode); inode_unlock(inode); ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } if (ret > 0) ret = generic_write_sync(iocb, ret); extend = true; ext4_journal_stop(handle); } return ret; ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, Loading Loading @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } Loading Loading
fs/dax.c +8 −5 Original line number Diff line number Diff line Loading @@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; Loading Loading @@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; struct iomap iomap = { 0 }; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; Loading Loading @@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); if (iomap_errp) *iomap_errp = error; if (error) { Loading Loading @@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; pgoff_t max_pgoff; void *entry; loff_t pos; Loading Loading @@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * to look up our filesystem block. */ pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); if (error) goto unlock_entry; Loading
fs/ext2/inode.c +1 −1 Original line number Diff line number Diff line Loading @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) unsigned flags, struct iomap *iomap, struct iomap *srcmap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; Loading
fs/ext4/ext4.h +1 −3 Original line number Diff line number Diff line Loading @@ -1584,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ Loading Loading @@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, Loading Loading @@ -3391,6 +3388,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { Loading
fs/ext4/extents.c +2 −9 Original line number Diff line number Diff line Loading @@ -1765,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; /* * The check for IO to unwritten extent is somewhat racy as we * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after * dropping i_data_sem. But reserved blocks should save us in that * case. */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) Loading
fs/ext4/file.c +338 −74 Original line number Diff line number Diff line Loading @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" static bool ext4_dio_supported(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) return false; if (fsverity_active(inode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; return true; } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_dio_supported(inode)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, is_sync_kiocb(iocb)); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) Loading Loading @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(file_inode(iocb->ki_filp))) if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } Loading Loading @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they Loading Loading @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. Loading @@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } ret = file_modified(iocb->ki_filp); if (ret) return ret; return iov_iter_count(from); } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; return -EOPNOTSUPP; inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = file_remove_privs(iocb->ki_filp); if (ret) goto out; ret = file_update_time(iocb->ki_filp); if (ret) goto out; ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); current->backing_dev_info = NULL; out: inode_unlock(inode); if (ret > 0) if (likely(ret > 0)) { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); } return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, size_t count) { handle_t *handle; bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But, the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. */ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); if (offset + count <= EXT4_I(inode)->i_disksize) { /* * We need to ensure that the inode is removed from the orphan * list if it has been added prematurely, due to writeback of * delalloc blocks. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ext4_orphan_del(NULL, inode); return PTR_ERR(handle); } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } return written; } if (written < 0) goto truncate; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { written = PTR_ERR(handle); goto truncate; } if (ext4_update_inode_size(inode, offset + written)) ext4_mark_inode_dirty(handle, inode); /* * We may need to truncate allocated but not written blocks beyond EOF. */ written_blk = ALIGN(offset + written, 1 << blkbits); end_blk = ALIGN(offset + count, 1 << blkbits); if (written_blk < end_blk && ext4_can_truncate(inode)) truncate = true; /* * Remove the inode from the orphan list if it has been extended and * everything went OK. */ if (!truncate && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); if (truncate) { truncate: ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return written; } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t offset = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; if (error) return error; if (size && flags & IOMAP_DIO_UNWRITTEN) return ext4_convert_unwritten_extents(NULL, inode, offset, size); return 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); bool extend = false, overwrite = false, unaligned_aio = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } if (!ext4_dio_supported(inode)) { inode_unlock(inode); /* * Fallback to buffered I/O if the inode does not support * direct I/O. */ return ext4_buffered_write_iter(iocb, from); } ret = ext4_write_checks(iocb, from); if (ret <= 0) { inode_unlock(inode); return ret; } /* * Unaligned asynchronous direct I/O must be serialized among each * other as the zeroing of partial blocks of two competing unaligned * asynchronous direct I/O writes can result in data corruption. */ offset = iocb->ki_pos; count = iov_iter_count(from); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { unaligned_aio = true; inode_dio_wait(inode); } /* * Determine whether the I/O will overwrite allocated and initialized * blocks. If so, check to see whether it is possible to take the * dioread_nolock path. */ if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && ext4_should_dioread_nolock(inode)) { overwrite = true; downgrade_write(&inode->i_rwsem); } if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, is_sync_kiocb(iocb) || unaligned_aio || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: if (overwrite) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) Loading @@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; /* * Unaligned direct AIO must be serialized among each other as zeroing * of partial blocks of two competing unaligned AIOs can result in data * corruption. */ if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, iocb->ki_pos)) { unaligned_aio = 1; ext4_unwritten_wait(inode); } iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ if (o_direct && !unaligned_aio) { if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { if (ext4_should_dioread_nolock(inode)) overwrite = 1; } else if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } } ret = __generic_file_write_iter(iocb, from); /* * Unaligned direct AIO must be the only IO in flight. Otherwise * overlapping aligned IO after unaligned might result in data * corruption. */ if (ret == -EIOCBQUEUED && unaligned_aio) ext4_unwritten_wait(inode); inode_unlock(inode); ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } if (ret > 0) ret = generic_write_sync(iocb, ret); extend = true; ext4_journal_stop(handle); } return ret; ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, Loading Loading @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } Loading