Commit 9f67672a authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull ext4 updates from Ted Ts'o:
 "New features for ext4 this cycle include support for encrypted
  casefold, ensure that deleted file names are cleared in directory
  blocks by zeroing directory entries when they are unlinked or moved as
  part of a hash tree node split. We also improve the block allocator's
  performance on a freshly mounted file system by prefetching block
  bitmaps.

  There are also the usual cleanups and bug fixes, including fixing a
  page cache invalidation race when there is mixed buffered and direct
  I/O and the block size is less than page size, and allow the dax flag
  to be set and cleared on inline directories"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits)
  ext4: wipe ext4_dir_entry2 upon file deletion
  ext4: Fix occasional generic/418 failure
  fs: fix reporting supported extra file attributes for statx()
  ext4: allow the dax flag to be set and cleared on inline directories
  ext4: fix debug format string warning
  ext4: fix trailing whitespace
  ext4: fix various seppling typos
  ext4: fix error return code in ext4_fc_perform_commit()
  ext4: annotate data race in jbd2_journal_dirty_metadata()
  ext4: annotate data race in start_this_handle()
  ext4: fix ext4_error_err save negative errno into superblock
  ext4: fix error code in ext4_commit_super
  ext4: always panic when errors=panic is specified
  ext4: delete redundant uptodate check for buffer
  ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()
  ext4: make prefetch_block_bitmaps default
  ext4: add proc files to monitor new structures
  ext4: improve cr 0 / cr 1 group scanning
  ext4: add MB_NUM_ORDERS macro
  ext4: add mballoc stats proc file
  ...
parents 6bab076a 6c091273
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
@@ -121,6 +121,31 @@ The directory file type is one of the following values:
   * - 0x7
     - Symbolic link.

To support directories that are both encrypted and casefolded directories, we
must also include hash information in the directory entry. We append
``ext4_extended_dir_entry_2`` to ``ext4_dir_entry_2`` except for the entries
for dot and dotdot, which are kept the same. The structure follows immediately
after ``name`` and is included in the size listed by ``rec_len`` If a directory
entry uses this extension, it may be up to 271 bytes.

.. list-table::
   :widths: 8 8 24 40
   :header-rows: 1

   * - Offset
     - Size
     - Name
     - Description
   * - 0x0
     - \_\_le32
     - hash
     - The hash of the directory name
   * - 0x4
     - \_\_le32
     - minor\_hash
     - The minor hash of the directory name


In order to add checksums to these classic directory blocks, a phony
``struct ext4_dir_entry`` is placed at the end of each leaf block to
hold the checksum. The directory entry is 12 bytes long. The inode
@@ -322,6 +347,8 @@ The directory hash is one of the following values:
     - Half MD4, unsigned.
   * - 0x5
     - Tea, unsigned.
   * - 0x6
     - Siphash.

Interior nodes of an htree are recorded as ``struct dx_node``, which is
also the full length of a data block:
+30 −11
Original line number Diff line number Diff line
@@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode)
	return 0;
}

static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
{
	/* Check if . or .. , or skip if namelen is 0 */
	if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') &&
	    (de->name[1] == '.' || de->name[1] == '\0'))
		return true;
	/* Check if this is a csum entry */
	if (de->file_type == EXT4_FT_DIR_CSUM)
		return true;
	return false;
}

/*
 * Return 0 if the directory entry is OK, and 1 if there is a problem
 *
@@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
	const int rlen = ext4_rec_len_from_disk(de->rec_len,
						dir->i_sb->s_blocksize);
	const int next_offset = ((char *) de - buf) + rlen;
	bool fake = is_fake_dir_entry(de);
	bool has_csum = ext4_has_metadata_csum(dir->i_sb);

	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
	if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
		error_msg = "rec_len is smaller than minimal";
	else if (unlikely(rlen % 4 != 0))
		error_msg = "rec_len % 4 != 0";
	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
	else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
							fake ? NULL : dir)))
		error_msg = "rec_len is too small for name_len";
	else if (unlikely(next_offset > size))
		error_msg = "directory entry overrun";
	else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
	else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
						  has_csum ? NULL : dir) &&
			  next_offset != size))
		error_msg = "directory entry too close to block end";
	else if (unlikely(le32_to_cpu(de->inode) >
@@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
	if (filp)
		ext4_error_file(filp, function, line, bh->b_blocknr,
				"bad entry in directory: %s - offset=%u, "
				"inode=%u, rec_len=%d, name_len=%d, size=%d",
				"inode=%u, rec_len=%d, size=%d fake=%d",
				error_msg, offset, le32_to_cpu(de->inode),
				rlen, de->name_len, size);
				rlen, size, fake);
	else
		ext4_error_inode(dir, function, line, bh->b_blocknr,
				"bad entry in directory: %s - offset=%u, "
				"inode=%u, rec_len=%d, name_len=%d, size=%d",
				"inode=%u, rec_len=%d, size=%d fake=%d",
				 error_msg, offset, le32_to_cpu(de->inode),
				 rlen, de->name_len, size);
				 rlen, size, fake);

	return 1;
}
@@ -124,9 +140,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)

	if (is_dx_dir(inode)) {
		err = ext4_dx_readdir(file, ctx);
		if (err != ERR_BAD_DX_DIR) {
		if (err != ERR_BAD_DX_DIR)
			return err;
		}

		/* Can we just clear INDEX flag to ignore htree information? */
		if (!ext4_has_metadata_csum(sb)) {
			/*
@@ -224,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
				 * failure will be detected in the
				 * dirent test below. */
				if (ext4_rec_len_from_disk(de->rec_len,
					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
					sb->s_blocksize) < ext4_dir_rec_len(1,
									inode))
					break;
				i += ext4_rec_len_from_disk(de->rec_len,
							    sb->s_blocksize);
@@ -265,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)

					/* Directory is encrypted */
					err = fscrypt_fname_disk_to_usr(inode,
						0, 0, &de_name, &fstr);
						EXT4_DIRENT_HASH(de),
						EXT4_DIRENT_MINOR_HASH(de),
						&de_name, &fstr);
					de_name = fstr;
					fstr.len = save_len;
					if (err)
+85 −22
Original line number Diff line number Diff line
@@ -162,7 +162,12 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_USE_RESERVED		0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK		0x4000

/* Large fragment size list lookup succeeded at least once for cr = 0 */
#define EXT4_MB_CR0_OPTIMIZED		0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
#define EXT4_MB_CR1_OPTIMIZED		0x00010000
/* Perform linear traversal for one group */
#define EXT4_MB_SEARCH_NEXT_LINEAR	0x00020000
struct ext4_allocation_request {
	/* target inode for block we're allocating */
	struct inode *inode;
@@ -1213,7 +1218,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR	0x2000000 /* Trigger WARN_ON on error */
#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
@@ -1238,7 +1243,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER		0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE		0x00000040 /* For printing options only */

#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN	0x00000080 /* Optimize group
						    * scanning in mballoc
						    */

#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
						~EXT4_MOUNT_##opt
@@ -1519,9 +1526,14 @@ struct ext4_sb_info {
	unsigned int s_mb_free_pending;
	struct list_head s_freed_data_list;	/* List of blocks to be freed
						   after commit completed */
	struct rb_root s_mb_avg_fragment_size_root;
	rwlock_t s_mb_rb_lock;
	struct list_head *s_mb_largest_free_orders;
	rwlock_t *s_mb_largest_free_orders_locks;

	/* tunables */
	unsigned long s_stripe;
	unsigned int s_mb_max_linear_groups;
	unsigned int s_mb_stream_request;
	unsigned int s_mb_max_to_scan;
	unsigned int s_mb_min_to_scan;
@@ -1541,12 +1553,17 @@ struct ext4_sb_info {
	atomic_t s_bal_success;	/* we found long enough chunks */
	atomic_t s_bal_allocated;	/* in blocks */
	atomic_t s_bal_ex_scanned;	/* total extents scanned */
	atomic_t s_bal_groups_scanned;	/* number of groups scanned */
	atomic_t s_bal_goals;	/* goal hits */
	atomic_t s_bal_breaks;	/* too long searches */
	atomic_t s_bal_2orders;	/* 2^order hits */
	spinlock_t s_bal_lock;
	unsigned long s_mb_buddies_generated;
	unsigned long long s_mb_generation_time;
	atomic_t s_bal_cr0_bad_suggestions;
	atomic_t s_bal_cr1_bad_suggestions;
	atomic64_t s_bal_cX_groups_considered[4];
	atomic64_t s_bal_cX_hits[4];
	atomic64_t s_bal_cX_failed[4];		/* cX loop didn't find blocks */
	atomic_t s_mb_buddies_generated;	/* number of buddies generated */
	atomic64_t s_mb_generation_time;
	atomic_t s_mb_lost_chunks;
	atomic_t s_mb_preallocated;
	atomic_t s_mb_discarded;
@@ -2187,6 +2204,17 @@ struct ext4_dir_entry {
	char	name[EXT4_NAME_LEN];	/* File name */
};


/*
 * Encrypted Casefolded entries require saving the hash on disk. This structure
 * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
 * boundary.
 */
struct ext4_dir_entry_hash {
	__le32 hash;
	__le32 minor_hash;
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in intel byte order, and the name_len field could never be
@@ -2201,6 +2229,22 @@ struct ext4_dir_entry_2 {
	char	name[EXT4_NAME_LEN];	/* File name */
};

/*
 * Access the hashes at the end of ext4_dir_entry_2
 */
#define EXT4_DIRENT_HASHES(entry) \
	((struct ext4_dir_entry_hash *) \
		(((void *)(entry)) + \
		((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
		le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash)

static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
	return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode);
}

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.
@@ -2242,10 +2286,24 @@ struct ext4_dir_entry_tail {
 */
#define EXT4_DIR_PAD			4
#define EXT4_DIR_ROUND			(EXT4_DIR_PAD - 1)
#define EXT4_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT4_DIR_ROUND) & \
					 ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN		((1<<16)-1)

/*
 * The rec_len is dependent on the type of directory. Directories that are
 * casefolded and encrypted need to store the hash as well, so we add room for
 * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should
 * pass NULL for dir, as those entries do not use the extra fields.
 */
static inline unsigned int ext4_dir_rec_len(__u8 name_len,
						const struct inode *dir)
{
	int rec_len = (name_len + 8 + EXT4_DIR_ROUND);

	if (dir && ext4_hash_in_dirent(dir))
		rec_len += sizeof(struct ext4_dir_entry_hash);
	return (rec_len & ~EXT4_DIR_ROUND);
}

/*
 * If we ever get support for fs block sizes > page_size, we'll need
 * to remove the #if statements in the next two functions...
@@ -2302,6 +2360,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_LEGACY_UNSIGNED		3
#define DX_HASH_HALF_MD4_UNSIGNED	4
#define DX_HASH_TEA_UNSIGNED		5
#define DX_HASH_SIPHASH			6

static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
			      const void *address, unsigned int length)
@@ -2356,6 +2415,7 @@ struct ext4_filename {
};

#define fname_name(p) ((p)->disk_name.name)
#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p)  ((p)->disk_name.len)

/*
@@ -2586,9 +2646,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

#ifdef CONFIG_UNICODE
extern void ext4_fname_setup_ci_filename(struct inode *dir,
extern int ext4_fname_setup_ci_filename(struct inode *dir,
					 const struct qstr *iname,
					 struct fscrypt_str *fname);
					 struct ext4_filename *fname);
#endif

#ifdef CONFIG_FS_ENCRYPTION
@@ -2619,9 +2679,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
	ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
	err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
	return 0;
	return err;
}

static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2638,9 +2698,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
	ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
	ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
	err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
#endif
	return 0;
	return err;
}

static inline void ext4_fname_free_filename(struct ext4_filename *fname)
@@ -2665,15 +2725,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
					    int lookup,
					    struct ext4_filename *fname)
{
	int err = 0;
	fname->usr_fname = iname;
	fname->disk_name.name = (unsigned char *) iname->name;
	fname->disk_name.len = iname->len;

#ifdef CONFIG_UNICODE
	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
	err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif

	return 0;
	return err;
}

static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2711,7 +2772,7 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
			     void *buf, int buf_size,
			     struct ext4_filename *fname,
			     struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *inode,
void ext4_insert_dentry(struct inode *dir, struct inode *inode,
			struct ext4_dir_entry_2 *de,
			int buf_size,
			struct ext4_filename *fname);
@@ -2802,8 +2863,10 @@ int __init ext4_fc_init_dentry_cache(void);

/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
@@ -3306,11 +3369,14 @@ struct ext4_group_info {
	ext4_grpblk_t	bb_free;	/* total free blocks */
	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
	ext4_group_t	bb_group;	/* Group number */
	struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
	void            *bb_bitmap;
#endif
	struct rw_semaphore alloc_sem;
	struct rb_node	bb_avg_fragment_size_rb;
	struct list_head bb_largest_free_order_node;
	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
					 * regions, index is order.
					 * bb_counters[3] = 5 means
@@ -3513,9 +3579,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
					unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
				      struct buffer_head *bh);
extern int ext4_ci_compare(const struct inode *parent,
			   const struct qstr *fname,
			   const struct qstr *entry, bool quick);
extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
			 struct inode *inode);
extern int __ext4_link(struct inode *dir, struct inode *inode,
+5 −3
Original line number Diff line number Diff line
@@ -66,7 +66,7 @@
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligiblity is marked by calling one of the
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
@@ -1088,9 +1088,11 @@ static int ext4_fc_perform_commit(journal_t *journal)
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
@@ -1734,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
		jbd_debug(1, "Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+21 −4
Original line number Diff line number Diff line
@@ -371,15 +371,32 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	loff_t offset = iocb->ki_pos;
	loff_t pos = iocb->ki_pos;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (error)
		return error;

	if (size && flags & IOMAP_DIO_UNWRITTEN)
		return ext4_convert_unwritten_extents(NULL, inode,
						      offset, size);
	if (size && flags & IOMAP_DIO_UNWRITTEN) {
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
		if (error < 0)
			return error;
	}
	/*
	 * If we are extending the file, we have to update i_size here before
	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
	 * buffered reads could zero out too much from page cache pages. Update
	 * of on-disk size will happen later in ext4_dio_write_iter() where
	 * we have enough information to also perform orphan list handling etc.
	 * Note that we perform all extending writes synchronously under
	 * i_rwsem held exclusively so i_size update is safe here in that case.
	 * If the write was not extending, we cannot see pos > i_size here
	 * because operations reducing i_size like truncate wait for all
	 * outstanding DIO before updating i_size.
	 */
	pos += size;
	if (pos > i_size_read(inode))
		i_size_write(inode, pos);

	return 0;
}
Loading