Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 (9f67672a) · Commits · EulixOS / Software / Kernel

Documentation/filesystems/ext4/directory.rst

+27 −0

Original line number	Diff line number	Diff line
		@@ -121,6 +121,31 @@ The directory file type is one of the following values:
		* - 0x7
		- Symbolic link.

		To support directories that are both encrypted and casefolded, we
		must also include hash information in the directory entry. We append
		``struct ext4_dir_entry_hash`` to ``ext4_dir_entry_2`` except for the entries
		for dot and dotdot, which are kept the same. The structure follows
		``name`` at the next 4-byte-aligned offset and is included in the size
		listed by ``rec_len``. If a directory entry uses this extension, it may be
		up to 271 bytes.

		.. list-table::
		:widths: 8 8 24 40
		:header-rows: 1

		* - Offset
		- Size
		- Name
		- Description
		* - 0x0
		- \_\_le32
		- hash
		- The hash of the directory name
		* - 0x4
		- \_\_le32
		- minor\_hash
		- The minor hash of the directory name


		In order to add checksums to these classic directory blocks, a phony
		``struct ext4_dir_entry`` is placed at the end of each leaf block to
		hold the checksum. The directory entry is 12 bytes long. The inode
		@@ -322,6 +347,8 @@ The directory hash is one of the following values:
		- Half MD4, unsigned.
		* - 0x5
		- Tea, unsigned.
		* - 0x6
		- Siphash.

		Interior nodes of an htree are recorded as ``struct dx_node``, which is
		also the full length of a data block:

fs/ext4/dir.c

+30 −11

Original line number	Diff line number	Diff line
		@@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode)
		return 0;
		}

		/*
		 * Return true if @de is a "fake" entry: ".", "..", or the checksum
		 * entry (file_type == EXT4_FT_DIR_CSUM).  __ext4_check_dir_entry() uses
		 * the result to size such entries without the per-directory hash
		 * extension.
		 *
		 * Only the first name_len bytes of de->name are examined, so the
		 * answer does not depend on the padding that follows a one-byte name.
		 * An entry with name_len == 0 is fake only if it is the checksum entry.
		 */
		static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
		{
			/* "." */
			if (de->name_len == 1 && de->name[0] == '.')
				return true;
			/* ".." */
			if (de->name_len == 2 && de->name[0] == '.' &&
			    de->name[1] == '.')
				return true;
			/* Check if this is a csum entry */
			return de->file_type == EXT4_FT_DIR_CSUM;
		}

		/*
		* Return 0 if the directory entry is OK, and 1 if there is a problem
		*
		@@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
		const int rlen = ext4_rec_len_from_disk(de->rec_len,
		dir->i_sb->s_blocksize);
		const int next_offset = ((char *) de - buf) + rlen;
		bool fake = is_fake_dir_entry(de);
		bool has_csum = ext4_has_metadata_csum(dir->i_sb);

		if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
		if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
		error_msg = "rec_len is smaller than minimal";
		else if (unlikely(rlen % 4 != 0))
		error_msg = "rec_len % 4 != 0";
		else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
		else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
		fake ? NULL : dir)))
		error_msg = "rec_len is too small for name_len";
		else if (unlikely(next_offset > size))
		error_msg = "directory entry overrun";
		else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
		else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
		has_csum ? NULL : dir) &&
		next_offset != size))
		error_msg = "directory entry too close to block end";
		else if (unlikely(le32_to_cpu(de->inode) >
		@@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
		if (filp)
		ext4_error_file(filp, function, line, bh->b_blocknr,
		"bad entry in directory: %s - offset=%u, "
		"inode=%u, rec_len=%d, name_len=%d, size=%d",
		"inode=%u, rec_len=%d, size=%d fake=%d",
		error_msg, offset, le32_to_cpu(de->inode),
		rlen, de->name_len, size);
		rlen, size, fake);
		else
		ext4_error_inode(dir, function, line, bh->b_blocknr,
		"bad entry in directory: %s - offset=%u, "
		"inode=%u, rec_len=%d, name_len=%d, size=%d",
		"inode=%u, rec_len=%d, size=%d fake=%d",
		error_msg, offset, le32_to_cpu(de->inode),
		rlen, de->name_len, size);
		rlen, size, fake);

		return 1;
		}
		@@ -124,9 +140,9 @@ static int ext4_readdir(struct file file, struct dir_context ctx)

		if (is_dx_dir(inode)) {
		err = ext4_dx_readdir(file, ctx);
		if (err != ERR_BAD_DX_DIR) {
		if (err != ERR_BAD_DX_DIR)
		return err;
		}

		/* Can we just clear INDEX flag to ignore htree information? */
		if (!ext4_has_metadata_csum(sb)) {
		/*
		@@ -224,7 +240,8 @@ static int ext4_readdir(struct file file, struct dir_context ctx)
		* failure will be detected in the
		* dirent test below. */
		if (ext4_rec_len_from_disk(de->rec_len,
		sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
		sb->s_blocksize) < ext4_dir_rec_len(1,
		inode))
		break;
		i += ext4_rec_len_from_disk(de->rec_len,
		sb->s_blocksize);
		@@ -265,7 +282,9 @@ static int ext4_readdir(struct file file, struct dir_context ctx)

		/* Directory is encrypted */
		err = fscrypt_fname_disk_to_usr(inode,
		0, 0, &de_name, &fstr);
		EXT4_DIRENT_HASH(de),
		EXT4_DIRENT_MINOR_HASH(de),
		&de_name, &fstr);
		de_name = fstr;
		fstr.len = save_len;
		if (err)

fs/ext4/ext4.h

+85 −22

Original line number	Diff line number	Diff line
		@@ -162,7 +162,12 @@ enum SHIFT_DIRECTION {
		#define EXT4_MB_USE_RESERVED 0x2000
		/* Do strict check for free blocks while retrying block allocation */
		#define EXT4_MB_STRICT_CHECK 0x4000

		/* Large fragment size list lookup succeeded at least once for cr = 0 */
		#define EXT4_MB_CR0_OPTIMIZED 0x8000
		/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
		#define EXT4_MB_CR1_OPTIMIZED 0x00010000
		/* Perform linear traversal for one group */
		#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
		struct ext4_allocation_request {
		/* target inode for block we're allocating */
		struct inode *inode;
		@@ -1213,7 +1218,7 @@ struct ext4_inode_info {
		#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
		#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
		#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
		#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
		#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
		#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
		#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
		#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
		@@ -1238,7 +1243,9 @@ struct ext4_inode_info {
		#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
		#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
		#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */

		#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
		* scanning in mballoc
		*/

		#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
		~EXT4_MOUNT_##opt
		@@ -1519,9 +1526,14 @@ struct ext4_sb_info {
		unsigned int s_mb_free_pending;
		struct list_head s_freed_data_list; /* List of blocks to be freed
		after commit completed */
		struct rb_root s_mb_avg_fragment_size_root;
		rwlock_t s_mb_rb_lock;
		struct list_head *s_mb_largest_free_orders;
		rwlock_t *s_mb_largest_free_orders_locks;

		/* tunables */
		unsigned long s_stripe;
		unsigned int s_mb_max_linear_groups;
		unsigned int s_mb_stream_request;
		unsigned int s_mb_max_to_scan;
		unsigned int s_mb_min_to_scan;
		@@ -1541,12 +1553,17 @@ struct ext4_sb_info {
		atomic_t s_bal_success; /* we found long enough chunks */
		atomic_t s_bal_allocated; /* in blocks */
		atomic_t s_bal_ex_scanned; /* total extents scanned */
		atomic_t s_bal_groups_scanned; /* number of groups scanned */
		atomic_t s_bal_goals; /* goal hits */
		atomic_t s_bal_breaks; /* too long searches */
		atomic_t s_bal_2orders; /* 2^order hits */
		spinlock_t s_bal_lock;
		unsigned long s_mb_buddies_generated;
		unsigned long long s_mb_generation_time;
		atomic_t s_bal_cr0_bad_suggestions;
		atomic_t s_bal_cr1_bad_suggestions;
		atomic64_t s_bal_cX_groups_considered[4];
		atomic64_t s_bal_cX_hits[4];
		atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
		atomic_t s_mb_buddies_generated; /* number of buddies generated */
		atomic64_t s_mb_generation_time;
		atomic_t s_mb_lost_chunks;
		atomic_t s_mb_preallocated;
		atomic_t s_mb_discarded;
		@@ -2187,6 +2204,17 @@ struct ext4_dir_entry {
		char name[EXT4_NAME_LEN]; /* File name */
		};


		/*
		 * Encrypted Casefolded entries require saving the hash on disk. This
		 * structure follows ext4_dir_entry_2's name[name_len] at the next
		 * 4 byte aligned boundary.
		 *
		 * Both fields are stored little-endian (__le32).
		 */
		struct ext4_dir_entry_hash {
		__le32 hash;		/* hash of the entry's name */
		__le32 minor_hash;	/* minor hash of the entry's name */
		};

		/*
		* The new version of the directory entry. Since EXT4 structures are
		* stored in intel byte order, and the name_len field could never be
		@@ -2201,6 +2229,22 @@ struct ext4_dir_entry_2 {
		char name[EXT4_NAME_LEN]; /* File name */
		};

		/*
		 * Access the hashes at the end of ext4_dir_entry_2
		 *
		 * EXT4_DIRENT_HASHES(entry) yields a pointer to the
		 * struct ext4_dir_entry_hash located after the entry's name, at
		 * offset 8 + name_len rounded up to a multiple of EXT4_DIR_PAD from
		 * the start of the entry.  EXT4_DIRENT_HASH() and
		 * EXT4_DIRENT_MINOR_HASH() read the two fields in CPU byte order.
		 *
		 * @entry is evaluated more than once; do not pass an expression with
		 * side effects.  The macros do not check that the entry actually
		 * carries the hash structure.
		 */
		#define EXT4_DIRENT_HASHES(entry) \
			((struct ext4_dir_entry_hash *) \
				(((void *)(entry)) + \
				((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
		#define EXT4_DIRENT_HASH(entry) \
			le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
		#define EXT4_DIRENT_MINOR_HASH(entry) \
			le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)

		/*
		 * Tell whether directory entries of @inode carry the on-disk hash
		 * structure: true only when the inode is both casefolded and
		 * encrypted.
		 */
		static inline bool ext4_hash_in_dirent(const struct inode *inode)
		{
			if (!IS_CASEFOLDED(inode))
				return false;
			if (!IS_ENCRYPTED(inode))
				return false;
			return true;
		}

		/*
		* This is a bogus directory entry at the end of each leaf block that
		* records checksums.
		@@ -2242,10 +2286,24 @@ struct ext4_dir_entry_tail {
		*/
		#define EXT4_DIR_PAD 4
		#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
		#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
		~EXT4_DIR_ROUND)
		#define EXT4_MAX_REC_LEN ((1<<16)-1)

		/*
		 * Compute the record length needed for a directory entry whose name
		 * is @name_len bytes long.
		 *
		 * The length is 8 fixed bytes plus the name, rounded up to a multiple
		 * of EXT4_DIR_PAD.  Directories that are casefolded and encrypted
		 * store the hash as well, so room for struct ext4_dir_entry_hash is
		 * included for them.  For all entries related to '.' or '..' pass
		 * NULL for @dir, as those entries do not use the extra fields.
		 */
		static inline unsigned int ext4_dir_rec_len(__u8 name_len,
							    const struct inode *dir)
		{
			unsigned int hash_room = 0;

			if (dir && ext4_hash_in_dirent(dir))
				hash_room = sizeof(struct ext4_dir_entry_hash);

			return (8 + name_len + hash_room + EXT4_DIR_ROUND) &
				~EXT4_DIR_ROUND;
		}

		/*
		* If we ever get support for fs block sizes > page_size, we'll need
		* to remove the #if statements in the next two functions...
		@@ -2302,6 +2360,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
		#define DX_HASH_LEGACY_UNSIGNED 3
		#define DX_HASH_HALF_MD4_UNSIGNED 4
		#define DX_HASH_TEA_UNSIGNED 5
		#define DX_HASH_SIPHASH 6

		static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
		const void *address, unsigned int length)
		@@ -2356,6 +2415,7 @@ struct ext4_filename {
		};

		#define fname_name(p) ((p)->disk_name.name)
		#define fname_usr_name(p) ((p)->usr_fname->name)
		#define fname_len(p) ((p)->disk_name.len)

		/*
		@@ -2586,9 +2646,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
		ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

		#ifdef CONFIG_UNICODE
		extern void ext4_fname_setup_ci_filename(struct inode *dir,
		extern int ext4_fname_setup_ci_filename(struct inode *dir,
		const struct qstr *iname,
		struct fscrypt_str *fname);
		struct ext4_filename *fname);
		#endif

		#ifdef CONFIG_FS_ENCRYPTION
		@@ -2619,9 +2679,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
		ext4_fname_from_fscrypt_name(fname, &name);

		#ifdef CONFIG_UNICODE
		ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
		err = ext4_fname_setup_ci_filename(dir, iname, fname);
		#endif
		return 0;
		return err;
		}

		static inline int ext4_fname_prepare_lookup(struct inode *dir,
		@@ -2638,9 +2698,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
		ext4_fname_from_fscrypt_name(fname, &name);

		#ifdef CONFIG_UNICODE
		ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
		err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
		#endif
		return 0;
		return err;
		}

		static inline void ext4_fname_free_filename(struct ext4_filename *fname)
		@@ -2665,15 +2725,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
		int lookup,
		struct ext4_filename *fname)
		{
		int err = 0;
		fname->usr_fname = iname;
		fname->disk_name.name = (unsigned char *) iname->name;
		fname->disk_name.len = iname->len;

		#ifdef CONFIG_UNICODE
		ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
		err = ext4_fname_setup_ci_filename(dir, iname, fname);
		#endif

		return 0;
		return err;
		}

		static inline int ext4_fname_prepare_lookup(struct inode *dir,
		@@ -2711,7 +2772,7 @@ extern int ext4_find_dest_de(struct inode dir, struct inode inode,
		void *buf, int buf_size,
		struct ext4_filename *fname,
		struct ext4_dir_entry_2 **dest_de);
		void ext4_insert_dentry(struct inode *inode,
		/*
		 * NOTE(review): presumably fills @de (inside a buffer of @buf_size
		 * bytes) with @fname for @inode in directory @dir; confirm against
		 * the definition, which is not visible here.
		 */
		void ext4_insert_dentry(struct inode *dir, struct inode *inode,
		struct ext4_dir_entry_2 *de,
		int buf_size,
		struct ext4_filename *fname);
		@@ -2802,8 +2863,10 @@ int __init ext4_fc_init_dentry_cache(void);

		/* mballoc.c */
		extern const struct seq_operations ext4_mb_seq_groups_ops;
		extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
		extern long ext4_mb_stats;
		extern long ext4_mb_max_to_scan;
		/* NOTE(review): by its name a seq_file show routine for mballoc stats; confirm in mballoc.c */
		extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
		extern int ext4_mb_init(struct super_block *);
		extern int ext4_mb_release(struct super_block *);
		extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
		@@ -3306,11 +3369,14 @@ struct ext4_group_info {
		ext4_grpblk_t bb_free; /* total free blocks */
		ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
		ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
		ext4_group_t bb_group; /* Group number */
		struct list_head bb_prealloc_list;
		#ifdef DOUBLE_CHECK
		void *bb_bitmap;
		#endif
		struct rw_semaphore alloc_sem;
		struct rb_node bb_avg_fragment_size_rb;
		struct list_head bb_largest_free_order_node;
		ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
		* regions, index is order.
		* bb_counters[3] = 5 means
		@@ -3513,9 +3579,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
		unsigned int blocksize);
		/* Takes the journal handle, the directory inode and the block's buffer head, all by pointer. */
		extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
		struct buffer_head *bh);
		extern int ext4_ci_compare(const struct inode *parent,
		const struct qstr *fname,
		const struct qstr *entry, bool quick);
		/*
		 * NOTE(review): presumably removes the entry named @d_name for
		 * @inode from directory @dir under journal handle @handle; confirm
		 * against the definition, which is not visible here.
		 */
		extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
		struct inode *inode);
		extern int __ext4_link(struct inode dir, struct inode inode,

fs/ext4/fast_commit.c

+5 −3

Original line number	Diff line number	Diff line
		@@ -66,7 +66,7 @@
		* Fast Commit Ineligibility
		* -------------------------
		* Not all operations are supported by fast commits today (e.g extended
		* attributes). Fast commit ineligiblity is marked by calling one of the
		* attributes). Fast commit ineligibility is marked by calling one of the
		* two following functions:
		*
		* - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
		@@ -1088,9 +1088,11 @@ static int ext4_fc_perform_commit(journal_t *journal)
		head.fc_tid = cpu_to_le32(
		sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
		(u8 *)&head, &crc))
		(u8 *)&head, &crc)) {
		ret = -ENOSPC;
		goto out;
		}
		}

		spin_lock(&sbi->s_fc_lock);
		ret = ext4_fc_commit_dentry_updates(journal, &crc);
		@@ -1734,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
		jbd_debug(1, "Converting from %ld to %d %lld",
		map.m_flags & EXT4_MAP_UNWRITTEN,
		ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,

fs/ext4/file.c

+21 −4

Original line number	Diff line number	Diff line
		@@ -371,15 +371,32 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
		static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
		int error, unsigned int flags)
		{
		loff_t offset = iocb->ki_pos;
		loff_t pos = iocb->ki_pos;
		struct inode *inode = file_inode(iocb->ki_filp);

		if (error)
		return error;

		if (size && flags & IOMAP_DIO_UNWRITTEN)
		return ext4_convert_unwritten_extents(NULL, inode,
		offset, size);
		if (size && flags & IOMAP_DIO_UNWRITTEN) {
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
		if (error < 0)
		return error;
		}
		/*
		* If we are extending the file, we have to update i_size here before
		* page cache gets invalidated in iomap_dio_rw(). Otherwise racing
		* buffered reads could zero out too much from page cache pages. Update
		* of on-disk size will happen later in ext4_dio_write_iter() where
		* we have enough information to also perform orphan list handling etc.
		* Note that we perform all extending writes synchronously under
		* i_rwsem held exclusively so i_size update is safe here in that case.
		* If the write was not extending, we cannot see pos > i_size here
		* because operations reducing i_size like truncate wait for all
		* outstanding DIO before updating i_size.
		*/
		pos += size;
		if (pos > i_size_read(inode))
		i_size_write(inode, pos);

		return 0;
		}