Commit ef510682 authored by Linus Torvalds
Pull f2fs updates from Jaegeuk Kim:
 "In this cycle, f2fs has some performance improvements for Android
  workloads such as using read-unfair rwsems and adding some sysfs
  entries to control GCs and discard commands in more details. In
  addtiion, it has some tunings to improve the recovery speed after
  sudden power-cut.

  Enhancement:
   - add reader-unfair rwsems behind F2FS_UNFAIR_RWSEM; to be replaced
     with generic API support
   - adjust the readahead/recovery flow to be more efficient
   - add sysfs entries to control the issue speed of GCs and discard commands
   - enable idmapped mounts

  Bug fix:
   - correct wrong error handling routines
   - fix missing conditions in quota
   - fix a potential deadlock between writeback and block plug routines
   - fix a deadlock between freezefs and evict_inode

  We've added some boundary checks to avoid kernel panics on corrupted
  images, and several minor code clean-ups"

* tag 'f2fs-for-5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (27 commits)
  f2fs: fix to do sanity check on .cp_pack_total_block_count
  f2fs: make gc_urgent and gc_segment_mode sysfs node readable
  f2fs: use aggressive GC policy during f2fs_disable_checkpoint()
  f2fs: fix compressed file start atomic write may cause data corruption
  f2fs: initialize sbi->gc_mode explicitly
  f2fs: introduce gc_urgent_mid mode
  f2fs: compress: fix to print raw data size in error path of lz4 decompression
  f2fs: remove redundant parameter judgment
  f2fs: use spin_lock to avoid hang
  f2fs: don't get FREEZE lock in f2fs_evict_inode in frozen fs
  f2fs: remove unnecessary read for F2FS_FITS_IN_INODE
  f2fs: introduce F2FS_UNFAIR_RWSEM to support unfair rwsem
  f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes
  f2fs: fix to do sanity check on curseg->alloc_type
  f2fs: fix to avoid potential deadlock
  f2fs: quota: fix loop condition at f2fs_quota_sync()
  f2fs: Restore rwsem lockdep support
  f2fs: fix missing free nid in f2fs_handle_failed_inode
  f2fs: support idmapped mounts
  f2fs: add a way to limit roll forward recovery time
  ...
parents aab4ed58 5b5b4f85
Documentation/ABI/testing/sysfs-fs-f2fs  +47 −7
@@ -55,8 +55,9 @@ Description: Controls the in-place-update policy.
		0x04  F2FS_IPU_UTIL
		0x08  F2FS_IPU_SSR_UTIL
		0x10  F2FS_IPU_FSYNC
		0x20  F2FS_IPU_ASYNC,
		0x20  F2FS_IPU_ASYNC
		0x40  F2FS_IPU_NOCACHE
		0x80  F2FS_IPU_HONOR_OPU_WRITE
		====  =================

		Refer to segment.h for details.
@@ -98,6 +99,33 @@ Description: Controls the issue rate of discard commands that consist of small
		checkpoint is triggered, and issued during the checkpoint.
		By default, it is disabled with 0.

What:		/sys/fs/f2fs/<disk>/max_discard_request
Date:		December 2021
Contact:	"Konstantin Vyshetsky" <vkon@google.com>
Description:	Controls the number of discards a thread will issue at a time.
		A higher number allows the discard thread to finish its work
		faster, at the cost of higher latency for incoming I/O.

What:		/sys/fs/f2fs/<disk>/min_discard_issue_time
Date:		December 2021
Contact:	"Konstantin Vyshetsky" <vkon@google.com>
Description:	Controls the interval the discard thread will wait between
		issuing discard requests when there are discards to be issued and
		no I/O aware interruptions occur.

What:		/sys/fs/f2fs/<disk>/mid_discard_issue_time
Date:		December 2021
Contact:	"Konstantin Vyshetsky" <vkon@google.com>
Description:	Controls the interval the discard thread will wait between
		issuing discard requests when there are discards to be issued and
		an I/O aware interruption occurs.

What:		/sys/fs/f2fs/<disk>/max_discard_issue_time
Date:		December 2021
Contact:	"Konstantin Vyshetsky" <vkon@google.com>
Description:	Controls the interval the discard thread will wait when there are
		no discard operations to be issued.

What:		/sys/fs/f2fs/<disk>/discard_granularity
Date:		July 2017
Contact:	"Chao Yu" <yuchao0@huawei.com>
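
The four new discard tunables above work together: max_discard_request bounds
the batch size, while the three issue times choose the wait between batches
depending on whether I/O-aware interruptions occur. A minimal userspace sketch
of driving them through sysfs follows; the helper name, disk name, and values
are illustrative, not defaults from the patch:

/*
 * Illustrative only: tune the f2fs discard thread through sysfs.
 * Build with: cc -o f2fs_tune f2fs_tune.c; run as root.
 */
#include <stdio.h>

static int set_f2fs_tunable(const char *disk, const char *node, long value)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/f2fs/%s/%s", disk, node);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", value);
	return fclose(f);
}

int main(void)
{
	const char *disk = "sda1";	/* hypothetical device */

	/* larger batches retire discards faster, at some I/O latency cost */
	set_f2fs_tunable(disk, "max_discard_request", 16);
	/* waits between batches (ms): uninterrupted, interrupted, and idle */
	set_f2fs_tunable(disk, "min_discard_issue_time", 50);
	set_f2fs_tunable(disk, "mid_discard_issue_time", 500);
	set_f2fs_tunable(disk, "max_discard_issue_time", 60000);
	return 0;
}
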
@@ -269,11 +297,16 @@ Description: Shows current reserved blocks in system, it may be temporarily
What:		/sys/fs/f2fs/<disk>/gc_urgent
Date:		August 2017
Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
Description:	Do background GC aggressively when set. When gc_urgent = 1,
		background thread starts to do GC by given gc_urgent_sleep_time
		interval. When gc_urgent = 2, F2FS will lower the bar of
		checking idle in order to process outstanding discard commands
		and GC a little bit aggressively. It is set to 0 by default.
Description:	Do background GC aggressively when set. Set to 0 by default.
		gc urgent high(1): does GC forcibly in a period of given
		gc_urgent_sleep_time and ignores I/O idling check. uses greedy
		GC approach and turns SSR mode on.
		gc urgent low(2): lowers the bar of checking I/O idling in
		order to process outstanding discard commands and GC a
		little bit aggressively. uses cost benefit GC approach.
		gc urgent mid(3): does GC forcibly in a period of given
		gc_urgent_sleep_time and executes a mid level of I/O idling check.
		uses cost benefit GC approach.

What:		/sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date:		August 2017
@@ -430,6 +463,7 @@ Description: Show status of f2fs superblock in real time.
		0x800  SBI_QUOTA_SKIP_FLUSH  skip flushing quota in current CP
		0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted
		0x2000 SBI_IS_RESIZEFS       resizefs is in process
		0x4000 SBI_IS_FREEZING       freezefs is in process
		====== ===================== =================================

What:		/sys/fs/f2fs/<disk>/ckpt_thread_ioprio
@@ -503,7 +537,7 @@ Date: July 2021
Contact:	"Daeho Jeong" <daehojeong@google.com>
Description:	Show how many segments have been reclaimed by GC during a specific
		GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
		3: GC idle AT, 4: GC urgent high, 5: GC urgent low)
		3: GC idle AT, 4: GC urgent high, 5: GC urgent low, 6: GC urgent mid)
		You can re-initialize this value to "0".

What:		/sys/fs/f2fs/<disk>/gc_segment_mode
@@ -540,3 +574,9 @@ Contact: "Daeho Jeong" <daehojeong@google.com>
Description:	You can set the trial count limit for GC urgent high mode with this value.
		If GC thread gets to the limit, the mode will turn back to GC normal mode.
		By default, the value is zero, which means there is no limit like before.

What:		/sys/fs/f2fs/<disk>/max_roll_forward_node_blocks
Date:		January 2022
Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
Description:	Controls max # of node block writes to be used for roll forward
		recovery. This can limit the roll forward recovery time.
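
Since one of the commits in this pull makes gc_urgent readable, a tuning
script can write a mode and read it back to confirm. A small sketch along
the same lines (the disk name is hypothetical; 2 selects gc urgent low per
the table above):

/*
 * Illustrative only: select "gc urgent low" and read the mode back,
 * relying on the gc_urgent sysfs node being readable as of this pull.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/f2fs/sda1/gc_urgent";
	char buf[64];
	FILE *f;

	f = fopen(path, "w");
	if (!f)
		return 1;
	fputs("2\n", f);	/* 2: gc urgent low, cost-benefit GC */
	fclose(f);

	f = fopen(path, "r");
	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("gc_urgent: %s", buf);
	fclose(f);
	return 0;
}
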
fs/f2fs/Kconfig  +7 −0
@@ -143,3 +143,10 @@ config F2FS_IOSTAT
	  Support getting IO statistics through sysfs and printing out periodic
	  IO statistics tracepoint events. You have to turn on "iostat_enable"
	  sysfs node to enable this feature.

config F2FS_UNFAIR_RWSEM
	bool "F2FS unfair rw_semaphore"
	depends on F2FS_FS && BLK_CGROUP
	help
	  Use an unfair rw_semaphore when the system has configured I/O
	  priority through the block cgroup.
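
The help text is terse; the mechanism behind this option, as sketched from
the f2fs_down_read()/f2fs_up_write() call sites converted below, is a wrapper
that lets readers acquire via trylock from a waitqueue so they never queue
behind a waiting writer. A hedged outline — the field names follow the
series, but treat this as a sketch rather than the exact f2fs.h code:

#include <linux/rwsem.h>
#include <linux/wait.h>

struct f2fs_rwsem {
	struct rw_semaphore internal_rwsem;
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
	wait_queue_head_t read_waiters;
#endif
};

static inline void f2fs_down_read(struct f2fs_rwsem *sem)
{
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
	/*
	 * trylock never queues behind a waiting writer, so readers stay
	 * "unfair"; on failure, sleep until a writer releases the lock.
	 */
	wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
#else
	down_read(&sem->internal_rwsem);
#endif
}

static inline void f2fs_up_write(struct f2fs_rwsem *sem)
{
	up_write(&sem->internal_rwsem);
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
	wake_up_all(&sem->read_waiters);
#endif
}

This matches the f2fs_down_*/f2fs_up_* calls that replace plain down_*/up_*
throughout checkpoint.c and compress.c below; the stated intent is to keep
readers from stalling behind writers that block cgroups have deprioritized.
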
fs/f2fs/acl.c  +12 −9
@@ -204,7 +204,8 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
	return __f2fs_get_acl(inode, type, NULL);
}

static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
				struct inode *inode, umode_t *mode_p,
				struct posix_acl **acl)
{
	umode_t mode = inode->i_mode;
@@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
		return error;
	if (error == 0)
		*acl = NULL;
	if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
	if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
	    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
		mode &= ~S_ISGID;
	*mode_p = mode;
	return 0;
}

static int __f2fs_set_acl(struct inode *inode, int type,
static int __f2fs_set_acl(struct user_namespace *mnt_userns,
			struct inode *inode, int type,
			struct posix_acl *acl, struct page *ipage)
{
	int name_index;
@@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
	case ACL_TYPE_ACCESS:
		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
		if (acl && !ipage) {
			error = f2fs_acl_update_mode(inode, &mode, &acl);
			error = f2fs_acl_update_mode(mnt_userns, inode,
								&mode, &acl);
			if (error)
				return error;
			set_acl_inode(inode, mode);
@@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
		return -EIO;

	return __f2fs_set_acl(inode, type, acl, NULL);
	return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL);
}

/*
@@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
	f2fs_mark_inode_dirty_sync(inode, true);

	if (default_acl) {
		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
		error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
				       ipage);
		posix_acl_release(default_acl);
	} else {
@@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
	}
	if (acl) {
		if (!error)
			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
			error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
					       ipage);
		posix_acl_release(acl);
	} else {
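
The conversion above is the standard idmapped-mounts plumbing: VFS-facing
entry points receive the mount's user namespace (mnt_userns) and thread it
down to the ownership checks, while the inode-creation path passes NULL, as
seen in f2fs_init_acl() above. Restating the core check as a standalone
sketch (the helper name is mine, not from the patch):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

/*
 * Sketch: clear S_ISGID unless the caller is in the inode's group, or
 * holds CAP_FSETID, as seen through the mount's idmapping. Passing
 * &init_user_ns degenerates to the old, non-idmapped behavior.
 */
static umode_t sketch_strip_sgid(struct user_namespace *mnt_userns,
				 struct inode *inode, umode_t mode)
{
	if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
	    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
		mode &= ~S_ISGID;
	return mode;
}
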
fs/f2fs/checkpoint.c  +36 −22
@@ -98,6 +98,13 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
	}

	if (unlikely(!PageUptodate(page))) {
		/*
		 * Tolerate repeated EIO on the same meta page up to a limit,
		 * then declare a checkpoint error. A new offset resets the
		 * retry counter.
		 */
		if (page->index == sbi->metapage_eio_ofs) {
			if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO)
				set_ckpt_flags(sbi, CP_ERROR_FLAG);
		} else {
			sbi->metapage_eio_ofs = page->index;
			sbi->metapage_eio_cnt = 0;
		}
		f2fs_put_page(page, 1);
		return ERR_PTR(-EIO);
	}
@@ -282,18 +289,22 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
	return blkno - start;
}

void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
							unsigned int ra_blocks)
{
	struct page *page;
	bool readahead = false;

	if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
		return;

	page = find_get_page(META_MAPPING(sbi), index);
	if (!page || !PageUptodate(page))
		readahead = true;
	f2fs_put_page(page, 0);

	if (readahead)
		f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
		f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}

static int __f2fs_write_meta_page(struct page *page,
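
The new early return makes RECOVERY_MIN_RA_BLOCKS act as an off switch for
readahead during recovery. A plausible companion policy — sketched here as
an assumption, not quoted from the series — grows the window while the node
chain stays contiguous and shrinks it when the chain scatters:

#include <linux/bio.h>
#include <linux/minmax.h>

/* values assumed for illustration; the real constants live in f2fs.h */
#define RECOVERY_MAX_RA_BLOCKS	BIO_MAX_VECS
#define RECOVERY_MIN_RA_BLOCKS	1

/*
 * Hypothetical policy: double the readahead window on contiguous node
 * blocks, halve it on jumps. Once the window hits RECOVERY_MIN_RA_BLOCKS,
 * f2fs_ra_meta_pages_cond() above becomes a no-op and recovery reads one
 * page at a time.
 */
static unsigned int adjust_por_ra_blocks(unsigned int ra_blocks,
					 unsigned int blkaddr,
					 unsigned int next_blkaddr)
{
	if (blkaddr + 1 == next_blkaddr)
		return min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
						ra_blocks * 2);
	return max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, ra_blocks / 2);
}
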
@@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
		goto skip_write;

	/* if the lock failed, cp will flush dirty pages instead */
	if (!down_write_trylock(&sbi->cp_global_sem))
	if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
		goto skip_write;

	trace_f2fs_writepages(mapping->host, wbc, META);
	diff = nr_pages_to_write(sbi, META, wbc);
	written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
	up_write(&sbi->cp_global_sem);
	f2fs_up_write(&sbi->cp_global_sem);
	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
	return 0;

@@ -864,6 +875,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
	struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
	struct f2fs_checkpoint *cp_block = NULL;
	unsigned long long cur_version = 0, pre_version = 0;
	unsigned int cp_blocks;
	int err;

	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -871,15 +883,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
	if (err)
		return NULL;

	if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
					sbi->blocks_per_seg) {
	cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);

	if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
		f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
			  le32_to_cpu(cp_block->cp_pack_total_block_count));
		goto invalid_cp;
	}
	pre_version = *version;

	cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
	cp_addr += cp_blocks - 1;
	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
					&cp_page_2, version);
	if (err)
@@ -1159,7 +1172,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
	if (!is_journalled_quota(sbi))
		return false;

	if (!down_write_trylock(&sbi->quota_sem))
	if (!f2fs_down_write_trylock(&sbi->quota_sem))
		return true;
	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
		ret = false;
@@ -1171,7 +1184,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
	} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
		ret = true;
	}
	up_write(&sbi->quota_sem);
	f2fs_up_write(&sbi->quota_sem);
	return ret;
}

@@ -1228,10 +1241,10 @@ static int block_operations(struct f2fs_sb_info *sbi)
	 * POR: we should ensure that there are no dirty node pages
	 * until finishing nat/sit flush. inode->i_blocks can be updated.
	 */
	down_write(&sbi->node_change);
	f2fs_down_write(&sbi->node_change);

	if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
		up_write(&sbi->node_change);
		f2fs_up_write(&sbi->node_change);
		f2fs_unlock_all(sbi);
		err = f2fs_sync_inode_meta(sbi);
		if (err)
@@ -1241,15 +1254,15 @@ static int block_operations(struct f2fs_sb_info *sbi)
	}

retry_flush_nodes:
	down_write(&sbi->node_write);
	f2fs_down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		up_write(&sbi->node_write);
		f2fs_up_write(&sbi->node_write);
		atomic_inc(&sbi->wb_sync_req[NODE]);
		err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
		atomic_dec(&sbi->wb_sync_req[NODE]);
		if (err) {
			up_write(&sbi->node_change);
			f2fs_up_write(&sbi->node_change);
			f2fs_unlock_all(sbi);
			return err;
		}
@@ -1262,13 +1275,13 @@ static int block_operations(struct f2fs_sb_info *sbi)
	 * dirty node blocks and some checkpoint values by block allocation.
	 */
	__prepare_cp_block(sbi);
	up_write(&sbi->node_change);
	f2fs_up_write(&sbi->node_change);
	return err;
}

static void unblock_operations(struct f2fs_sb_info *sbi)
{
	up_write(&sbi->node_write);
	f2fs_up_write(&sbi->node_write);
	f2fs_unlock_all(sbi);
}

@@ -1543,6 +1556,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
	/* update user_block_counts */
	sbi->last_valid_block_count = sbi->total_valid_block_count;
	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
	percpu_counter_set(&sbi->rf_node_block_count, 0);

	/* Here, we have one bio having CP pack except cp pack 2 page */
	f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@@ -1612,7 +1626,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
		f2fs_warn(sbi, "Start checkpoint disabled!");
	}
	if (cpc->reason != CP_RESIZE)
		down_write(&sbi->cp_global_sem);
		f2fs_down_write(&sbi->cp_global_sem);

	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1693,7 +1707,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
	if (cpc->reason != CP_RESIZE)
		up_write(&sbi->cp_global_sem);
		f2fs_up_write(&sbi->cp_global_sem);
	return err;
}

@@ -1741,9 +1755,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
	struct cp_control cpc = { .reason = CP_SYNC, };
	int err;

	down_write(&sbi->gc_lock);
	f2fs_down_write(&sbi->gc_lock);
	err = f2fs_write_checkpoint(sbi, &cpc);
	up_write(&sbi->gc_lock);
	f2fs_up_write(&sbi->gc_lock);

	return err;
}
@@ -1831,9 +1845,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
	if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
		int ret;

		down_write(&sbi->gc_lock);
		f2fs_down_write(&sbi->gc_lock);
		ret = f2fs_write_checkpoint(sbi, &cpc);
		up_write(&sbi->gc_lock);
		f2fs_up_write(&sbi->gc_lock);

		return ret;
	}
fs/f2fs/compress.c  +5 −6
@@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
	}

	if (ret != PAGE_SIZE << dic->log_cluster_size) {
		printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
		printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
					"expected:%lu\n", KERN_ERR,
					F2FS_I_SB(dic->inode)->sb->s_id,
					dic->rlen,
					F2FS_I_SB(dic->inode)->sb->s_id, ret,
					PAGE_SIZE << dic->log_cluster_size);
		return -EIO;
	}
@@ -1267,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
		 * checkpoint. This can only happen to quota writes which can cause
		 * the below discard race condition.
		 */
		down_read(&sbi->node_write);
		f2fs_down_read(&sbi->node_write);
	} else if (!f2fs_trylock_op(sbi)) {
		goto out_free;
	}
@@ -1384,7 +1383,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,

	f2fs_put_dnode(&dn);
	if (IS_NOQUOTA(inode))
		up_read(&sbi->node_write);
		f2fs_up_read(&sbi->node_write);
	else
		f2fs_unlock_op(sbi);

@@ -1410,7 +1409,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
	f2fs_put_dnode(&dn);
out_unlock_op:
	if (IS_NOQUOTA(inode))
		up_read(&sbi->node_write);
		f2fs_up_read(&sbi->node_write);
	else
		f2fs_unlock_op(sbi);
out_free:
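
For context on the length check in lz4_decompress_pages() above: LZ4's safe
decoder returns the number of bytes it wrote (negative on malformed input),
so even a "successful" return must match the expected cluster size exactly —
which is why ret, not the constant rlen, is the useful value to log. A
userspace analogue, assuming liblz4:

#include <lz4.h>
#include <stdio.h>

/*
 * Illustrative: decompress one cluster and validate the output length,
 * mirroring the ret/expected check in lz4_decompress_pages().
 */
static int decompress_cluster(const char *src, int src_len,
			      char *dst, int expected_len)
{
	int ret = LZ4_decompress_safe(src, dst, src_len, expected_len);

	if (ret < 0)
		return -1;	/* malformed compressed stream */
	if (ret != expected_len) {
		fprintf(stderr, "lz4 invalid ret:%d, expected:%d\n",
			ret, expected_len);
		return -1;	/* truncated output: corrupted cluster */
	}
	return 0;
}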