Commit 71644dff authored by Jaegeuk Kim's avatar Jaegeuk Kim
Browse files

f2fs: add block_age-based extent cache



This patch introduces a runtime hot/cold data separation method
for f2fs, in order to improve the accuracy of data temperature
classification and reduce the garbage collection overhead after
long-term data updates.

Enhanced hot/cold data separation records the data block update
frequency as the "age" of the extent per inode, and makes use of the age
info to choose a better temperature type for data block allocation:
 - It records the total number of data blocks allocated since mount;
 - When a file extent is updated, it calculates the number of data
blocks allocated since the last update as the age of the extent;
 - Before a data block is allocated, it searches for the age info and
chooses a suitable segment for allocation.

Test and result:
 - Prepare: create about 30000 files
  * 3% for cold files (with cold file extension like .apk, from 3M to 10M)
  * 50% for warm files (with random file extension like .FcDxq, from 1K
to 4M)
  * 47% for hot files (with hot file extension like .db, from 1K to 256K)
 - create(5%)/random update(90%)/delete(5%) the files
  * total write amount is about 70G
  * fsync will be called for .db files, and buffered write will be used
for other files

The storage of test device is large enough(128G) so that it will not
switch to SSR mode during the test.

Benefit: the dirty segment count increment is reduced by about 14%
 - before: Dirty +21110
 - after:  Dirty +18286

Signed-off-by: qixiaoyu1 <qixiaoyu1@xiaomi.com>
Signed-off-by: xiongping1 <xiongping1@xiaomi.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
parent 72840ccc
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -655,3 +655,17 @@ Description: When space utilization exceeds this, do background DISCARD aggressi
		Does DISCARD forcibly in a period of given min_discard_issue_time when the number
		of discards is not 0 and set discard granularity to 1.
		Default: 80

What:		/sys/fs/f2fs/<disk>/hot_data_age_threshold
Date:		November 2022
Contact:	"Ping Xiong" <xiongping1@xiaomi.com>
Description:	When DATA SEPARATION is on, it controls the age threshold below which
		data blocks are treated as hot. By default it is initialized to
		262144 blocks (equal to 1GB).

What:		/sys/fs/f2fs/<disk>/warm_data_age_threshold
Date:		November 2022
Contact:	"Ping Xiong" <xiongping1@xiaomi.com>
Description:	When DATA SEPARATION is on, it controls the age threshold below which
		data blocks are treated as warm. By default it is initialized to
		2621440 blocks (equal to 10GB).
+4 −0
Original line number Diff line number Diff line
@@ -347,6 +347,10 @@ memory=%s Control memory mode. This supports "normal" and "low" modes.
			 Because of the nature of low memory devices, in this mode, f2fs
			 will try to save memory sometimes by sacrificing performance.
			 "normal" mode is the default mode and same as before.
age_extent_cache	 Enable an age extent cache based on rb-tree. It records
			 data block update frequency of the extent per inode, in
			 order to provide better temperature hints for data block
			 allocation.
======================== ============================================================

Debugfs Entries
+21 −0
Original line number Diff line number Diff line
@@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
	si->hit_largest = atomic64_read(&sbi->read_hit_largest);
	si->hit_total[EX_READ] += si->hit_largest;

	/* block age extent_cache only */
	si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks);

	/* validation check of the segment numbers */
	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
@@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v)
		seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
				si->ext_tree[EX_READ], si->zombie_tree[EX_READ],
				si->ext_node[EX_READ]);
		seq_puts(s, "\nExtent Cache (Block Age):\n");
		seq_printf(s, "  - Allocated Data Blocks: %llu\n",
				si->allocated_data_blocks);
		seq_printf(s, "  - Hit Count: L1:%llu L2:%llu\n",
				si->hit_cached[EX_BLOCK_AGE],
				si->hit_rbtree[EX_BLOCK_AGE]);
		seq_printf(s, "  - Hit Ratio: %llu%% (%llu / %llu)\n",
				!si->total_ext[EX_BLOCK_AGE] ? 0 :
				div64_u64(si->hit_total[EX_BLOCK_AGE] * 100,
				si->total_ext[EX_BLOCK_AGE]),
				si->hit_total[EX_BLOCK_AGE],
				si->total_ext[EX_BLOCK_AGE]);
		seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
				si->ext_tree[EX_BLOCK_AGE],
				si->zombie_tree[EX_BLOCK_AGE],
				si->ext_node[EX_BLOCK_AGE]);
		seq_puts(s, "\nBalancing F2FS Async:\n");
		seq_printf(s, "  - DIO (R: %4d, W: %4d)\n",
			   si->nr_dio_read, si->nr_dio_write);
@@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v)
				si->cache_mem >> 10);
		seq_printf(s, "  - read extent cache: %llu KB\n",
				si->ext_mem[EX_READ] >> 10);
		seq_printf(s, "  - block age extent cache: %llu KB\n",
				si->ext_mem[EX_BLOCK_AGE] >> 10);
		seq_printf(s, "  - paged : %llu KB\n",
				si->page_mem >> 10);
	}
+181 −2
Original line number Diff line number Diff line
@@ -6,6 +6,10 @@
 * Copyright (c) 2015 Samsung Electronics
 * Authors: Jaegeuk Kim <jaegeuk@kernel.org>
 *          Chao Yu <chao2.yu@samsung.com>
 *
 * block_age-based extent cache added by:
 * Copyright (c) 2022 xiaomi Co., Ltd.
 *             http://www.xiaomi.com/
 */

#include <linux/fs.h>
@@ -18,6 +22,7 @@
static void __set_extent_info(struct extent_info *ei,
				unsigned int fofs, unsigned int len,
				block_t blk, bool keep_clen,
				unsigned long age, unsigned long last_blocks,
				enum extent_type type)
{
	ei->fofs = fofs;
@@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei,
#ifdef CONFIG_F2FS_FS_COMPRESSION
		ei->c_len = 0;
#endif
	} else if (type == EX_BLOCK_AGE) {
		ei->age = age;
		ei->last_blocks = last_blocks;
	}
}

@@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode)
	return S_ISREG(inode->i_mode);
}

static bool __may_age_extent_tree(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);

	if (!test_opt(sbi, AGE_EXTENT_CACHE))
		return false;
	/* don't cache block age info for cold file */
	if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
		return false;
	if (file_is_cold(inode))
		return false;

	return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
}

/*
 * Dispatch the "may this inode own an extent tree" check to the helper that
 * matches @type. Unknown cache types are never allowed.
 */
static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
{
	switch (type) {
	case EX_READ:
		return __may_read_extent_tree(inode);
	case EX_BLOCK_AGE:
		return __may_age_extent_tree(inode);
	default:
		return false;
	}
}

@@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back,
#endif
		return (back->fofs + back->len == front->fofs &&
				back->blk + back->len == front->blk);
	} else if (type == EX_BLOCK_AGE) {
		return (back->fofs + back->len == front->fofs &&
			abs(back->age - front->age) <= SAME_AGE_REGION &&
			abs(back->last_blocks - front->last_blocks) <=
							SAME_AGE_REGION);
	}
	return false;
}
@@ -489,11 +519,22 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
		set_inode_flag(inode, FI_NO_EXTENT);
}

/*
 * Grab the block age extent tree of @inode, but only when the inode passes
 * the EX_BLOCK_AGE eligibility check; otherwise do nothing.
 */
void f2fs_init_age_extent_tree(struct inode *inode)
{
	if (__init_may_extent_tree(inode, EX_BLOCK_AGE))
		__grab_extent_tree(inode, EX_BLOCK_AGE);
}

/*
 * Set up every extent cache @inode is eligible for: first the read extent
 * tree, then the block age extent tree.
 */
void f2fs_init_extent_tree(struct inode *inode)
{
	/* read cache */
	if (__init_may_extent_tree(inode, EX_READ))
		__grab_extent_tree(inode, EX_READ);

	/* block age cache: same eligibility check + grab as above */
	f2fs_init_age_extent_tree(inode);
}

static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
@@ -544,6 +585,8 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,

	if (type == EX_READ)
		trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei);
	else if (type == EX_BLOCK_AGE)
		trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei);
	return ret;
}

@@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode,
	if (type == EX_READ)
		trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
						tei->blk, 0);
	else if (type == EX_BLOCK_AGE)
		trace_f2fs_update_age_extent_tree_range(inode, fofs, len,
						tei->age, tei->last_blocks);

	write_lock(&et->lock);

	if (type == EX_READ) {
@@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode,
				__set_extent_info(&ei,
					end, org_end - end,
					end - dei.fofs + dei.blk, false,
					dei.age, dei.last_blocks,
					type);
				en1 = __insert_extent_tree(sbi, et, &ei,
							NULL, NULL, true);
@@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode,
				__set_extent_info(&en->ei,
					end, en->ei.len - (end - dei.fofs),
					en->ei.blk + (end - dei.fofs), true,
					dei.age, dei.last_blocks,
					type);
				next_en = en;
			}
@@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode,
		en = next_en;
	}

	if (type == EX_BLOCK_AGE)
		goto update_age_extent_cache;

	/* 3. update extent in read extent cache */
	BUG_ON(type != EX_READ);

	if (tei->blk) {
		__set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ);
		__set_extent_info(&ei, fofs, len, tei->blk, false,
				  0, 0, EX_READ);
		if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
			__insert_extent_tree(sbi, et, &ei,
					insert_p, insert_parent, leftmost);
@@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode,
		et->largest_updated = false;
		updated = true;
	}
	goto out_read_extent_cache;
update_age_extent_cache:
	if (!tei->last_blocks)
		goto out_read_extent_cache;

	__set_extent_info(&ei, fofs, len, 0, false,
			tei->age, tei->last_blocks, EX_BLOCK_AGE);
	if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
		__insert_extent_tree(sbi, et, &ei,
					insert_p, insert_parent, leftmost);
out_read_extent_cache:
	write_unlock(&et->lock);

	if (updated)
@@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
	if (en)
		goto unlock_out;

	__set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ);
	__set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ);
	ei.c_len = c_len;

	if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
@@ -807,6 +870,72 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
}
#endif

/*
 * Blend a freshly measured age with the previously recorded one.
 *
 * @new: age measured for the current update
 * @old: age recorded for the previous update
 *
 * Returns the weighted average
 *	new * (100 - LAST_AGE_WEIGHT) / 100 + old * LAST_AGE_WEIGHT / 100
 *
 * The previous implementation reduced both branches of its ternary to @old,
 * so it always returned old * LAST_AGE_WEIGHT / 100 and ignored @new.
 *
 * Quotients and remainders of the division by 100 are weighted separately so
 * that the intermediate products cannot overflow 64 bits; the result never
 * exceeds max(@new, @old).
 */
static unsigned long long __calculate_block_age(unsigned long long new,
						unsigned long long old)
{
	unsigned long long new_quot = div_u64(new, 100);
	unsigned long long old_quot = div_u64(old, 100);
	/* both remainders are below 100, so 32-bit math is enough for them */
	unsigned int new_rem = (unsigned int)(new - new_quot * 100);
	unsigned int old_rem = (unsigned int)(old - old_quot * 100);

	return new_quot * (100 - LAST_AGE_WEIGHT) +
		old_quot * LAST_AGE_WEIGHT +
		(new_rem * (100 - LAST_AGE_WEIGHT) +
		 old_rem * LAST_AGE_WEIGHT) / 100;
}

/* This returns a new age and allocated blocks in ei */
/*
 * Fill @ei with the age and allocation counter to record for the block at
 * ei->fofs of @inode.
 *
 * On entry ei->fofs and ei->blk describe the block being updated. On success
 * ei->age and ei->last_blocks are set and 0 is returned. -EINVAL is returned
 * when no age should be recorded: for a newly allocated, partially filled
 * last block of the file, or when ei->blk fails the block address check.
 */
static int __get_new_block_age(struct inode *inode, struct extent_info *ei)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	loff_t f_size = i_size_read(inode);
	unsigned long long cur_blocks =
				atomic64_read(&sbi->allocated_data_blocks);
	unsigned long long cur_age;

	/*
	 * Unaligned sequential writes keep rewriting the last file block, so a
	 * newly allocated tail block that i_size only partially covers gets no
	 * age recorded.
	 */
	if (ei->blk == NEW_ADDR && (f_size & (PAGE_SIZE - 1)) &&
			(f_size >> PAGE_SHIFT) == ei->fofs)
		return -EINVAL;

	if (!__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) {
		/* no cached age extent covers this offset */
		f2fs_bug_on(sbi, ei->blk == NULL_ADDR);

		/* a first-time allocation (NEW_ADDR) needs no address check */
		if (ei->blk != NEW_ADDR && __is_valid_data_blkaddr(ei->blk) &&
				!f2fs_is_valid_blkaddr(sbi, ei->blk,
						DATA_GENERIC_ENHANCE)) {
			f2fs_bug_on(sbi, 1);
			return -EINVAL;
		}

		/*
		 * Start from age zero; besides first allocation this can
		 * happen when the block age extent was reclaimed due to
		 * memory constraint or system reboot.
		 */
		ei->age = 0;
		ei->last_blocks = cur_blocks;
		return 0;
	}

	/* cache hit: age is the number of blocks allocated since last update */
	if (cur_blocks >= ei->last_blocks)
		cur_age = cur_blocks - ei->last_blocks;
	else
		/* allocated_data_blocks overflow */
		cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks;

	/* combine with the previous age when there is one */
	ei->age = ei->age ? __calculate_block_age(cur_age, ei->age) : cur_age;
	ei->last_blocks = cur_blocks;
	WARN_ON(ei->age > cur_blocks);
	return 0;
}

static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type)
{
	struct extent_info ei;
@@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ
			ei.blk = NULL_ADDR;
		else
			ei.blk = dn->data_blkaddr;
	} else if (type == EX_BLOCK_AGE) {
		ei.blk = dn->data_blkaddr;
		if (__get_new_block_age(dn->inode, &ei))
			return;
	}
	__update_extent_tree_range(dn->inode, &ei, type);
}
@@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin
	return __shrink_extent_tree(sbi, nr_shrink, EX_READ);
}

/* block age extent cache operations */
/*
 * Look up the block age extent covering @pgofs of @inode.
 *
 * Returns false without searching when @inode may not use the block age
 * extent tree; otherwise returns the result of the tree lookup, which is
 * handed @ei to fill in.
 */
bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
				struct extent_info *ei)
{
	return __may_extent_tree(inode, EX_BLOCK_AGE) &&
		__lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE);
}

/*
 * Update the block age extent cache for the block described by @dn.
 *
 * Thin wrapper around __update_extent_cache() with the EX_BLOCK_AGE type.
 * The callee is invoked as a plain statement: ISO C does not allow a return
 * statement with an expression in a function returning void.
 */
void f2fs_update_age_extent_cache(struct dnode_of_data *dn)
{
	__update_extent_cache(dn, EX_BLOCK_AGE);
}

/*
 * Update the block age extent tree of dn->inode for the range
 * [@fofs, @fofs + @len).
 *
 * Only the range is filled in; age and last_blocks stay zero. With a zero
 * last_blocks, __update_extent_tree_range() does not insert a new age extent
 * for the range. Nothing is done when the inode may not use the block age
 * extent tree.
 */
void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
				pgoff_t fofs, unsigned int len)
{
	struct extent_info range = {
		.len = len,
		.fofs = fofs,
	};

	if (__may_extent_tree(dn->inode, EX_BLOCK_AGE))
		__update_extent_tree_range(dn->inode, &range, EX_BLOCK_AGE);
}

/*
 * Shrink the block age extent cache of @sbi by up to @nr_shrink entries.
 *
 * Returns 0 without shrinking when the AGE_EXTENT_CACHE mount option is
 * off; otherwise returns whatever __shrink_extent_tree() reports.
 */
unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
	return test_opt(sbi, AGE_EXTENT_CACHE) ?
		__shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE) : 0;
}

static unsigned int __destroy_extent_node(struct inode *inode,
					enum extent_type type)
{
@@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode,
void f2fs_destroy_extent_node(struct inode *inode)
{
	__destroy_extent_node(inode, EX_READ);
	__destroy_extent_node(inode, EX_BLOCK_AGE);
}

static void __drop_extent_tree(struct inode *inode, enum extent_type type)
@@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type)
void f2fs_drop_extent_tree(struct inode *inode)
{
	__drop_extent_tree(inode, EX_READ);
	__drop_extent_tree(inode, EX_BLOCK_AGE);
}

static void __destroy_extent_tree(struct inode *inode, enum extent_type type)
@@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type)
void f2fs_destroy_extent_tree(struct inode *inode)
{
	__destroy_extent_tree(inode, EX_READ);
	__destroy_extent_tree(inode, EX_BLOCK_AGE);
}

static void __init_extent_tree_info(struct extent_tree_info *eti)
@@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti)
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
{
	__init_extent_tree_info(&sbi->extent_tree[EX_READ]);
	__init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]);

	/* initialize for block age extents */
	atomic64_set(&sbi->allocated_data_blocks, 0);
	sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
	sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
}

int __init f2fs_create_extent_cache(void)
+38 −0
Original line number Diff line number Diff line
@@ -107,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_MERGE_CHECKPOINT	0x10000000
#define	F2FS_MOUNT_GC_MERGE		0x20000000
#define F2FS_MOUNT_COMPRESS_CACHE	0x40000000
#define F2FS_MOUNT_AGE_EXTENT_CACHE	0x80000000

#define F2FS_OPTION(sbi)	((sbi)->mount_opt)
#define clear_opt(sbi, option)	(F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -607,9 +608,22 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define READ_EXTENT_CACHE_SHRINK_NUMBER	128

/* number of age extent info in extent cache we try to shrink */
#define AGE_EXTENT_CACHE_SHRINK_NUMBER	128
/* weight, in percent, given to the previously recorded age of an extent */
#define LAST_AGE_WEIGHT			30
/*
 * largest difference in age, and in last_blocks, at which two adjacent block
 * age extents are still considered mergeable
 */
#define SAME_AGE_REGION			1024

/*
 * Default age thresholds, counted in blocks:
 * define data block with age less than 1GB (262144 blocks) as hot data,
 * define data block with age less than 10GB (2621440 blocks) but more than
 * 1GB as warm data
 */
#define DEF_HOT_DATA_AGE_THRESHOLD	262144
#define DEF_WARM_DATA_AGE_THRESHOLD	2621440

/* extent cache type */
enum extent_type {
	EX_READ = 0,		/* read extent cache */
	EX_BLOCK_AGE,		/* block age extent cache */
	NR_EXTENT_CACHES,	/* number of cache types, keep last */
};

@@ -637,6 +651,13 @@ struct extent_info {
			unsigned int c_len;
#endif
		};
		/* block age extent_cache */
		struct {
			/* block age of the extent */
			unsigned long long age;
			/* last total blocks allocated */
			unsigned long long last_blocks;
		};
	};
};

@@ -1653,6 +1674,11 @@ struct f2fs_sb_info {

	/* for extent tree cache */
	struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
	atomic64_t allocated_data_blocks;	/* for block age extent_cache */

	/* The thresholds used for hot and warm data separation */
	unsigned int hot_data_age_threshold;
	unsigned int warm_data_age_threshold;

	/* basic filesystem units */
	unsigned int log_sectors_per_block;	/* log2 sectors per block */
@@ -3857,6 +3883,8 @@ struct f2fs_stat_info {
	unsigned long long ext_mem[NR_EXTENT_CACHES];
	/* for read extent cache */
	unsigned long long hit_largest;
	/* for block age extent cache */
	unsigned long long allocated_data_blocks;
	int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
	int ndirty_data, ndirty_qdata;
	unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
@@ -4168,6 +4196,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi,
			int nr_shrink);

/* block age extent cache ops */

/* grab the block age extent tree of @inode if the inode is eligible */
void f2fs_init_age_extent_tree(struct inode *inode);
/*
 * look up the block age extent covering @pgofs; returns false when @inode
 * may not use the block age extent tree or the lookup fails
 */
bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
			struct extent_info *ei);
/* update the block age extent cache for the block described by @dn */
void f2fs_update_age_extent_cache(struct dnode_of_data *dn);
/*
 * update the block age extent tree for [@fofs, @fofs + @len) with zeroed
 * age info
 */
void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
			pgoff_t fofs, unsigned int len);
/*
 * shrink the block age extent cache by up to @nr_shrink entries; returns 0
 * when the age_extent_cache mount option is off
 */
unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi,
			int nr_shrink);

/*
 * sysfs.c
 */
Loading