Commit b5b3097d authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull erofs updates from Gao Xiang:
 "In this cycle, we would like to introduce a new feature called big
  pcluster so EROFS can compress file data into more than 1 fs block and
  different pcluster size can be selected for each (sub-)files by
  design.

  The current EROFS test results on my laptop are [1]:

    Testscript: erofs-openbenchmark
    Testdata: enwik9 (1000000000 bytes)
     ________________________________________________________________
    |  file system  |   size    | seq read | rand read | rand9m read |
    |_______________|___________|_ MiB/s __|__ MiB/s __|___ MiB/s ___|
    |___erofs_4k____|_556879872_|_ 781.4 __|__ 55.3 ___|___ 25.3  ___|
    |___erofs_16k___|_452509696_|_ 864.8 __|_ 123.2 ___|___ 20.8  ___|
    |___erofs_32k___|_415223808_|_ 899.8 __|_ 105.8 _*_|___ 16.8 ____|
    |___erofs_64k___|_393814016_|_ 906.6 __|__ 66.6 _*_|___ 11.8 ____|
    |__squashfs_8k__|_556191744_|_  64.9 __|__ 19.3 ___|____ 9.1 ____|
    |__squashfs_16k_|_502661120_|_  98.9 __|__ 38.0 ___|____ 9.8 ____|
    |__squashfs_32k_|_458784768_|_ 115.4 __|__ 71.6 _*_|___ 10.0 ____|
    |_squashfs_128k_|_398204928_|_ 257.2 __|_ 253.8 _*_|___ 10.9 ____|
    |____ext4_4k____|____()_____|_ 786.6 __|__ 28.6 ___|___ 27.8 ____|

  which has been verified but I'd like warn it as experimental for a
  while. This matches erofs-utils dev branch and I'll also release a new
  userspace version for this later.

  Apart from that, several improvements are also included: eg complete a
  missing case for inplace I/O, optimize endio decompression logic for
  non-atomic contexts and support adjustable sliding window size, ... In
  addition to those, there are some cleanups as always.

  Summary:

   - avoid memory failure when applying rolling decompression

   - optimize endio decompression logic for non-atomic contexts

   - complete a missing case which can be safely selected for inplace
     I/O and thus decreasing more memory footprint

   - check unsupported on-disk inode i_format strictly

   - support adjustable lz4 sliding window size to decrease runtime
     memory footprint

   - support on-disk compression configurations

   - support big pcluster decompression

   - several code cleanups / spelling correction"

* tag 'erofs-for-5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs: (21 commits)
  erofs: enable big pcluster feature
  erofs: support decompress big pcluster for lz4 backend
  erofs: support parsing big pcluster compact indexes
  erofs: support parsing big pcluster compress indexes
  erofs: adjust per-CPU buffers according to max_pclusterblks
  erofs: add big physical cluster definition
  erofs: fix up inplace I/O pointer for big pcluster
  erofs: introduce physical cluster slab pools
  erofs: introduce multipage per-CPU buffers
  erofs: reserve physical_clusterbits[]
  erofs: Clean up spelling mistakes found in fs/erofs
  erofs: add on-disk compression configurations
  erofs: introduce on-disk lz4 fs configurations
  erofs: support adjust lz4 history window size
  erofs: introduce erofs_sb_has_xxx() helpers
  erofs: add unsupported inode i_format check
  erofs: don't use erofs_map_blocks() any more
  erofs: complete a missing case for inplace I/O
  erofs: use sync decompression for atomic contexts only
  erofs: use workqueue decompression for atomic contexts only
  ...
parents befbfe07 8e6c8fa9
Loading
Loading
Loading
Loading
+0 −14
Original line number Diff line number Diff line
@@ -76,17 +76,3 @@ config EROFS_FS_ZIP

	  If you don't want to enable compression feature, say N.
config EROFS_FS_CLUSTER_PAGE_LIMIT
	int "EROFS Cluster Pages Hard Limit"
	depends on EROFS_FS_ZIP
	range 1 256
	default "1"
	help
	  Indicates maximum # of pages of a compressed
	  physical cluster.

	  For example, if files in a image were compressed
	  into 8k-unit, hard limit should not be configured
	  less than 2. Otherwise, the image will be refused
	  to mount on this kernel.
+1 −1
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only

obj-$(CONFIG_EROFS_FS) += erofs.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+2 −17
Original line number Diff line number Diff line
@@ -109,21 +109,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
	return err;
}

int erofs_map_blocks(struct inode *inode,
		     struct erofs_map_blocks *map, int flags)
{
	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
		int err = z_erofs_map_blocks_iter(inode, map, flags);

		if (map->mpage) {
			put_page(map->mpage);
			map->mpage = NULL;
		}
		return err;
	}
	return erofs_map_blocks_flatmode(inode, map, flags);
}

static inline struct bio *erofs_read_raw_page(struct bio *bio,
					      struct address_space *mapping,
					      struct page *page,
@@ -159,7 +144,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio,
		erofs_blk_t blknr;
		unsigned int blkoff;

		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
		err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
		if (err)
			goto err_out;

@@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
			return 0;
	}

	if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW))
	if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
		return erofs_blknr(map.m_pa);

	return 0;
+168 −104
Original line number Diff line number Diff line
@@ -28,6 +28,42 @@ struct z_erofs_decompressor {
	char *name;
};

int z_erofs_load_lz4_config(struct super_block *sb,
			    struct erofs_super_block *dsb,
			    struct z_erofs_lz4_cfgs *lz4, int size)
{
	struct erofs_sb_info *sbi = EROFS_SB(sb);
	u16 distance;

	if (lz4) {
		if (size < sizeof(struct z_erofs_lz4_cfgs)) {
			erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
			return -EINVAL;
		}
		distance = le16_to_cpu(lz4->max_distance);

		sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
		if (!sbi->lz4.max_pclusterblks) {
			sbi->lz4.max_pclusterblks = 1;	/* reserved case */
		} else if (sbi->lz4.max_pclusterblks >
			   Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
			erofs_err(sb, "too large lz4 pclusterblks %u",
				  sbi->lz4.max_pclusterblks);
			return -EINVAL;
		} else if (sbi->lz4.max_pclusterblks >= 2) {
			erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
		}
	} else {
		distance = le16_to_cpu(dsb->u1.lz4_max_distance);
		sbi->lz4.max_pclusterblks = 1;
	}

	sbi->lz4.max_distance_pages = distance ?
					DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
					LZ4_MAX_DISTANCE_PAGES;
	return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
}

static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
					 struct list_head *pagepool)
{
@@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
	struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
	unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
					   BITS_PER_LONG)] = { 0 };
	unsigned int lz4_max_distance_pages =
				EROFS_SB(rq->sb)->lz4.max_distance_pages;
	void *kaddr = NULL;
	unsigned int i, j, top;

@@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
		struct page *const page = rq->out[i];
		struct page *victim;

		if (j >= LZ4_MAX_DISTANCE_PAGES)
		if (j >= lz4_max_distance_pages)
			j = 0;

		/* 'valid' bounced can only be tested after a complete round */
		if (test_bit(j, bounced)) {
			DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
			DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
			availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
			DBG_BUGON(i < lz4_max_distance_pages);
			DBG_BUGON(top >= lz4_max_distance_pages);
			availables[top++] = rq->out[i - lz4_max_distance_pages];
		}

		if (page) {
@@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
			victim = availables[--top];
			get_page(victim);
		} else {
			victim = erofs_allocpage(pagepool, GFP_KERNEL);
			if (!victim)
				return -ENOMEM;
			victim = erofs_allocpage(pagepool,
						 GFP_KERNEL | __GFP_NOFAIL);
			set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
		}
		rq->out[i] = victim;
@@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
	return kaddr ? 1 : 0;
}

static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
				       u8 *src, unsigned int pageofs_in)
static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
			void *inpage, unsigned int *inputmargin, int *maptype,
			bool support_0padding)
{
	/*
	 * if in-place decompression is ongoing, those decompressed
	 * pages should be copied in order to avoid being overlapped.
	 */
	struct page **in = rq->in;
	u8 *const tmp = erofs_get_pcpubuf(0);
	u8 *tmpp = tmp;
	unsigned int inlen = rq->inputsize - pageofs_in;
	unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
	unsigned int nrpages_in, nrpages_out;
	unsigned int ofull, oend, inputsize, total, i, j;
	struct page **in;
	void *src, *tmp;

	while (tmpp < tmp + inlen) {
	inputsize = rq->inputsize;
	nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
	oend = rq->pageofs_out + rq->outputsize;
	ofull = PAGE_ALIGN(oend);
	nrpages_out = ofull >> PAGE_SHIFT;

	if (rq->inplace_io) {
		if (rq->partial_decoding || !support_0padding ||
		    ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
			goto docopy;

		for (i = 0; i < nrpages_in; ++i) {
			DBG_BUGON(rq->in[i] == NULL);
			for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
				if (rq->out[j] == rq->in[i])
					goto docopy;
		}
	}

	if (nrpages_in <= 1) {
		*maptype = 0;
		return inpage;
	}
	kunmap_atomic(inpage);
	might_sleep();
	src = erofs_vm_map_ram(rq->in, nrpages_in);
	if (!src)
			src = kmap_atomic(*in);
		memcpy(tmpp, src + pageofs_in, count);
		kunmap_atomic(src);
		src = NULL;
		tmpp += count;
		pageofs_in = 0;
		count = PAGE_SIZE;
		return ERR_PTR(-ENOMEM);
	*maptype = 1;
	return src;

docopy:
	/* Or copy compressed data which can be overlapped to per-CPU buffer */
	in = rq->in;
	src = erofs_get_pcpubuf(nrpages_in);
	if (!src) {
		DBG_BUGON(1);
		kunmap_atomic(inpage);
		return ERR_PTR(-EFAULT);
	}

	tmp = src;
	total = rq->inputsize;
	while (total) {
		unsigned int page_copycnt =
			min_t(unsigned int, total, PAGE_SIZE - *inputmargin);

		if (!inpage)
			inpage = kmap_atomic(*in);
		memcpy(tmp, inpage + *inputmargin, page_copycnt);
		kunmap_atomic(inpage);
		inpage = NULL;
		tmp += page_copycnt;
		total -= page_copycnt;
		++in;
		*inputmargin = 0;
	}
	return tmp;
	*maptype = 2;
	return src;
}

static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
{
	unsigned int inputmargin, inlen;
	u8 *src;
	bool copied, support_0padding;
	int ret;
	unsigned int inputmargin;
	u8 *headpage, *src;
	bool support_0padding;
	int ret, maptype;

	if (rq->inputsize > PAGE_SIZE)
		return -EOPNOTSUPP;

	src = kmap_atomic(*rq->in);
	DBG_BUGON(*rq->in == NULL);
	headpage = kmap_atomic(*rq->in);
	inputmargin = 0;
	support_0padding = false;

	/* decompression inplace is only safe when 0padding is enabled */
	if (EROFS_SB(rq->sb)->feature_incompat &
	    EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
	if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
		support_0padding = true;

		while (!src[inputmargin & ~PAGE_MASK])
		while (!headpage[inputmargin & ~PAGE_MASK])
			if (!(++inputmargin & ~PAGE_MASK))
				break;

		if (inputmargin >= rq->inputsize) {
			kunmap_atomic(src);
			kunmap_atomic(headpage);
			return -EIO;
		}
	}

	copied = false;
	inlen = rq->inputsize - inputmargin;
	if (rq->inplace_io) {
		const uint oend = (rq->pageofs_out +
				   rq->outputsize) & ~PAGE_MASK;
		const uint nr = PAGE_ALIGN(rq->pageofs_out +
					   rq->outputsize) >> PAGE_SHIFT;

		if (rq->partial_decoding || !support_0padding ||
		    rq->out[nr - 1] != rq->in[0] ||
		    rq->inputsize - oend <
		      LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
			src = generic_copy_inplace_data(rq, src, inputmargin);
			inputmargin = 0;
			copied = true;
		}
	}
	rq->inputsize -= inputmargin;
	src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
					support_0padding);
	if (IS_ERR(src))
		return PTR_ERR(src);

	/* legacy format could compress extra data in a pcluster. */
	if (rq->partial_decoding || !support_0padding)
		ret = LZ4_decompress_safe_partial(src + inputmargin, out,
						  inlen, rq->outputsize,
						  rq->outputsize);
				rq->inputsize, rq->outputsize, rq->outputsize);
	else
		ret = LZ4_decompress_safe(src + inputmargin, out,
					  inlen, rq->outputsize);
					  rq->inputsize, rq->outputsize);

	if (ret != rq->outputsize) {
		erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
			  ret, inlen, inputmargin, rq->outputsize);
			  ret, rq->inputsize, inputmargin, rq->outputsize);

		WARN_ON(1);
		print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
			       16, 1, src + inputmargin, inlen, true);
			       16, 1, src + inputmargin, rq->inputsize, true);
		print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
			       16, 1, out, rq->outputsize, true);

@@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
		ret = -EIO;
	}

	if (copied)
		erofs_put_pcpubuf(src);
	else
	if (maptype == 0) {
		kunmap_atomic(src);
	} else if (maptype == 1) {
		vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
	} else if (maptype == 2) {
		erofs_put_pcpubuf(src);
	} else {
		DBG_BUGON(1);
		return -EFAULT;
	}
	return ret;
}

@@ -234,8 +304,10 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
	const struct z_erofs_decompressor *alg = decompressors + rq->alg;
	unsigned int dst_maptype;
	void *dst;
	int ret, i;
	int ret;

	/* two optimized fast paths only for non bigpcluster cases yet */
	if (rq->inputsize <= PAGE_SIZE) {
		if (nrpages_out == 1 && !rq->inplace_io) {
			DBG_BUGON(!*rq->out);
			dst = kmap_atomic(*rq->out);
@@ -249,7 +321,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
		 * compressed data is preferred.
		 */
		if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
		dst = erofs_get_pcpubuf(0);
			dst = erofs_get_pcpubuf(1);
			if (IS_ERR(dst))
				return PTR_ERR(dst);

@@ -262,29 +334,21 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
			erofs_put_pcpubuf(dst);
			return ret;
		}
	}

	/* general decoding path which can be used for all cases */
	ret = alg->prepare_destpages(rq, pagepool);
	if (ret < 0) {
	if (ret < 0)
		return ret;
	} else if (ret) {
	if (ret) {
		dst = page_address(*rq->out);
		dst_maptype = 1;
		goto dstmap_out;
	}

	i = 0;
	while (1) {
		dst = vm_map_ram(rq->out, nrpages_out, -1);

		/* retry two more times (totally 3 times) */
		if (dst || ++i >= 3)
			break;
		vm_unmap_aliases();
	}

	dst = erofs_vm_map_ram(rq->out, nrpages_out);
	if (!dst)
		return -ENOMEM;

	dst_maptype = 2;

dstmap_out:
+44 −10
Original line number Diff line number Diff line
@@ -18,15 +18,22 @@
 * be incompatible with this kernel version.
 */
#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING	0x00000001
#define EROFS_ALL_FEATURE_INCOMPAT		EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS	0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER	0x00000002
#define EROFS_ALL_FEATURE_INCOMPAT		\
	(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
	 EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
	 EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)

/* 128-byte erofs on-disk super block */
#define EROFS_SB_EXTSLOT_SIZE	16

/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
	__le32 magic;           /* file system magic number */
	__le32 checksum;        /* crc32c(super_block) */
	__le32 feature_compat;
	__u8 blkszbits;         /* support block_size == PAGE_SIZE only */
	__u8 reserved;
	__u8 sb_extslots;	/* superblock size = 128 + sb_extslots * 16 */

	__le16 root_nid;	/* nid of root directory */
	__le64 inos;            /* total valid ino # (== f_files - f_favail) */
@@ -39,7 +46,13 @@ struct erofs_super_block {
	__u8 uuid[16];          /* 128-bit uuid for volume */
	__u8 volume_name[16];   /* volume name */
	__le32 feature_incompat;
	__u8 reserved2[44];
	union {
		/* bitmap for available compression algorithms */
		__le16 available_compr_algs;
		/* customized sliding window size instead of 64k by default */
		__le16 lz4_max_distance;
	} __packed u1;
	__u8 reserved2[42];
};

/*
@@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_BIT             0
#define EROFS_I_DATALAYOUT_BIT          1

#define EROFS_I_ALL	\
	((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)

/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
	__le16 i_format;	/* inode format hints */
@@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
				 e->e_name_len + le16_to_cpu(e->e_value_size));
}

/* maximum supported size of a physical compression cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE	(1024 * 1024)

/* available compression algorithm types (for h_algorithmtype) */
enum {
	Z_EROFS_COMPRESSION_LZ4	= 0,
	Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS		(1 << (Z_EROFS_COMPRESSION_MAX - 1))

/* 14 bytes (+ length field = 16 bytes) */
struct z_erofs_lz4_cfgs {
	__le16 max_distance;
	__le16 max_pclusterblks;
	u8 reserved[10];
} __packed;

/*
 * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
 *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
 *                                  (4B) + 2B + (4B) if compacted 2B is on.
 * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
 * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
 */
#define Z_EROFS_ADVISE_COMPACTED_2B_BIT         0

#define Z_EROFS_ADVISE_COMPACTED_2B     (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
#define Z_EROFS_ADVISE_COMPACTED_2B		0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1		0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2		0x0004

struct z_erofs_map_header {
	__le32	h_reserved1;
@@ -214,9 +243,7 @@ struct z_erofs_map_header {
	__u8	h_algorithmtype;
	/*
	 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
	 * bit 3-4 : (physical - logical) cluster bits of head 1:
	 *       For example, if logical clustersize = 4096, 1 for 8192.
	 * bit 5-7 : (physical - logical) cluster bits of head 2.
	 * bit 3-7 : reserved.
	 */
	__u8	h_clusterbits;
};
@@ -259,6 +286,13 @@ enum {
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0

/*
 * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
 * compressed block count of a compressed extent (in logical clusters, aka.
 * block count of a pcluster).
 */
#define Z_EROFS_VLE_DI_D0_CBLKCNT		(1 << 11)

struct z_erofs_vle_decompressed_index {
	__le16 di_advise;
	/* where to decompress in the head cluster */
Loading