Commit 967c28b2 authored by Gao Xiang's avatar Gao Xiang
Browse files

erofs: kill hooked chains to avoid loops on deduplicated compressed images



After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted.  As a
consequence, the corresponding file pages will remain locked forever.

It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
	Chain 1:  2->3->4->5    -- The tail pcluster is 5;
        Chain 2:  5->1->2       -- The tail pcluster is 2.

Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.

Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race.  Instead, let's avoid hooked chains completely
until I could work out a proper way to fix this and end users finally
tell us that it's needed to add it back.

Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.

Fixes: 267f2492 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: default avatarGao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: default avatarYue Hu <huyue2@coolpad.com>
Link: https://lore.kernel.org/r/20230526201459.128169-4-hsiangkao@linux.alibaba.com
parent 6ab5eed6
Loading
Loading
Loading
Loading
+11 −61
Original line number Diff line number Diff line
@@ -93,11 +93,8 @@ struct z_erofs_pcluster {

/* let's avoid the valid 32-bit kernel addresses */

/* the chained workgroup has't submitted io (still open) */
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL           ((void *)0x5F0ECAFE)
/* the chained workgroup has already submitted io */
#define Z_EROFS_PCLUSTER_TAIL_CLOSED    ((void *)0x5F0EDEAD)

#define Z_EROFS_PCLUSTER_NIL            (NULL)

struct z_erofs_decompressqueue {
@@ -504,20 +501,6 @@ int __init z_erofs_init_zip_subsystem(void)

enum z_erofs_pclustermode {
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * The current pclusters was the tail of an exist chain, in addition
	 * that the previous processed chained pclusters are all decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining pclusters which are
	 * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
	 * the next pcluster cannot reuse the whole page safely for inplace I/O
	 * in the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |       head (partial) page       |
	 * |   (belongs to the next pcl)  |   (belongs to the current pcl)  |
	 * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
	 */
	Z_EROFS_PCLUSTER_HOOKED,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
	 * could be dispatched into bypass queue later due to uptodated managed
@@ -535,8 +518,8 @@ enum z_erofs_pclustermode {
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * | PCLUSTER_FOLLOWED or |                                        |
	 * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
	 * |                      |                                        |
	 * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
@@ -550,7 +533,7 @@ struct z_erofs_decompress_frontend {

	struct page *pagepool;
	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl, *tailpcl;
	struct z_erofs_pcluster *pcl;
	z_erofs_next_pcluster_t owned_head;
	enum z_erofs_pclustermode mode;

@@ -755,19 +738,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (*owned_head != &pcl->next && pcl != f->tailpcl &&
	    cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		f->mode = Z_EROFS_PCLUSTER_HOOKED;
		f->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	/* type 2, it belongs to an ongoing chain */
	f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}

@@ -828,9 +799,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
			goto err_out;
		}
	}
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;
@@ -851,7 +819,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -870,10 +837,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)

	if (ret == -EEXIST) {
		mutex_lock(&fe->pcl->lock);
		/* used to check tail merging loop due to corrupted images */
		if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
			fe->tailpcl = fe->pcl;

		z_erofs_try_to_claim_pcluster(fe);
	} else if (ret) {
		return ret;
@@ -1028,8 +991,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
	 * those chains are handled asynchronously thus the page cannot be used
	 * for inplace I/O or bvpage (should be processed in a strict order.)
	 */
	tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
		  fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
	tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1398,10 +1360,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
	};
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
		/* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
	while (owned != Z_EROFS_PCLUSTER_TAIL) {
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1418,7 +1377,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
		container_of(work, struct z_erofs_decompressqueue, u.work);
	struct page *pagepool = NULL;

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
	z_erofs_decompress_queue(bgq, &pagepool);
	erofs_release_pages(&pagepool);
	kvfree(bgq);
@@ -1606,7 +1565,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
		q->sync = true;
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	q->head = Z_EROFS_PCLUSTER_TAIL;
	return q;
}

@@ -1624,11 +1583,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1698,15 +1653,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
		unsigned int i = 0;
		bool bypass = true;

		/* no possible 'owned_head' equals the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
		owned_head = READ_ONCE(pcl->next);

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;