Commit a44a027a authored by Dave Chinner's avatar Dave Chinner
Browse files

Merge tag 'large-extent-counters-v9' of https://github.com/chandanr/linux into xfs-5.19-for-next



xfs: Large extent counters

The commit xfs: fix inode fork extent count overflow
(3f8a4f1d) mentions that 10 billion
data fork extents should be possible to create. However the
corresponding on-disk field has a signed 32-bit type. Hence this
patchset extends the per-inode data fork extent counter to 64 bits
(out of which 48 bits are used to store the extent count).

Also, XFS has an attribute fork extent counter which is 16 bits
wide. A workload that,
1. Creates 1 million 255-byte sized xattrs,
2. Deletes 50% of these xattrs in an alternating manner,
3. Tries to insert 400,000 new 255-byte sized xattrs
   causes the xattr extent counter to overflow.

Dave tells me that there are instances where a single file has more
than 100 million hardlinks. With parent pointers being stored in
xattrs, we will overflow the signed 16-bits wide attribute extent
counter when large number of hardlinks are created. Hence this
patchset extends the on-disk field to 32-bits.

The following changes are made to accomplish this,
1. A 64-bit inode field is carved out of existing di_pad and
   di_flushiter fields to hold the 64-bit data fork extent counter.
2. The existing 32-bit inode data fork extent counter will be used to
   hold the attribute fork extent counter.
3. A new incompat superblock flag to prevent older kernels from mounting
   the filesystem.

Signed-off-by: default avatarChandan Babu R <chandan.babu@oracle.com>
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parents 463260d7 973ac0eb
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(

	ASSERT(bno != NULLFSBLOCK);
	ASSERT(len > 0);
	ASSERT(len <= MAXEXTLEN);
	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(!isnullstartblock(bno));
	agno = XFS_FSB_TO_AGNO(mp, bno);
	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+3 −0
Original line number Diff line number Diff line
@@ -776,6 +776,9 @@ xfs_attr_set(
	if (args->value || xfs_inode_hasattr(dp)) {
		error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
				XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
		if (error == -EFBIG)
			error = xfs_iext_count_upgrade(args->trans, dp,
					XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
		if (error)
			goto out_trans_cancel;
	}
+45 −64
Original line number Diff line number Diff line
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
	xfs_mount_t	*mp,		/* file system mount structure */
	int		whichfork)	/* data or attr fork */
{
	uint64_t	maxblocks;	/* max blocks at this level */
	xfs_extnum_t	maxleafents;	/* max leaf entries possible */
	int		level;		/* btree level */
	uint		maxblocks;	/* max blocks at this level */
	uint		maxleafents;	/* max leaf entries possible */
	int		maxrootrecs;	/* max records in root block */
	int		minleafrecs;	/* min records in leaf block */
	int		minnoderecs;	/* min records in node block */
	int		sz;		/* root block size */

	/*
	 * The maximum number of extents in a file, hence the maximum number of
	 * leaf entries, is controlled by the size of the on-disk extent count,
	 * either a signed 32-bit number for the data fork, or a signed 16-bit
	 * number for the attr fork.
	 * The maximum number of extents in a fork, hence the maximum number of
	 * leaf entries, is controlled by the size of the on-disk extent count.
	 *
	 * Note that we can no longer assume that if we are in ATTR1 that the
	 * fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
	 * ATTR2 we have to assume the worst case scenario of a minimum size
	 * available.
	 */
	if (whichfork == XFS_DATA_FORK) {
		maxleafents = MAXEXTNUM;
	maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
				whichfork);
	if (whichfork == XFS_DATA_FORK)
		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
	} else {
		maxleafents = MAXAEXTNUM;
	else
		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
	}

	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
	minleafrecs = mp->m_bmap_dmnr[0];
	minnoderecs = mp->m_bmap_dmnr[1];
	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	maxblocks = howmany_64(maxleafents, minleafrecs);
	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= maxrootrecs)
			maxblocks = 1;
		else
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
			maxblocks = howmany_64(maxblocks, minnoderecs);
	}
	mp->m_bm_maxlevels[whichfork] = level;
	ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ xfs_bmap_check_leaf_extents(
	if (bp_release)
		xfs_trans_brelse(NULL, bp);
error_norelse:
	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
	xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
		__func__, i);
	xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
	    LEFT.br_state == new->br_state &&
	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
	    LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	/*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
	    new_endoff == RIGHT.br_startoff &&
	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
	    new->br_state == RIGHT.br_state &&
	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
	    new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
			<= XFS_MAX_BMBT_EXTLEN))
		state |= BMAP_RIGHT_CONTIG;

	error = 0;
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
	    LEFT.br_state == new->br_state &&
	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
	    LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	/*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
	    new_endoff == RIGHT.br_startoff &&
	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
	    new->br_state == RIGHT.br_state &&
	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
	    new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
			<= XFS_MAX_BMBT_EXTLEN))
		state |= BMAP_RIGHT_CONTIG;

	/*
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
	 */
	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
	    left.br_startoff + left.br_blockcount == new->br_startoff &&
	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
	    left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
	    new->br_startoff + new->br_blockcount == right.br_startoff &&
	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
	    new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    (!(state & BMAP_LEFT_CONTIG) ||
	     (left.br_blockcount + new->br_blockcount +
	      right.br_blockcount <= MAXEXTLEN)))
	      right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
		state |= BMAP_RIGHT_CONTIG;

	/*
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
	    left.br_startoff + left.br_blockcount == new->br_startoff &&
	    left.br_startblock + left.br_blockcount == new->br_startblock &&
	    left.br_state == new->br_state &&
	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
	    left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
	    new->br_startoff + new->br_blockcount == right.br_startoff &&
	    new->br_startblock + new->br_blockcount == right.br_startblock &&
	    new->br_state == right.br_state &&
	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
	    new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    (!(state & BMAP_LEFT_CONTIG) ||
	     left.br_blockcount + new->br_blockcount +
	     right.br_blockcount <= MAXEXTLEN))
	     right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
		state |= BMAP_RIGHT_CONTIG;

	error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(

	/*
	 * For large extent hint sizes, the aligned extent might be larger than
	 * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
	 * the length back under MAXEXTLEN. The outer allocation loops handle
	 * short allocation just fine, so it is safe to do this. We only want to
	 * do it when we are forced to, though, because it means more allocation
	 * operations are required.
	 * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
	 * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
	 * allocation loops handle short allocation just fine, so it is safe to
	 * do this. We only want to do it when we are forced to, though, because
	 * it means more allocation operations are required.
	 */
	while (align_alen > MAXEXTLEN)
	while (align_alen > XFS_MAX_BMBT_EXTLEN)
		align_alen -= extsz;
	ASSERT(align_alen <= MAXEXTLEN);
	ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);

	/*
	 * If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
			return -EINVAL;
	} else {
		ASSERT(orig_off >= align_off);
		/* see MAXEXTLEN handling above */
		/* see XFS_BMBT_MAX_EXTLEN handling above */
		ASSERT(orig_end <= align_off + align_alen ||
		       align_alen + extsz > MAXEXTLEN);
		       align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
	}

#ifdef DEBUG
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
	 * Cap the alloc length. Keep track of prealloc so we know whether to
	 * tag the inode before we return.
	 */
	alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
	alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
	if (!eof)
		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
	if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
		if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
			bma->prev.br_startoff = NULLFILEOFF;
	} else {
		bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
		bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
		if (!bma->eof)
			bma->length = XFS_FILBLKS_MIN(bma->length,
					bma->got.br_startoff - bma->offset);
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
			 * xfs_extlen_t and therefore 32 bits. Hence we have to
			 * check for 32-bit overflows and handle them here.
			 */
			if (len > (xfs_filblks_t)MAXEXTLEN)
				bma.length = MAXEXTLEN;
			if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
				bma.length = XFS_MAX_BMBT_EXTLEN;
			else
				bma.length = len;

@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_iext_count_may_overflow(ip, whichfork,
			XFS_IEXT_ADD_NOSPLIT_CNT);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip,
				XFS_IEXT_ADD_NOSPLIT_CNT);
	if (error)
		goto out_trans_cancel;

	xfs_trans_ijoin(tp, ip, 0);

	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
	    bma.got.br_startoff > offset_fsb) {
		/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
	bma.ip = ip;
	bma.wasdel = true;
	bma.offset = bma.got.br_startoff;
	bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
	bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
			XFS_MAX_BMBT_EXTLEN);
	bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);

	/*
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(

	ifp = XFS_IFORK_PTR(ip, whichfork);
	ASSERT(len > 0);
	ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
	ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
			   XFS_BMAPI_NORMAP)));
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
		 * Deleting the middle of the extent.
		 */

		/*
		 * For directories, -ENOSPC is returned since a directory entry
		 * remove operation must not fail due to low extent count
		 * availability. -ENOSPC will be handled by higher layers of XFS
		 * by letting the corresponding empty Data/Free blocks to linger
		 * until a future remove operation. Dabtree blocks would be
		 * swapped with the last block in the leaf space and then the
		 * new last block will be unmapped.
		 *
		 * The above logic also applies to the source directory entry of
		 * a rename operation.
		 */
		error = xfs_iext_count_may_overflow(ip, whichfork, 1);
		if (error) {
			ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
				whichfork == XFS_DATA_FORK);
			error = -ENOSPC;
			goto done;
		}

		old = got;

		got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5641,7 +5622,7 @@ xfs_bmse_can_merge(
	if ((left->br_startoff + left->br_blockcount != startoff) ||
	    (left->br_startblock + left->br_blockcount != got->br_startblock) ||
	    (left->br_state != got->br_state) ||
	    (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
	    (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
		return false;

	return true;
+7 −2
Original line number Diff line number Diff line
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
	return xfs_bmbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for block mapping btrees. */
/*
 * Calculate the maximum possible height of the btree that the on-disk format
 * supports. This is used for sizing structures large enough to support every
 * possible configuration of a filesystem that might get mounted.
 */
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
	minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;

	/* One extra level for the inode root. */
	return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}

/*
+1 −0
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
	unsigned int	free_hdr_size;	/* dir2 free header size */
	unsigned int	free_max_bests;	/* # of bests entries in dir2 free */
	xfs_dablk_t	freeblk;	/* blockno of free data v2 */
	xfs_extnum_t	max_extents;	/* Max. extents in corresponding fork */

	xfs_dir2_data_aoff_t data_first_offset;
	size_t		data_entry_offset;
Loading