Merge tag 'xfs-5.14-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux (aa660326) · Commits · EulixOS / Software / Kernel

fs/xfs/libxfs/xfs_log_format.h

+10 −1

Original line number	Diff line number	Diff line
		@@ -411,7 +411,16 @@ struct xfs_log_dinode {
		/* start of the extended dinode, writable fields */
		uint32_t di_crc; /* CRC of the inode */
		uint64_t di_changecount; /* number of attribute changes */
		xfs_lsn_t di_lsn; /* flush sequence */

		/*
		* The LSN we write to this field during formatting is not a reflection
		* of the current on-disk LSN. It should never be used for recovery
		* sequencing, nor should it be recovered into the on-disk inode at all.
		* See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
		* for details.
		*/
		xfs_lsn_t di_lsn;

		uint64_t di_flags2; /* more random flags */
		uint32_t di_cowextsize; /* basic cow extent size for file */
		uint8_t di_pad2[12]; /* more padding for future expansion */

fs/xfs/xfs_buf_item_recover.c

+13 −2

Original line number	Diff line number	Diff line
		@@ -698,7 +698,8 @@ xlog_recover_do_inode_buffer(
		static xfs_lsn_t
		xlog_recover_get_buf_lsn(
		struct xfs_mount *mp,
		struct xfs_buf *bp)
		struct xfs_buf *bp,
		struct xfs_buf_log_format *buf_f)
		{
		uint32_t magic32;
		uint16_t magic16;
		@@ -706,11 +707,20 @@ xlog_recover_get_buf_lsn(
		void *blk = bp->b_addr;
		uuid_t *uuid;
		xfs_lsn_t lsn = -1;
		uint16_t blft;

		/* v4 filesystems always recover immediately */
		if (!xfs_sb_version_hascrc(&mp->m_sb))
		goto recover_immediately;

		/*
		* realtime bitmap and summary file blocks do not have magic numbers or
		* UUIDs, so we must recover them immediately.
		*/
		blft = xfs_blft_from_flags(buf_f);
		if (blft == XFS_BLFT_RTBITMAP_BUF \|\| blft == XFS_BLFT_RTSUMMARY_BUF)
		goto recover_immediately;

		magic32 = be32_to_cpu((__be32 )blk);
		switch (magic32) {
		case XFS_ABTB_CRC_MAGIC:
		@@ -796,6 +806,7 @@ xlog_recover_get_buf_lsn(
		switch (magicda) {
		case XFS_DIR3_LEAF1_MAGIC:
		case XFS_DIR3_LEAFN_MAGIC:
		case XFS_ATTR3_LEAF_MAGIC:
		case XFS_DA3_NODE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
		@@ -919,7 +930,7 @@ xlog_recover_buf_commit_pass2(
		* the verifier will be reset to match whatever recover turns that
		* buffer into.
		*/
		lsn = xlog_recover_get_buf_lsn(mp, bp);
		lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		trace_xfs_log_recover_buf_skip(log, buf_f);
		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);

fs/xfs/xfs_inode_item_recover.c

+29 −10

Original line number	Diff line number	Diff line
		@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts(
		STATIC void
		xfs_log_dinode_to_disk(
		struct xfs_log_dinode *from,
		struct xfs_dinode *to)
		struct xfs_dinode *to,
		xfs_lsn_t lsn)
		{
		to->di_magic = cpu_to_be16(from->di_magic);
		to->di_mode = cpu_to_be16(from->di_mode);
		@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk(
		to->di_flags2 = cpu_to_be64(from->di_flags2);
		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
		to->di_ino = cpu_to_be64(from->di_ino);
		to->di_lsn = cpu_to_be64(from->di_lsn);
		to->di_lsn = cpu_to_be64(lsn);
		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
		uuid_copy(&to->di_uuid, &from->di_uuid);
		to->di_flushiter = 0;
		@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2(
		}

		/*
		* If the inode has an LSN in it, recover the inode only if it's less
		* than the lsn of the transaction we are replaying. Note: we still
		* need to replay an owner change even though the inode is more recent
		* than the transaction as there is no guarantee that all the btree
		* blocks are more recent than this transaction, too.
		* If the inode has an LSN in it, recover the inode only if the on-disk
		* inode's LSN is older than the lsn of the transaction we are
		* replaying. We can have multiple checkpoints with the same start LSN,
		* so the current LSN being equal to the on-disk LSN doesn't necessarily
		* mean that the on-disk inode is more recent than the change being
		* replayed.
		*
		* We must check the current_lsn against the on-disk inode
		* here because the we can't trust the log dinode to contain a valid LSN
		* (see comment below before replaying the log dinode for details).
		*
		* Note: we still need to replay an owner change even though the inode
		* is more recent than the transaction as there is no guarantee that all
		* the btree blocks are more recent than this transaction, too.
		*/
		if (dip->di_version >= 3) {
		xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);

		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
		trace_xfs_log_recover_inode_skip(log, in_f);
		error = 0;
		goto out_owner_change;
		@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2(
		goto out_release;
		}

		/* recover the log dinode inode into the on disk inode */
		xfs_log_dinode_to_disk(ldip, dip);
		/*
		* Recover the log dinode inode into the on disk inode.
		*
		* The LSN in the log dinode is garbage - it can be zero or reflect
		* stale in-memory runtime state that isn't coherent with the changes
		* logged in this transaction or the changes written to the on-disk
		* inode. Hence we write the current lSN into the inode because that
		* matches what xfs_iflush() would write inode the inode when flushing
		* the changes in this transaction.
		*/
		xfs_log_dinode_to_disk(ldip, dip, current_lsn);

		fields = in_f->ilf_fields;
		if (fields & XFS_ILOG_DEV)

fs/xfs/xfs_log.c

+165 −86

Original line number	Diff line number	Diff line
		@@ -78,13 +78,12 @@ xlog_verify_iclog(
		STATIC void
		xlog_verify_tail_lsn(
		struct xlog *log,
		struct xlog_in_core *iclog,
		xfs_lsn_t tail_lsn);
		struct xlog_in_core *iclog);
		#else
		#define xlog_verify_dest_ptr(a,b)
		#define xlog_verify_grant_tail(a)
		#define xlog_verify_iclog(a,b,c)
		#define xlog_verify_tail_lsn(a,b,c)
		#define xlog_verify_tail_lsn(a,b)
		#endif

		STATIC int
		@@ -487,51 +486,80 @@ xfs_log_reserve(
		return error;
		}

		static bool
		__xlog_state_release_iclog(
		struct xlog *log,
		struct xlog_in_core *iclog)
		{
		lockdep_assert_held(&log->l_icloglock);

		if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
		/* update tail before writing to iclog */
		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);

		iclog->ic_state = XLOG_STATE_SYNCING;
		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
		xlog_verify_tail_lsn(log, iclog, tail_lsn);
		/* cycle incremented when incrementing curr_block */
		trace_xlog_iclog_syncing(iclog, _RET_IP_);
		return true;
		}

		ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
		return false;
		}

		/*
		* Flush iclog to disk if this is the last reference to the given iclog and the
		* it is in the WANT_SYNC state.
		*
		* If the caller passes in a non-zero @old_tail_lsn and the current log tail
		* does not match, there may be metadata on disk that must be persisted before
		* this iclog is written. To satisfy that requirement, set the
		* XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
		* log tail value.
		*
		* If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
		* log tail is updated correctly. NEED_FUA indicates that the iclog will be
		* written to stable storage, and implies that a commit record is contained
		* within the iclog. We need to ensure that the log tail does not move beyond
		* the tail that the first commit record in the iclog ordered against, otherwise
		* correct recovery of that checkpoint becomes dependent on future operations
		* performed on this iclog.
		*
		* Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
		* current tail into iclog. Once the iclog tail is set, future operations must
		* not modify it, otherwise they potentially violate ordering constraints for
		* the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
		* the iclog will get zeroed on activation of the iclog after sync, so we
		* always capture the tail lsn on the iclog on the first NEED_FUA release
		* regardless of the number of active reference counts on this iclog.
		*/

		int
		xlog_state_release_iclog(
		struct xlog *log,
		struct xlog_in_core *iclog)
		struct xlog_in_core *iclog,
		xfs_lsn_t old_tail_lsn)
		{
		xfs_lsn_t tail_lsn;
		lockdep_assert_held(&log->l_icloglock);

		trace_xlog_iclog_release(iclog, _RET_IP_);
		if (iclog->ic_state == XLOG_STATE_IOERROR)
		return -EIO;

		if (atomic_dec_and_test(&iclog->ic_refcnt) &&
		__xlog_state_release_iclog(log, iclog)) {
		/*
		* Grabbing the current log tail needs to be atomic w.r.t. the writing
		* of the tail LSN into the iclog so we guarantee that the log tail does
		* not move between deciding if a cache flush is required and writing
		* the LSN into the iclog below.
		*/
		if (old_tail_lsn \|\| iclog->ic_state == XLOG_STATE_WANT_SYNC) {
		tail_lsn = xlog_assign_tail_lsn(log->l_mp);

		if (old_tail_lsn && tail_lsn != old_tail_lsn)
		iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH;

		if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
		!iclog->ic_header.h_tail_lsn)
		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
		}

		if (!atomic_dec_and_test(&iclog->ic_refcnt))
		return 0;

		if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
		ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
		return 0;
		}

		iclog->ic_state = XLOG_STATE_SYNCING;
		if (!iclog->ic_header.h_tail_lsn)
		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
		xlog_verify_tail_lsn(log, iclog);
		trace_xlog_iclog_syncing(iclog, _RET_IP_);

		spin_unlock(&log->l_icloglock);
		xlog_sync(log, iclog);
		spin_lock(&log->l_icloglock);
		}

		return 0;
		}

		@@ -773,6 +801,21 @@ xfs_log_mount_cancel(
		xfs_log_unmount(mp);
		}

		/*
		* Flush out the iclog to disk ensuring that device caches are flushed and
		* the iclog hits stable storage before any completion waiters are woken.
		*/
		static inline int
		xlog_force_iclog(
		struct xlog_in_core *iclog)
		{
		atomic_inc(&iclog->ic_refcnt);
		iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
		if (iclog->ic_state == XLOG_STATE_ACTIVE)
		xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
		return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
		}

		/*
		* Wait for the iclog and all prior iclogs to be written disk as required by the
		* log force state machine. Waiting on ic_force_wait ensures iclog completions
		@@ -827,13 +870,6 @@ xlog_write_unmount_record(
		/* account for space used by record data */
		ticket->t_curr_res -= sizeof(ulf);

		/*
		* For external log devices, we need to flush the data device cache
		* first to ensure all metadata writeback is on stable storage before we
		* stamp the tail LSN into the unmount record.
		*/
		if (log->l_targ != log->l_mp->m_ddev_targp)
		blkdev_issue_flush(log->l_targ->bt_bdev);
		return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
		}

		@@ -865,18 +901,7 @@ xlog_unmount_write(

		spin_lock(&log->l_icloglock);
		iclog = log->l_iclog;
		atomic_inc(&iclog->ic_refcnt);
		if (iclog->ic_state == XLOG_STATE_ACTIVE)
		xlog_state_switch_iclogs(log, iclog, 0);
		else
		ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC \|\|
		iclog->ic_state == XLOG_STATE_IOERROR);
		/*
		* Ensure the journal is fully flushed and on stable storage once the
		* iclog containing the unmount record is written.
		*/
		iclog->ic_flags \|= (XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA);
		error = xlog_state_release_iclog(log, iclog);
		error = xlog_force_iclog(iclog);
		xlog_wait_on_iclog(iclog);

		if (tic) {
		@@ -1796,10 +1821,20 @@ xlog_write_iclog(
		* metadata writeback and causing priority inversions.
		*/
		iclog->ic_bio.bi_opf = REQ_OP_WRITE \| REQ_META \| REQ_SYNC \| REQ_IDLE;
		if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)
		if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
		iclog->ic_bio.bi_opf \|= REQ_PREFLUSH;
		/*
		* For external log devices, we also need to flush the data
		* device cache first to ensure all metadata writeback covered
		* by the LSN in this iclog is on stable storage. This is slow,
		* but it must complete before we issue the external log IO.
		*/
		if (log->l_targ != log->l_mp->m_ddev_targp)
		blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
		}
		if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
		iclog->ic_bio.bi_opf \|= REQ_FUA;

		iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA);

		if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
		@@ -2310,7 +2345,7 @@ xlog_write_copy_finish(
		return 0;

		release_iclog:
		error = xlog_state_release_iclog(log, iclog);
		error = xlog_state_release_iclog(log, iclog, 0);
		spin_unlock(&log->l_icloglock);
		return error;
		}
		@@ -2529,7 +2564,7 @@ xlog_write(
		ASSERT(optype & XLOG_COMMIT_TRANS);
		*commit_iclog = iclog;
		} else {
		error = xlog_state_release_iclog(log, iclog);
		error = xlog_state_release_iclog(log, iclog, 0);
		}
		spin_unlock(&log->l_icloglock);

		@@ -2567,6 +2602,7 @@ xlog_state_activate_iclog(
		memset(iclog->ic_header.h_cycle_data, 0,
		sizeof(iclog->ic_header.h_cycle_data));
		iclog->ic_header.h_lsn = 0;
		iclog->ic_header.h_tail_lsn = 0;
		}

		/*
		@@ -2967,7 +3003,7 @@ xlog_state_get_iclog_space(
		* reference to the iclog.
		*/
		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
		error = xlog_state_release_iclog(log, iclog);
		error = xlog_state_release_iclog(log, iclog, 0);
		spin_unlock(&log->l_icloglock);
		if (error)
		return error;
		@@ -3131,6 +3167,35 @@ xlog_state_switch_iclogs(
		log->l_iclog = iclog->ic_next;
		}

		/*
		* Force the iclog to disk and check if the iclog has been completed before
		* xlog_force_iclog() returns. This can happen on synchronous (e.g.
		* pmem) or fast async storage because we drop the icloglock to issue the IO.
		* If completion has already occurred, tell the caller so that it can avoid an
		* unnecessary wait on the iclog.
		*/
		static int
		xlog_force_and_check_iclog(
		struct xlog_in_core *iclog,
		bool *completed)
		{
		xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
		int error;

		*completed = false;
		error = xlog_force_iclog(iclog);
		if (error)
		return error;

		/*
		* If the iclog has already been completed and reused the header LSN
		* will have been rewritten by completion
		*/
		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
		*completed = true;
		return 0;
		}

		/*
		* Write out all data in the in-core log as of this exact moment in time.
		*
		@@ -3165,7 +3230,6 @@ xfs_log_force(
		{
		struct xlog *log = mp->m_log;
		struct xlog_in_core *iclog;
		xfs_lsn_t lsn;

		XFS_STATS_INC(mp, xs_log_force);
		trace_xfs_log_force(mp, 0, _RET_IP_);
		@@ -3193,38 +3257,32 @@ xfs_log_force(
		iclog = iclog->ic_prev;
		} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
		if (atomic_read(&iclog->ic_refcnt) == 0) {
		/*
		* We are the only one with access to this iclog.
		*
		* Flush it out now. There should be a roundoff of zero
		* to show that someone has already taken care of the
		* roundoff from the previous sync.
		*/
		atomic_inc(&iclog->ic_refcnt);
		lsn = be64_to_cpu(iclog->ic_header.h_lsn);
		xlog_state_switch_iclogs(log, iclog, 0);
		if (xlog_state_release_iclog(log, iclog))
		/* We have exclusive access to this iclog. */
		bool completed;

		if (xlog_force_and_check_iclog(iclog, &completed))
		goto out_error;

		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
		if (completed)
		goto out_unlock;
		} else {
		/*
		* Someone else is writing to this iclog.
		*
		* Use its call to flush out the data. However, the
		* other thread may not force out this LR, so we mark
		* it WANT_SYNC.
		* Someone else is still writing to this iclog, so we
		* need to ensure that when they release the iclog it
		* gets synced immediately as we may be waiting on it.
		*/
		xlog_state_switch_iclogs(log, iclog, 0);
		}
		} else {
		}

		/*
		* If the head iclog is not active nor dirty, we just attach
		* ourselves to the head and go to sleep if necessary.
		* The iclog we are about to wait on may contain the checkpoint pushed
		* by the above xlog_cil_force() call, but it may not have been pushed
		* to disk yet. Like the ACTIVE case above, we need to make sure caches
		* are flushed when this iclog is written.
		*/
		;
		}
		if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
		iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;

		if (flags & XFS_LOG_SYNC)
		return xlog_wait_on_iclog(iclog);
		@@ -3245,6 +3303,7 @@ xlog_force_lsn(
		bool already_slept)
		{
		struct xlog_in_core *iclog;
		bool completed;

		spin_lock(&log->l_icloglock);
		iclog = log->l_iclog;
		@@ -3258,7 +3317,8 @@ xlog_force_lsn(
		goto out_unlock;
		}

		if (iclog->ic_state == XLOG_STATE_ACTIVE) {
		switch (iclog->ic_state) {
		case XLOG_STATE_ACTIVE:
		/*
		* We sleep here if we haven't already slept (e.g. this is the
		* first time we've looked at the correct iclog buf) and the
		@@ -3281,12 +3341,31 @@ xlog_force_lsn(
		&log->l_icloglock);
		return -EAGAIN;
		}
		atomic_inc(&iclog->ic_refcnt);
		xlog_state_switch_iclogs(log, iclog, 0);
		if (xlog_state_release_iclog(log, iclog))
		if (xlog_force_and_check_iclog(iclog, &completed))
		goto out_error;
		if (log_flushed)
		*log_flushed = 1;
		if (completed)
		goto out_unlock;
		break;
		case XLOG_STATE_WANT_SYNC:
		/*
		* This iclog may contain the checkpoint pushed by the
		* xlog_cil_force_seq() call, but there are other writers still
		* accessing it so it hasn't been pushed to disk yet. Like the
		* ACTIVE case above, we need to make sure caches are flushed
		* when this iclog is written.
		*/
		iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
		break;
		default:
		/*
		* The entire checkpoint was written by the CIL force and is on
		* its way to disk already. It will be stable when it
		* completes, so we don't need to manipulate caches here at all.
		* We just need to wait for completion if necessary.
		*/
		break;
		}

		if (flags & XFS_LOG_SYNC)
		@@ -3559,9 +3638,9 @@ xlog_verify_grant_tail(
		STATIC void
		xlog_verify_tail_lsn(
		struct xlog *log,
		struct xlog_in_core *iclog,
		xfs_lsn_t tail_lsn)
		struct xlog_in_core *iclog)
		{
		xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
		int blocks;

		if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {

fs/xfs/xfs_log_cil.c

+11 −2

Original line number	Diff line number	Diff line
		@@ -654,8 +654,9 @@ xlog_cil_push_work(
		struct xfs_trans_header thdr;
		struct xfs_log_iovec lhdr;
		struct xfs_log_vec lvhdr = { NULL };
		xfs_lsn_t preflush_tail_lsn;
		xfs_lsn_t commit_lsn;
		xfs_lsn_t push_seq;
		xfs_csn_t push_seq;
		struct bio bio;
		DECLARE_COMPLETION_ONSTACK(bdev_flush);

		@@ -730,7 +731,15 @@ xlog_cil_push_work(
		* because we hold the flush lock exclusively. Hence we can now issue
		* a cache flush to ensure all the completed metadata in the journal we
		* are about to overwrite is on stable storage.
		*/
		*
		* Because we are issuing this cache flush before we've written the
		* tail lsn to the iclog, we can have metadata IO completions move the
		* tail forwards between the completion of this flush and the iclog
		* being written. In this case, we need to re-issue the cache flush
		* before the iclog write. To detect whether the log tail moves, sample
		* the tail LSN before we issue the flush.
		*/
		preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
		xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
		&bdev_flush);

		@@ -941,7 +950,7 @@ xlog_cil_push_work(
		* storage.
		*/
		commit_iclog->ic_flags \|= XLOG_ICL_NEED_FUA;
		xlog_state_release_iclog(log, commit_iclog);
		xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
		spin_unlock(&log->l_icloglock);
		return;