Commit 0a229c93 authored by Chandan Babu R

Merge tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.6-fixesA

xfs: fix cpu hotplug mess

Ritesh and Eric separately reported crashes in XFS's hook function for
CPU hot remove if the remove event races with a filesystem being
mounted.  I also noticed via generic/650 that once in a while the log
will shut down over an apparent overrun of a transaction reservation;
this turned out to be due to CIL percpu list aggregation failing to pick
up the percpu list items from a dying CPU.

Either way, the solution here is to eliminate the need for a CPU dying
hook by using a private cpumask to track which CPUs have added to their
percpu lists directly, and iterating with that mask.  This fixes the log
problems and (I think) solves a theoretical UAF bug in the inodegc code
too.
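
To illustrate the pattern (a minimal sketch only, not the actual XFS code; demo_pcp, demo_state, demo_queue and demo_drain are invented names for this example): a producer adds an item to its CPU's llist and then sets that CPU's bit in a private cpumask with release-style ordering, and a consumer walks only the CPUs present in the mask, so no CPU-dead callback is needed.

#include <linux/cpumask.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct demo_pcp {
	struct llist_head	list;
};

struct demo_state {
	struct demo_pcp __percpu *pcp;
	struct cpumask		nonempty_cpus;	/* cpus that queued items */
};

/* Producer: add an item to this CPU's list and mark the CPU in the mask. */
static void demo_queue(struct demo_state *st, struct llist_node *item)
{
	struct demo_pcp	*p;
	int		cpu;

	cpu = get_cpu();
	p = this_cpu_ptr(st->pcp);
	llist_add(item, &p->list);

	/*
	 * Pairs with smp_mb__after_atomic() in demo_drain() so that anyone
	 * who observes the mask bit also observes the list addition.  Only
	 * do the atomic set if the bit is not already set.
	 */
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu, &st->nonempty_cpus))
		cpumask_set_cpu(cpu, &st->nonempty_cpus);
	put_cpu();
}

/* Consumer: walk only CPUs that queued work; no hotplug hook needed. */
static void demo_drain(struct demo_state *st)
{
	int	cpu;

	for_each_cpu(cpu, &st->nonempty_cpus) {
		struct demo_pcp		*p = per_cpu_ptr(st->pcp, cpu);
		struct llist_node	*first;

		cpumask_clear_cpu(cpu, &st->nonempty_cpus);
		smp_mb__after_atomic();

		first = llist_del_all(&p->list);
		/* ... process 'first' ... */
	}
}

Because a CPU's bit stays set until a consumer drains its list, items queued on a CPU that later goes offline are still found by the next drain pass.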

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: remove CPU hotplug infrastructure
  xfs: remove the all-mounts list
  xfs: use per-mount cpumask to track nonempty percpu inodegc lists
  xfs: fix per-cpu CIL structure aggregation racing with dying cpus
parents da6f8410 ef7d9593
fs/xfs/xfs_icache.c (+28 −50)
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
 	int			cpu;
 	bool			ret = false;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
 	int			error = 0;
 
 	flush_workqueue(mp->m_inodegc_wq);
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		struct xfs_inodegc	*gc;
 
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
 						struct xfs_inodegc, work);
 	struct llist_node	*node = llist_del_all(&gc->list);
 	struct xfs_inode	*ip, *n;
+	struct xfs_mount	*mp = gc->mp;
 	unsigned int		nofs_flag;
 
-	ASSERT(gc->cpu == smp_processor_id());
+	/*
+	 * Clear the cpu mask bit and ensure that we have seen the latest
+	 * update of the gc structure associated with this CPU. This matches
+	 * with the release semantics used when setting the cpumask bit in
+	 * xfs_inodegc_queue.
+	 */
+	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+	smp_mb__after_atomic();
 
 	WRITE_ONCE(gc->items, 0);
 
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
 	nofs_flag = memalloc_nofs_save();
 
 	ip = llist_entry(node, struct xfs_inode, i_gclist);
-	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
 
 	WRITE_ONCE(gc->shrinker_hits, 0);
 	llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,6 +2065,7 @@ xfs_inodegc_queue(
 	struct xfs_inodegc	*gc;
 	int			items;
 	unsigned int		shrinker_hits;
+	unsigned int		cpu_nr;
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
@@ -2064,18 +2073,28 @@ xfs_inodegc_queue(
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
 
-	gc = get_cpu_ptr(mp->m_inodegc);
+	cpu_nr = get_cpu();
+	gc = this_cpu_ptr(mp->m_inodegc);
 	llist_add(&ip->i_gclist, &gc->list);
 	items = READ_ONCE(gc->items);
 	WRITE_ONCE(gc->items, items + 1);
 	shrinker_hits = READ_ONCE(gc->shrinker_hits);
 
+	/*
+	 * Ensure the list add is always seen by anyone who finds the cpumask
+	 * bit set. This effectively gives the cpumask bit set operation
+	 * release ordering semantics.
+	 */
+	smp_mb__before_atomic();
+	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
 	/*
 	 * We queue the work while holding the current CPU so that the work
 	 * is scheduled to run on this CPU.
 	 */
 	if (!xfs_is_inodegc_enabled(mp)) {
-		put_cpu_ptr(gc);
+		put_cpu();
 		return;
 	}
 
@@ -2085,7 +2104,7 @@ xfs_inodegc_queue(
 	trace_xfs_inodegc_queue(mp, __return_address);
 	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
 			queue_delay);
-	put_cpu_ptr(gc);
+	put_cpu();
 
 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
 		trace_xfs_inodegc_throttle(mp, __return_address);
@@ -2093,47 +2112,6 @@ xfs_inodegc_queue(
 	}
 }
 
-/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
-	struct xfs_mount	*mp,
-	unsigned int		dead_cpu)
-{
-	struct xfs_inodegc	*dead_gc, *gc;
-	struct llist_node	*first, *last;
-	unsigned int		count = 0;
-
-	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
-	cancel_delayed_work_sync(&dead_gc->work);
-
-	if (llist_empty(&dead_gc->list))
-		return;
-
-	first = dead_gc->list.first;
-	last = first;
-	while (last->next) {
-		last = last->next;
-		count++;
-	}
-	dead_gc->list.first = NULL;
-	dead_gc->items = 0;
-
-	/* Add pending work to current CPU */
-	gc = get_cpu_ptr(mp->m_inodegc);
-	llist_add_batch(first, last, &gc->list);
-	count += READ_ONCE(gc->items);
-	WRITE_ONCE(gc->items, count);
-
-	if (xfs_is_inodegc_enabled(mp)) {
-		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
-				0);
-	}
-	put_cpu_ptr(gc);
-}
-
 /*
  * We set the inode flag atomically with the radix tree tag.  Once we get tag
  * lookups on the radix tree, this inode flag can go away.
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
 	if (!xfs_is_inodegc_enabled(mp))
 		return 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list))
 			return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
 
 	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			unsigned int	h = READ_ONCE(gc->shrinker_hits);
fs/xfs/xfs_icache.h (+0 −1)
@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
 int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
 
 #endif
fs/xfs/xfs_log_cil.c (+16 −36)
@@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate(
 	struct xlog_cil_pcp	*cilpcp;
 	int			cpu;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
 
 		ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate(
 	if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
 		return;
 
-	for_each_online_cpu(cpu) {
+	/*
+	 * We can race with other cpus setting cil_pcpmask.  However, we've
+	 * atomically cleared PCP_SPACE which forces other threads to add to
+	 * the global space used count.  cil_pcpmask is a superset of cilpcp
+	 * structures that could have a nonzero space_used.
+	 */
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		int	old, prev;
 
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +560,7 @@ xlog_cil_insert_items(
 	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
 	int			space_used;
 	int			order;
+	unsigned int		cpu_nr;
 	struct xlog_cil_pcp	*cilpcp;
 
 	ASSERT(tp);
@@ -577,7 +584,12 @@ xlog_cil_insert_items(
 	 * can't be scheduled away between split sample/update operations that
 	 * are done without outside locking to serialise them.
 	 */
-	cilpcp = get_cpu_ptr(cil->xc_pcp);
+	cpu_nr = get_cpu();
+	cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+	/* Tell the future push that there was work added by this CPU. */
+	if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+		cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
 
 	/*
 	 * We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +675,7 @@ xlog_cil_insert_items(
 			continue;
 		list_add_tail(&lip->li_cil, &cilpcp->log_items);
 	}
-	put_cpu_ptr(cilpcp);
+	put_cpu();
 
 	/*
 	 * If we've overrun the reservation, dump the tx details before we move
@@ -1790,38 +1802,6 @@ xlog_cil_force_seq(
 	return 0;
 }
 
-/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
-	struct xlog		*log,
-	unsigned int		cpu)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xlog_cil_pcp	*cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
-	struct xfs_cil_ctx	*ctx;
-
-	down_write(&cil->xc_ctx_lock);
-	ctx = cil->xc_ctx;
-	if (ctx->ticket)
-		ctx->ticket->t_curr_res += cilpcp->space_reserved;
-	cilpcp->space_reserved = 0;
-
-	if (!list_empty(&cilpcp->log_items))
-		list_splice_init(&cilpcp->log_items, &ctx->log_items);
-	if (!list_empty(&cilpcp->busy_extents))
-		list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
-	atomic_add(cilpcp->space_used, &ctx->space_used);
-	cilpcp->space_used = 0;
-	up_write(&cil->xc_ctx_lock);
-}
-
 /*
  * Perform initial CIL structure initialisation.
  */
fs/xfs/xfs_log_priv.h (+6 −8)
@@ -231,6 +231,12 @@ struct xfs_cil_ctx {
 	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
 	atomic_t		order_id;
+
+	/*
+	 * CPUs that could have added items to the percpu CIL data.  Access is
+	 * coordinated with xc_ctx_lock.
+	 */
+	struct cpumask		cil_pcpmask;
 };
 
 /*
@@ -278,9 +284,6 @@ struct xfs_cil {
 	wait_queue_head_t	xc_push_wait;	/* background push throttle */
 
 	void __percpu		*xc_pcp;	/* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
-	struct list_head	xc_pcp_list;
-#endif
 } ____cacheline_aligned_in_smp;
 
 /* xc_flags bit values */
@@ -705,9 +708,4 @@ xlog_kvmalloc(
 	return p;
 }
 
-/*
- * CIL CPU dead notifier
- */
-void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
 #endif	/* __XFS_LOG_PRIV_H__ */
fs/xfs/xfs_mount.h (+4 −3)
@@ -60,6 +60,7 @@ struct xfs_error_cfg {
  * Per-cpu deferred inode inactivation GC lists.
  */
 struct xfs_inodegc {
+	struct xfs_mount	*mp;
 	struct llist_head	list;
 	struct delayed_work	work;
 	int			error;
@@ -67,9 +68,7 @@ struct xfs_inodegc {
 	/* approximate count of inodes in the list */
 	unsigned int		items;
 	unsigned int		shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
 	unsigned int		cpu;
-#endif
 };
 
 /*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
 	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
-	struct list_head	m_mount_list;	/* global mount list */
 	void __percpu		*m_inodegc;	/* percpu inodegc structures */
 
 	/*
@@ -249,6 +247,9 @@ typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+
+	/* cpus that have inodes queued for inactivation */
+	struct cpumask		m_inodegc_cpumask;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)