Commit 0a229c93 authored by Chandan Babu R

Merge tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.6-fixesA

xfs: fix cpu hotplug mess

Ritesh and Eric separately reported crashes in XFS's hook function for
CPU hot remove if the remove event races with a filesystem being
mounted.  I also noticed via generic/650 that once in a while the log
will shut down over an apparent overrun of a transaction reservation;
this turned out to be due to CIL percpu list aggregation failing to pick
up the percpu list items from a dying CPU.

Either way, the solution here is to eliminate the need for a CPU dying
hook by using a private cpumask to track which CPUs have added to their
percpu lists directly, and iterating with that mask.  This fixes the log
problems and (I think) solves a theoretical UAF bug in the inodegc code
too.
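
To illustrate the pattern (a minimal sketch only, not the actual XFS code; demo_pcp, demo_state, demo_queue and demo_drain are invented names for this example): a producer adds an item to its CPU's llist and then sets that CPU's bit in a private cpumask with release-style ordering, and a consumer walks only the CPUs present in the mask, so no CPU-dead callback is needed.

#include <linux/cpumask.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct demo_pcp {
	struct llist_head	list;
};

struct demo_state {
	struct demo_pcp __percpu *pcp;
	struct cpumask		nonempty_cpus;	/* cpus that queued items */
};

/* Producer: add an item to this CPU's list and mark the CPU in the mask. */
static void demo_queue(struct demo_state *st, struct llist_node *item)
{
	struct demo_pcp	*p;
	int		cpu;

	cpu = get_cpu();
	p = this_cpu_ptr(st->pcp);
	llist_add(item, &p->list);

	/*
	 * Pairs with smp_mb__after_atomic() in demo_drain() so that anyone
	 * who observes the mask bit also observes the list addition.  Only
	 * do the atomic set if the bit is not already set.
	 */
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu, &st->nonempty_cpus))
		cpumask_set_cpu(cpu, &st->nonempty_cpus);
	put_cpu();
}

/* Consumer: walk only CPUs that queued work; no hotplug hook needed. */
static void demo_drain(struct demo_state *st)
{
	int	cpu;

	for_each_cpu(cpu, &st->nonempty_cpus) {
		struct demo_pcp		*p = per_cpu_ptr(st->pcp, cpu);
		struct llist_node	*first;

		cpumask_clear_cpu(cpu, &st->nonempty_cpus);
		smp_mb__after_atomic();

		first = llist_del_all(&p->list);
		/* ... process 'first' ... */
	}
}

Because a CPU's bit stays set until a consumer drains its list, items queued on a CPU that later goes offline are still found by the next drain pass.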

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: remove CPU hotplug infrastructure
  xfs: remove the all-mounts list
  xfs: use per-mount cpumask to track nonempty percpu inodegc lists
  xfs: fix per-cpu CIL structure aggregation racing with dying cpus
parents da6f8410 ef7d9593
fs/xfs/xfs_icache.c (+28 −50)
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
 	int			cpu;
 	bool			ret = false;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
 	int			error = 0;
 
 	flush_workqueue(mp->m_inodegc_wq);
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		struct xfs_inodegc	*gc;
 
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
 						struct xfs_inodegc, work);
 	struct llist_node	*node = llist_del_all(&gc->list);
 	struct xfs_inode	*ip, *n;
+	struct xfs_mount	*mp = gc->mp;
 	unsigned int		nofs_flag;
 
-	ASSERT(gc->cpu == smp_processor_id());
+	/*
+	 * Clear the cpu mask bit and ensure that we have seen the latest
+	 * update of the gc structure associated with this CPU. This matches
+	 * with the release semantics used when setting the cpumask bit in
+	 * xfs_inodegc_queue.
+	 */
+	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+	smp_mb__after_atomic();
 
 	WRITE_ONCE(gc->items, 0);
 
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
 	nofs_flag = memalloc_nofs_save();
 
 	ip = llist_entry(node, struct xfs_inode, i_gclist);
-	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
 
 	WRITE_ONCE(gc->shrinker_hits, 0);
 	llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,6 +2065,7 @@ xfs_inodegc_queue(
 	struct xfs_inodegc	*gc;
 	int			items;
 	unsigned int		shrinker_hits;
+	unsigned int		cpu_nr;
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
@@ -2064,18 +2073,28 @@ xfs_inodegc_queue(
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
 
-	gc = get_cpu_ptr(mp->m_inodegc);
+	cpu_nr = get_cpu();
+	gc = this_cpu_ptr(mp->m_inodegc);
 	llist_add(&ip->i_gclist, &gc->list);
 	items = READ_ONCE(gc->items);
 	WRITE_ONCE(gc->items, items + 1);
 	shrinker_hits = READ_ONCE(gc->shrinker_hits);
 
+	/*
+	 * Ensure the list add is always seen by anyone who finds the cpumask
+	 * bit set. This effectively gives the cpumask bit set operation
+	 * release ordering semantics.
+	 */
+	smp_mb__before_atomic();
+	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
 	/*
 	 * We queue the work while holding the current CPU so that the work
 	 * is scheduled to run on this CPU.
 	 */
 	if (!xfs_is_inodegc_enabled(mp)) {
-		put_cpu_ptr(gc);
+		put_cpu();
 		return;
 	}
 
@@ -2085,7 +2104,7 @@ xfs_inodegc_queue(
 	trace_xfs_inodegc_queue(mp, __return_address);
 	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
 			queue_delay);
-	put_cpu_ptr(gc);
+	put_cpu();
 
 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
 		trace_xfs_inodegc_throttle(mp, __return_address);
@@ -2093,47 +2112,6 @@ xfs_inodegc_queue(
 	}
 }
 
-/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
-	struct xfs_mount	*mp,
-	unsigned int		dead_cpu)
-{
-	struct xfs_inodegc	*dead_gc, *gc;
-	struct llist_node	*first, *last;
-	unsigned int		count = 0;
-
-	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
-	cancel_delayed_work_sync(&dead_gc->work);
-
-	if (llist_empty(&dead_gc->list))
-		return;
-
-	first = dead_gc->list.first;
-	last = first;
-	while (last->next) {
-		last = last->next;
-		count++;
-	}
-	dead_gc->list.first = NULL;
-	dead_gc->items = 0;
-
-	/* Add pending work to current CPU */
-	gc = get_cpu_ptr(mp->m_inodegc);
-	llist_add_batch(first, last, &gc->list);
-	count += READ_ONCE(gc->items);
-	WRITE_ONCE(gc->items, count);
-
-	if (xfs_is_inodegc_enabled(mp)) {
-		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
-				0);
-	}
-	put_cpu_ptr(gc);
-}
-
 /*
  * We set the inode flag atomically with the radix tree tag.  Once we get tag
  * lookups on the radix tree, this inode flag can go away.
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
 	if (!xfs_is_inodegc_enabled(mp))
 		return 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list))
 			return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
 
 	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			unsigned int	h = READ_ONCE(gc->shrinker_hits);
fs/xfs/xfs_icache.h (+0 −1)
@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
 int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
 
 #endif
fs/xfs/xfs_log_cil.c (+16 −36)
@@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate(
 	struct xlog_cil_pcp	*cilpcp;
 	int			cpu;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
 
 		ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate(
 	if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
 		return;
 
-	for_each_online_cpu(cpu) {
+	/*
+	 * We can race with other cpus setting cil_pcpmask.  However, we've
+	 * atomically cleared PCP_SPACE which forces other threads to add to
+	 * the global space used count.  cil_pcpmask is a superset of cilpcp
+	 * structures that could have a nonzero space_used.
+	 */
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		int	old, prev;
 
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +560,7 @@ xlog_cil_insert_items(
 	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
 	int			space_used;
 	int			order;
+	unsigned int		cpu_nr;
 	struct xlog_cil_pcp	*cilpcp;
 
 	ASSERT(tp);
@@ -577,7 +584,12 @@ xlog_cil_insert_items(
 	 * can't be scheduled away between split sample/update operations that
 	 * are done without outside locking to serialise them.
 	 */
-	cilpcp = get_cpu_ptr(cil->xc_pcp);
+	cpu_nr = get_cpu();
+	cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+	/* Tell the future push that there was work added by this CPU. */
+	if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+		cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
 
 	/*
 	 * We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +675,7 @@ xlog_cil_insert_items(
 			continue;
 		list_add_tail(&lip->li_cil, &cilpcp->log_items);
 	}
-	put_cpu_ptr(cilpcp);
+	put_cpu();
 
 	/*
 	 * If we've overrun the reservation, dump the tx details before we move
@@ -1790,38 +1802,6 @@ xlog_cil_force_seq(
 	return 0;
 }
 
-/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
-	struct xlog		*log,
-	unsigned int		cpu)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xlog_cil_pcp	*cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
-	struct xfs_cil_ctx	*ctx;
-
-	down_write(&cil->xc_ctx_lock);
-	ctx = cil->xc_ctx;
-	if (ctx->ticket)
-		ctx->ticket->t_curr_res += cilpcp->space_reserved;
-	cilpcp->space_reserved = 0;
-
-	if (!list_empty(&cilpcp->log_items))
-		list_splice_init(&cilpcp->log_items, &ctx->log_items);
-	if (!list_empty(&cilpcp->busy_extents))
-		list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
-	atomic_add(cilpcp->space_used, &ctx->space_used);
-	cilpcp->space_used = 0;
-	up_write(&cil->xc_ctx_lock);
-}
-
 /*
  * Perform initial CIL structure initialisation.
  */
fs/xfs/xfs_log_priv.h (+6 −8)
@@ -231,6 +231,12 @@ struct xfs_cil_ctx {
 	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
 	atomic_t		order_id;
+
+	/*
+	 * CPUs that could have added items to the percpu CIL data.  Access is
+	 * coordinated with xc_ctx_lock.
+	 */
+	struct cpumask		cil_pcpmask;
 };
 
 /*
@@ -278,9 +284,6 @@ struct xfs_cil {
 	wait_queue_head_t	xc_push_wait;	/* background push throttle */
 
 	void __percpu		*xc_pcp;	/* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
-	struct list_head	xc_pcp_list;
-#endif
 } ____cacheline_aligned_in_smp;
 
 /* xc_flags bit values */
@@ -705,9 +708,4 @@ xlog_kvmalloc(
 	return p;
 }
 
-/*
- * CIL CPU dead notifier
- */
-void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
 #endif	/* __XFS_LOG_PRIV_H__ */
fs/xfs/xfs_mount.h (+4 −3)
@@ -60,6 +60,7 @@ struct xfs_error_cfg {
  * Per-cpu deferred inode inactivation GC lists.
  */
 struct xfs_inodegc {
+	struct xfs_mount	*mp;
 	struct llist_head	list;
 	struct delayed_work	work;
 	int			error;
@@ -67,9 +68,7 @@ struct xfs_inodegc {
 	/* approximate count of inodes in the list */
 	unsigned int		items;
 	unsigned int		shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
 	unsigned int		cpu;
-#endif
 };
 
 /*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
 	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
-	struct list_head	m_mount_list;	/* global mount list */
 	void __percpu		*m_inodegc;	/* percpu inodegc structures */
 
 	/*
@@ -249,6 +247,9 @@ typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+
+	/* cpus that have inodes queued for inactivation */
+	struct cpumask		m_inodegc_cpumask;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)