!9679 cgroup/cpuset: Make cpuset hotplug processing synchronous (83ed2eb1) · Commits · EulixOS / Software / Kernel

include/linux/cpuset.h

+0 −3

Original line number	Diff line number	Diff line
		@@ -70,7 +70,6 @@ extern int cpuset_init(void);
		extern void cpuset_init_smp(void);
		extern void cpuset_force_rebuild(void);
		extern void cpuset_update_active_cpus(void);
		extern void cpuset_wait_for_hotplug(void);
		extern void inc_dl_tasks_cs(struct task_struct *task);
		extern void dec_dl_tasks_cs(struct task_struct *task);
		extern void cpuset_lock(void);
		@@ -189,8 +188,6 @@ static inline void cpuset_update_active_cpus(void)
		partition_sched_domains(1, NULL, NULL);
		}

		static inline void cpuset_wait_for_hotplug(void) { }

		static inline void inc_dl_tasks_cs(struct task_struct *task) { }
		static inline void dec_dl_tasks_cs(struct task_struct *task) { }
		static inline void cpuset_lock(void) { }

kernel/cgroup/cpuset.c

+55 −48

Original line number	Diff line number	Diff line
		@@ -208,6 +208,14 @@ struct cpuset {
		KABI_RESERVE(4)
		};

		/*
		* Legacy hierarchy call to cgroup_transfer_tasks() is handled
		* asynchronously.
		*
		* One instance of this request is allocated by
		* hotplug_update_tasks_legacy() for a cpuset that has become empty, queued
		* with schedule_work(), and freed by cpuset_migrate_tasks_workfn() once the
		* tasks have been moved out.
		*/
		struct cpuset_remove_tasks_struct {
		/* Work item; its handler is cpuset_migrate_tasks_workfn(). */
		struct work_struct work;
		/*
		* Cpuset whose tasks are to be removed. A css reference is taken
		* before the work is queued and dropped by the work function.
		*/
		struct cpuset *cs;
		};

		/*
		* Exclusive CPUs distributed out to sub-partitions of top_cpuset
		*/
		@@ -454,12 +462,6 @@ static DEFINE_SPINLOCK(callback_lock);

		static struct workqueue_struct *cpuset_migrate_mm_wq;

		/*
		* CPU / memory hotplug is handled asynchronously.
		*/
		static void cpuset_hotplug_workfn(struct work_struct *work);
		static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

		static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

		static inline void check_insane_mems_config(nodemask_t *nodes)
		@@ -545,22 +547,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
		rcu_read_lock();
		cs = task_cs(tsk);

		while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		while (!cpumask_intersects(cs->effective_cpus, pmask))
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
		/*
		* The top cpuset doesn't have any online cpu as a
		* consequence of a race between cpuset_hotplug_work
		* and cpu hotplug notifier. But we know the top
		* cpuset's effective_cpus is on its way to be
		* identical to cpu_online_mask.
		*/
		goto out_unlock;
		}
		}
		cpumask_and(pmask, pmask, cs->effective_cpus);

		out_unlock:
		cpumask_and(pmask, pmask, cs->effective_cpus);
		rcu_read_unlock();
		}

		@@ -1318,7 +1308,7 @@ static void rebuild_sched_domains_locked(void)
		/*
		* If we have raced with CPU hotplug, return early to avoid
		* passing doms with offlined cpu to partition_sched_domains().
		* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
		* Anyways, cpuset_handle_hotplug() will rebuild sched domains.
		*
		* With no CPUs in any subpartitions, top_cpuset's effective CPUs
		* should be the same as the active CPUs, so checking only top_cpuset
		@@ -1361,12 +1351,17 @@ static void rebuild_sched_domains_locked(void)
		}
		#endif /* CONFIG_SMP */

		void rebuild_sched_domains(void)
		static void rebuild_sched_domains_cpuslocked(void)
		{
		cpus_read_lock();
		mutex_lock(&cpuset_mutex);
		rebuild_sched_domains_locked();
		mutex_unlock(&cpuset_mutex);
		}

		void rebuild_sched_domains(void)
		{
		cpus_read_lock();
		rebuild_sched_domains_cpuslocked();
		cpus_read_unlock();
		}

		@@ -2091,14 +2086,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,

		/*
		* For partcmd_update without newmask, it is being called from
		* cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
		* Update the load balance flag and scheduling domain if
		* cpus_read_trylock() is successful.
		* cpuset_handle_hotplug(). Update the load balance flag and
		* scheduling domain accordingly.
		*/
		if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
		if ((cmd == partcmd_update) && !newmask)
		update_partition_sd_lb(cs, old_prs);
		cpus_read_unlock();
		}

		notify_partition_change(cs, old_prs);
		return 0;
		@@ -3611,8 +3603,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
		* proceeding, so that we don't end up keep removing tasks added
		* after execution capability is restored.
		*
		* cpuset_hotplug_work calls back into cgroup core via
		* cgroup_transfer_tasks() and waiting for it from a cgroupfs
		* cpuset_handle_hotplug may call back into cgroup core asynchronously
		* via cgroup_transfer_tasks() and waiting for it from a cgroupfs
		* operation like this one can lead to a deadlock through kernfs
		* active_ref protection. Let's break the protection. Losing the
		* protection is okay as we check whether @cs is online after
		@@ -3621,7 +3613,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
		*/
		css_get(&cs->css);
		kernfs_break_active_protection(of->kn);
		flush_work(&cpuset_hotplug_work);

		cpus_read_lock();
		mutex_lock(&cpuset_mutex);
		@@ -4387,6 +4378,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
		}
		}

		static void cpuset_migrate_tasks_workfn(struct work_struct *work)
		{
		struct cpuset_remove_tasks_struct *s;

		s = container_of(work, struct cpuset_remove_tasks_struct, work);
		remove_tasks_in_empty_cpuset(s->cs);
		css_put(&s->cs->css);
		kfree(s);
		}

		static void
		hotplug_update_tasks_legacy(struct cpuset *cs,
		struct cpumask new_cpus, nodemask_t new_mems,
		@@ -4426,12 +4427,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
		/*
		* Move tasks to the nearest ancestor with execution resources,
		* This is full cgroup operation which will also call back into
		* cpuset. Should be done outside any lock.
		* cpuset. Execute it asynchronously using workqueue.
		*/
		if (is_empty) {
		mutex_unlock(&cpuset_mutex);
		remove_tasks_in_empty_cpuset(cs);
		mutex_lock(&cpuset_mutex);
		if (is_empty && cs->css.cgroup->nr_populated_csets &&
		css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
		css_put(&cs->css);
		return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
		}
		}

		@@ -4564,8 +4574,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
		}

		/**
		* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
		* @work: unused
		* cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
		*
		* This function is called after either CPU or memory configuration has
		* changed and updates cpuset accordingly. The top_cpuset is always
		@@ -4579,8 +4588,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
		*
		* Note that CPU offlining during suspend is ignored. We don't modify
		* cpusets across suspend/resume cycles at all.
		*
		* CPU / memory hotplug is handled synchronously.
		*/
		static void cpuset_hotplug_workfn(struct work_struct *work)
		static void cpuset_handle_hotplug(void)
		{
		static cpumask_t new_cpus;
		static nodemask_t new_mems;
		@@ -4591,6 +4602,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
		if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

		lockdep_assert_cpus_held();
		mutex_lock(&cpuset_mutex);

		/* fetch the available cpus/mems and find out which changed how */
		@@ -4672,7 +4684,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
		/* rebuild sched domains if cpus_allowed has changed */
		if (cpus_updated \|\| force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
		rebuild_sched_domains_cpuslocked();
		}

		free_cpumasks(NULL, ptmp);
		@@ -4685,12 +4697,7 @@ void cpuset_update_active_cpus(void)
		* inside cgroup synchronization. Bounce actual hotplug processing
		* to a work item to avoid reverse locking order.
		*/
		schedule_work(&cpuset_hotplug_work);
		}

		void cpuset_wait_for_hotplug(void)
		{
		flush_work(&cpuset_hotplug_work);
		cpuset_handle_hotplug();
		}

		/*
		@@ -4701,7 +4708,7 @@ void cpuset_wait_for_hotplug(void)
		static int cpuset_track_online_nodes(struct notifier_block *self,
		unsigned long action, void *arg)
		{
		schedule_work(&cpuset_hotplug_work);
		cpuset_handle_hotplug();
		return NOTIFY_OK;
		}

kernel/cpu.c

+0 −48

Original line number	Diff line number	Diff line
		@@ -1209,52 +1209,6 @@ void __init cpuhp_threads_init(void)
		kthread_unpark(this_cpu_read(cpuhp_state.thread));
		}

		/*
		*
		* Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
		* protected region.
		*
		* The operation is still serialized against concurrent CPU hotplug via
		* cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
		* serialized against other hotplug related activity like adding or
		* removing of state callbacks and state instances, which invoke either the
		* startup or the teardown callback of the affected state.
		*
		* This is required for subsystems which are unfixable vs. CPU hotplug and
		* evade lock inversion problems by scheduling work which has to be
		* completed _before_ cpu_up()/_cpu_down() returns.
		*
		* Don't even think about adding anything to this for any new code or even
		* drivers. Its only purpose is to keep existing lock order trainwrecks
		* working.
		*
		* For cpu_down() there might be valid reasons to finish cleanups which are
		* not required to be done under cpu_hotplug_lock, but that's a different
		* story and would be not invoked via this.
		*/
		static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
		{
			/*
			 * With tasks frozen (suspend, hibernate) the cpuset hotplug
			 * worker cannot make progress, so waiting for it would wait
			 * forever. Skip the wait in that case.
			 */
			if (tasks_frozen)
				return;

			/*
			 * cpusets delegate hotplug operations to a worker to "solve"
			 * the lock order problems, so wait for that worker here.
			 *
			 * Without the wait the hotplug operation returns with
			 * inconsistent state, which could even be observed in user
			 * space when a new CPU is brought up: the CPU plug uevent would
			 * be delivered, and user space reacting on it would fail to
			 * move tasks to the newly plugged CPU until the work has
			 * finished, because up to that point the new CPU is not
			 * assignable in cpusets/cgroups. On unplug that's not
			 * necessarily a visible issue, but it is still inconsistent
			 * state, which is the real problem which needs to be "fixed".
			 * This can't prevent the transient state between scheduling the
			 * work and returning from waiting for it.
			 */
			cpuset_wait_for_hotplug();
		}

		#ifdef CONFIG_HOTPLUG_CPU
		#ifndef arch_clear_mm_cpumask_cpu
		#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
		@@ -1491,7 +1445,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
		*/
		lockup_detector_cleanup();
		arch_smt_update();
		cpu_up_down_serialize_trainwrecks(tasks_frozen);
		return ret;
		}

		@@ -1725,7 +1678,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
		out:
		cpus_write_unlock();
		arch_smt_update();
		cpu_up_down_serialize_trainwrecks(tasks_frozen);
		return ret;
		}

kernel/power/process.c

+0 −2

Original line number	Diff line number	Diff line
		@@ -194,8 +194,6 @@ void thaw_processes(void)
		__usermodehelper_set_disable_depth(UMH_FREEZING);
		thaw_workqueues();

		cpuset_wait_for_hotplug();

		read_lock(&tasklist_lock);
		for_each_process_thread(g, p) {
		/* No other threads should have PF_SUSPEND_TASK set */