Unverified commit 83ed2eb1, authored by openeuler-ci-bot and committed by Gitee
Browse files

!9679 cgroup/cpuset: Make cpuset hotplug processing synchronous

Merge Pull Request from: @ci-robot 
 
PR sync from: Chen Ridong <chenridong@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/4BFLPF7EGDRZMUJDNQRBWGAHVQZBP54F/ 
Make cpuset CPU/memory hotplug processing synchronous instead of deferring it to the cpuset_hotplug_work work item.

Waiman Long (1):
  cgroup/cpuset: Make cpuset hotplug processing synchronous


-- 
2.34.1
 
https://gitee.com/openeuler/kernel/issues/I9ER36 
 
Link:https://gitee.com/openeuler/kernel/pulls/9679

 

Reviewed-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Zhang Peng <zhangpeng362@huawei.com>
parents b206e4b3 a03f2f9b
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_wait_for_hotplug(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
@@ -189,8 +188,6 @@ static inline void cpuset_update_active_cpus(void)
	partition_sched_domains(1, NULL, NULL);
}

static inline void cpuset_wait_for_hotplug(void) { }

static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
+55 −48
Original line number Diff line number Diff line
@@ -208,6 +208,14 @@ struct cpuset {
	KABI_RESERVE(4)
};

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	/* Work item; initialized with cpuset_migrate_tasks_workfn() as handler */
	struct work_struct work;
	/* Cpuset whose tasks are to be moved out by the work handler */
	struct cpuset *cs;
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
@@ -454,12 +462,6 @@ static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -545,22 +547,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
	while (!cpumask_intersects(cs->effective_cpus, pmask))
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	cpumask_and(pmask, pmask, cs->effective_cpus);
	rcu_read_unlock();
}

@@ -1318,7 +1308,7 @@ static void rebuild_sched_domains_locked(void)
	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
@@ -1361,12 +1351,17 @@ static void rebuild_sched_domains_locked(void)
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
static void rebuild_sched_domains_cpuslocked(void)
{
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

void rebuild_sched_domains(void)
{
	cpus_read_lock();
	rebuild_sched_domains_cpuslocked();
	cpus_read_unlock();
}

@@ -2091,14 +2086,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,

	/*
	 * For partcmd_update without newmask, it is being called from
	 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
	 * Update the load balance flag and scheduling domain if
	 * cpus_read_trylock() is successful.
	 * cpuset_handle_hotplug(). Update the load balance flag and
	 * scheduling domain accordingly.
	 */
	if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
	if ((cmd == partcmd_update) && !newmask)
		update_partition_sd_lb(cs, old_prs);
		cpus_read_unlock();
	}

	notify_partition_change(cs, old_prs);
	return 0;
@@ -3611,8 +3603,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
@@ -3621,7 +3613,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
@@ -4387,6 +4378,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
	}
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4426,12 +4427,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
	/*
	 * Move tasks to the nearest ancestor with execution resources,
	 * This is full cgroup operation which will also call back into
	 * cpuset. Should be done outside any lock.
	 * cpuset. Execute it asynchronously using workqueue.
	 */
	if (is_empty) {
		mutex_unlock(&cpuset_mutex);
		remove_tasks_in_empty_cpuset(cs);
		mutex_lock(&cpuset_mutex);
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

@@ -4564,8 +4574,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: unused
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4579,8 +4588,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
static void cpuset_handle_hotplug(void)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
@@ -4591,6 +4602,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	lockdep_assert_cpus_held();
	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
@@ -4672,7 +4684,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
		rebuild_sched_domains_cpuslocked();
	}

	free_cpumasks(NULL, ptmp);
@@ -4685,12 +4697,7 @@ void cpuset_update_active_cpus(void)
	 * inside cgroup synchronization.  Hotplug processing is now done
	 * synchronously by calling cpuset_handle_hotplug() directly.
	 */
	schedule_work(&cpuset_hotplug_work);
}

void cpuset_wait_for_hotplug(void)
{
	flush_work(&cpuset_hotplug_work);
	cpuset_handle_hotplug();
}

/*
@@ -4701,7 +4708,7 @@ void cpuset_wait_for_hotplug(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	cpuset_handle_hotplug();
	return NOTIFY_OK;
}

+0 −48
Original line number Diff line number Diff line
@@ -1209,52 +1209,6 @@ void __init cpuhp_threads_init(void)
	kthread_unpark(this_cpu_read(cpuhp_state.thread));
}

/*
 * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
 * protected region.
 *
 * cpu_add_remove_lock (i.e. CPU map protection) still serializes this
 * against concurrent CPU hotplug.  It is _not_ serialized against other
 * hotplug related activity such as adding or removing state callbacks and
 * state instances, which invoke either the startup or the teardown
 * callback of the affected state.
 *
 * This exists for subsystems which are unfixable vs. CPU hotplug and evade
 * lock inversion problems by scheduling work which has to be completed
 * _before_ cpu_up()/_cpu_down() returns.
 *
 * Do not add anything to this for new code or drivers.  Its only purpose
 * is to keep existing lock order trainwrecks working.
 *
 * cpu_down() might have valid reasons to finish cleanups which need not be
 * done under cpu_hotplug_lock, but that is a different story and would not
 * be invoked via this function.
 */
static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
{
	/*
	 * With tasks frozen (suspend, hibernate), waiting for the cpuset
	 * worker would wait forever, so skip the wait entirely.
	 */
	if (tasks_frozen)
		return;

	/*
	 * cpusets delegate hotplug operations to a worker to "solve" the
	 * lock order problems; wait for that worker here.
	 *
	 * Without the wait the hotplug operation returns with inconsistent
	 * state, which could even be observed in user space when a new CPU
	 * is brought up: the CPU plug uevent would be delivered and user
	 * space reacting on it would fail to move tasks to the newly plugged
	 * CPU until the work has finished, because up to that point the new
	 * CPU is not assignable in cpusets/cgroups.  On unplug that is not
	 * necessarily a visible issue, but it is still inconsistent state,
	 * which is the real problem that needs to be "fixed".  This can't
	 * prevent the transient state between scheduling the work and
	 * returning from waiting for it.
	 */
	cpuset_wait_for_hotplug();
}

#ifdef CONFIG_HOTPLUG_CPU
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1491,7 +1445,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
	 */
	lockup_detector_cleanup();
	arch_smt_update();
	cpu_up_down_serialize_trainwrecks(tasks_frozen);
	return ret;
}

@@ -1725,7 +1678,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
out:
	cpus_write_unlock();
	arch_smt_update();
	cpu_up_down_serialize_trainwrecks(tasks_frozen);
	return ret;
}

+0 −2
Original line number Diff line number Diff line
@@ -194,8 +194,6 @@ void thaw_processes(void)
	__usermodehelper_set_disable_depth(UMH_FREEZING);
	thaw_workqueues();

	cpuset_wait_for_hotplug();

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
		/* No other threads should have PF_SUSPEND_TASK set */