Commit 44149752 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'cgroup-for-6.3-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "This is a relatively big pull request this late in the cycle but the
  major contributor is the cpuset bug which is rather significant:

   - Fix several cpuset bugs including one where it wasn't applying the
     target cgroup when tasks are created with CLONE_INTO_CGROUP

  With a few smaller fixes:

   - Fix inversed locking order in cgroup1 freezer implementation

   - Fix garbage cpu.stat::core_sched.forceidle_usec reporting in the
     root cgroup"

* tag 'cgroup-for-6.3-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/cpuset: Make cpuset_attach_task() skip subpartitions CPUs for top_cpuset
  cgroup/cpuset: Add cpuset_can_fork() and cpuset_cancel_fork() methods
  cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
  cgroup/cpuset: Wake up cpuset_attach_wq tasks in cpuset_cancel_attach()
  cgroup,freezer: hold cpu_hotplug_lock before freezer_mutex
  cgroup/cpuset: Fix partition root's cpuset.cpus update bug
  cgroup: fix display of forceidle time at root
parents e44f45fe 7e27cb6a
Loading
Loading
Loading
Loading
+144 −34
Original line number Diff line number Diff line
@@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
	spin_unlock_irq(&callback_lock);

	if (adding || deleting)
		update_tasks_cpumask(parent, tmp->new_cpus);
		update_tasks_cpumask(parent, tmp->addmask);

	/*
	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	/*
	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
	 * to allocated cpumasks.
	 *
	 * Note that update_parent_subparts_cpumask() uses only addmask &
	 * delmask, but not new_cpus.
	 */
	tmp.addmask  = trialcs->subparts_cpus;
	tmp.delmask  = trialcs->effective_cpus;
	tmp.new_cpus = trialcs->cpus_allowed;
	tmp.new_cpus = NULL;
#endif

	retval = validate_change(cs, trialcs);
@@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	}
	spin_unlock_irq(&callback_lock);

#ifdef CONFIG_CPUMASK_OFFSTACK
	/* Now trialcs->cpus_allowed is available */
	tmp.new_cpus = trialcs->cpus_allowed;
#endif

	/* effective_cpus will be updated here */
	update_cpumasks_hier(cs, &tmp, false);

@@ -2445,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp)

static struct cpuset *cpuset_attach_old_cs;

/*
 * Check to see if a cpuset can accept a new task
 * For v1, cpus_allowed and mems_allowed can't be empty.
 * For v2, effective_cpus can't be empty.
 * Note that in v1, effective_cpus = cpus_allowed.
 */
static int cpuset_can_attach_check(struct cpuset *cs)
{
	if (cpumask_empty(cs->effective_cpus) ||
	   (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
		return -ENOSPC;
	return 0;
}

/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
@@ -2459,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)

	percpu_down_write(&cpuset_rwsem);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!is_in_v2_mode() &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	/*
	 * Task cannot be moved to a cpuset with empty effective cpus.
	 */
	if (cpumask_empty(cs->effective_cpus))
	/* Check to see if task is allowed in the cpuset */
	ret = cpuset_can_attach_check(cs);
	if (ret)
		goto out_unlock;

	cgroup_taskset_for_each(task, css, tset) {
@@ -2485,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	return ret;
@@ -2494,25 +2508,47 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);
	css_cs(css)->attach_in_progress--;
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
 * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;

static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
	percpu_rwsem_assert_held(&cpuset_rwsem);

	if (cs != &top_cpuset)
		guarantee_online_cpus(task, cpus_attach);
	else
		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
			       cs->subparts_cpus);
	/*
	 * can_attach beforehand should guarantee that this doesn't
	 * fail.  TODO: have a better way to handle failure here
	 */
	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
	cpuset_update_task_spread_flags(cs, task);
}

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_rwsem */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
@@ -2543,20 +2579,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		if (cs != &top_cpuset)
			guarantee_online_cpus(task, cpus_attach);
		else
			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flags(cs, task);
	}
	cgroup_taskset_for_each(task, css, tset)
		cpuset_attach_task(cs, task);

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
@@ -3247,6 +3271,68 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
	percpu_up_write(&cpuset_rwsem);
}

/*
 * In case the child is cloned into a cpuset different from its parent,
 * additional checks are done to see if the move is allowed.
 */
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{
	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
	bool same_cs;
	int ret;

	rcu_read_lock();
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs)
		return 0;

	lockdep_assert_held(&cgroup_mutex);
	percpu_down_write(&cpuset_rwsem);

	/* Check to see if task is allowed in the cpuset */
	ret = cpuset_can_attach_check(cs);
	if (ret)
		goto out_unlock;

	ret = task_can_attach(task, cs->effective_cpus);
	if (ret)
		goto out_unlock;

	ret = security_task_setscheduler(task);
	if (ret)
		goto out_unlock;

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	return ret;
}

static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{
	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
	bool same_cs;

	rcu_read_lock();
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs)
		return;

	percpu_down_write(&cpuset_rwsem);
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
@@ -3254,11 +3340,33 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 */
static void cpuset_fork(struct task_struct *task)
{
	if (task_css_is_root(task, cpuset_cgrp_id))
	struct cpuset *cs;
	bool same_cs;

	rcu_read_lock();
	cs = task_cs(task);
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs) {
		if (cs == &top_cpuset)
			return;

		set_cpus_allowed_ptr(task, current->cpus_ptr);
		task->mems_allowed = current->mems_allowed;
		return;
	}

	/* CLONE_INTO_CGROUP */
	percpu_down_write(&cpuset_rwsem);
	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
	cpuset_attach_task(cs, task);

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	percpu_up_write(&cpuset_rwsem);
}

struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3271,6 +3379,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach,
	.bind		= cpuset_bind,
	.can_fork	= cpuset_can_fork,
	.cancel_fork	= cpuset_cancel_fork,
	.fork		= cpuset_fork,
	.legacy_cftypes	= legacy_files,
	.dfl_cftypes	= dfl_files,
+5 −2
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/cpu.h>

/*
 * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,

	if (freeze) {
		if (!(freezer->state & CGROUP_FREEZING))
			static_branch_inc(&freezer_active);
			static_branch_inc_cpuslocked(&freezer_active);
		freezer->state |= state;
		freeze_cgroup(freezer);
	} else {
@@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
		if (!(freezer->state & CGROUP_FREEZING)) {
			freezer->state &= ~CGROUP_FROZEN;
			if (was_freezing)
				static_branch_dec(&freezer_active);
				static_branch_dec_cpuslocked(&freezer_active);
			unfreeze_cgroup(freezer);
		}
	}
@@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
	struct cgroup_subsys_state *pos;

	cpus_read_lock();
	/*
	 * Update all its descendants in pre-order traversal.  Each
	 * descendant will try to inherit its parent's FREEZING state as
@@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
	}
	rcu_read_unlock();
	mutex_unlock(&freezer_mutex);
	cpus_read_unlock();
}

static ssize_t freezer_write(struct kernfs_open_file *of,
+1 −3
Original line number Diff line number Diff line
@@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	cputime->stime = 0;
	cputime->utime = 0;
	cputime->sum_exec_runtime = 0;
	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;