Commit 3dbdb38e authored by Linus Torvalds
Browse files
Pull cgroup updates from Tejun Heo:

 - cgroup.kill is added which implements atomic killing of the whole
   subtree.

   Down the line, this should be able to replace the multiple userland
   implementations of "keep killing till empty".

 - PSI can now be turned off at boot time to avoid overhead for
   configurations which don't care about PSI.

* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: make per-cgroup pressure stall tracking configurable
  cgroup: Fix kernel-doc
  cgroup: inline cgroup_task_freeze()
  tests/cgroup: test cgroup.kill
  tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
  tests/cgroup: use cgroup.kill in cg_killall()
  docs/cgroup: add entry for cgroup.kill
  cgroup: introduce cgroup.kill
parents e267992f 3958e2d0
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -953,6 +953,21 @@ All cgroup core files are prefixed with "cgroup."
	it's possible to delete a frozen (and empty) cgroup, as well as
	create new sub-cgroups.

  cgroup.kill
	A write-only single value file which exists in non-root cgroups.
	The only allowed value is "1".

	Writing "1" to the file causes the cgroup and all descendant cgroups to
	be killed. This means that all processes located in the affected cgroup
	tree will be killed via SIGKILL.

	Killing a cgroup tree will deal with concurrent forks appropriately and
	is protected against migrations.

	In a threaded cgroup, writing this file fails with EOPNOTSUPP as
	killing cgroups is a process directed operation, i.e. it affects
	the whole thread-group.

Controllers
===========

+7 −2
Original line number Diff line number Diff line
@@ -497,16 +497,21 @@
	ccw_timeout_log	[S390]
			See Documentation/s390/common_io.rst for details.

	cgroup_disable=	[KNL] Disable a particular controller
			Format: {name of the controller(s) to disable}
	cgroup_disable=	[KNL] Disable a particular controller or optional feature
			Format: {name of the controller(s) or feature(s) to disable}
			The effects of cgroup_disable=foo are:
			- foo isn't auto-mounted if you mount all cgroups in
			  a single hierarchy
			- foo isn't visible as an individually mountable
			  subsystem
			- if foo is an optional feature then the feature is
			  disabled and corresponding cgroup files are not
			  created
			{Currently only "memory" controller deal with this and
			cut the overhead, others just disable the usage. So
			only cgroup_disable=memory is actually worthy}
			Specifying "pressure" disables the per-cgroup pressure
			stall information accounting feature

	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
			Format: { { controller | "all" | "named" }
+4 −0
Original line number Diff line number Diff line
@@ -71,6 +71,9 @@ enum {

	/* Cgroup is frozen. */
	CGRP_FROZEN,

	/* Control group has to be killed. */
	CGRP_KILL,
};

/* cgroup_root->flags */
@@ -110,6 +113,7 @@ enum {
	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */
	CFTYPE_PRESSURE		= (1 << 6),	/* only if pressure feature is enabled */

	/* internal flags, do not use outside cgroup core proper */
	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
+7 −18
Original line number Diff line number Diff line
@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
	return &cgrp->psi;
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
{
	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
	return NULL;
}

/*
 * Fallback for configurations where the real implementation is compiled
 * out (this hunk sits in the stub branch of the header — the sibling
 * cgroup_psi() stub above returns NULL): PSI is never available here.
 */
static inline bool cgroup_psi_enabled(void)
{
	return false;
}

static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
					       struct cgroup *ancestor)
{
@@ -906,20 +913,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
				 struct cgroup *dst);

/*
 * cgroup_task_freeze - must @task be put into the cgroup frozen state?
 *
 * Kernel threads (PF_KTHREAD) are never frozen.  For userspace tasks,
 * report whether the task's default-hierarchy cgroup currently has
 * CGRP_FREEZE set; the cgroup pointer is only stable under RCU, hence
 * the rcu_read_lock() around the lookup.
 */
static inline bool cgroup_task_freeze(struct task_struct *task)
{
	bool ret;

	if (task->flags & PF_KTHREAD)
		return false;

	rcu_read_lock();
	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
	rcu_read_unlock();

	return ret;
}

static inline bool cgroup_task_frozen(struct task_struct *task)
{
	return task->frozen;
@@ -929,10 +922,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task)

static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
/* Stub branch (freezer support compiled out): tasks never need freezing. */
static inline bool cgroup_task_freeze(struct task_struct *task)
{
	return false;
}
static inline bool cgroup_task_frozen(struct task_struct *task)
{
	return false;
+164 −16
Original line number Diff line number Diff line
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

/*
 * cgroup optional features — capabilities that can be switched off at
 * boot time with cgroup_disable= even though they are compiled in.
 */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
	OPT_FEATURE_PRESSURE,
#endif
	OPT_FEATURE_COUNT
};

/* Boot-parameter token for each feature, indexed by enum cgroup_opt_features. */
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
	"pressure",
#endif
};

/*
 * Bitmask of disabled optional features: bit i corresponds to feature i
 * above; set from cgroup_disable= parsing, read via cgroup_psi_enabled().
 */
static u16 cgroup_feature_disable_mask __read_mostly;

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}

/**
 * cgroup_taskset_migrate - migrate a taskset
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
	psi_trigger_replace(&of->priv, NULL);
}

/*
 * cgroup_psi_enabled - is per-cgroup pressure stall accounting active?
 *
 * True unless "pressure" was disabled on the kernel command line via
 * cgroup_disable= (which sets the corresponding bit in
 * cgroup_feature_disable_mask).
 */
bool cgroup_psi_enabled(void)
{
	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}

#else /* CONFIG_PSI */
/* PSI compiled out entirely: the pressure feature can never be enabled. */
bool cgroup_psi_enabled(void)
{
	return false;
}

#endif /* CONFIG_PSI */

static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
	return nbytes;
}

/*
 * __cgroup_kill - send SIGKILL to all live userspace processes in one cgroup
 * @cgrp: cgroup whose member processes are to be killed
 *
 * Caller must hold cgroup_mutex.  CGRP_KILL is set on @cgrp for the
 * duration of the walk so that cgroup_post_fork() can observe it and
 * immediately take down children forked concurrently with the kill
 * (see the CGRP_KILL test in cgroup_post_fork() in this commit).
 */
static void __cgroup_kill(struct cgroup *cgrp)
{
	struct css_task_iter it;
	struct task_struct *task;

	lockdep_assert_held(&cgroup_mutex);

	/* Mark the cgroup as being killed; the flag is updated under css_set_lock. */
	spin_lock_irq(&css_set_lock);
	set_bit(CGRP_KILL, &cgrp->flags);
	spin_unlock_irq(&css_set_lock);

	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
	while ((task = css_task_iter_next(&it))) {
		/* Ignore kernel threads here. */
		if (task->flags & PF_KTHREAD)
			continue;

		/* Skip tasks that are already dying. */
		if (__fatal_signal_pending(task))
			continue;

		send_sig(SIGKILL, task, 0);
	}
	css_task_iter_end(&it);

	/* Kill pass done; drop the flag so later forks proceed normally. */
	spin_lock_irq(&css_set_lock);
	clear_bit(CGRP_KILL, &cgrp->flags);
	spin_unlock_irq(&css_set_lock);
}

/*
 * cgroup_kill - kill @cgrp and every live descendant cgroup
 * @cgrp: root of the subtree to kill
 *
 * Caller must hold cgroup_mutex.  Walks the live descendants in
 * pre-order and kills each cgroup's processes in turn.
 */
static void cgroup_kill(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	struct cgroup *dsct;

	lockdep_assert_held(&cgroup_mutex);

	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
		__cgroup_kill(dsct);
}

/*
 * Write handler for the cgroup.kill interface file.
 *
 * The only accepted value is "1", which kills the whole subtree rooted
 * at the cgroup the file belongs to.
 *
 * Returns @nbytes on success; otherwise the kstrtoint() error for
 * unparsable input, -ERANGE for any value other than 1, -ENOENT if the
 * cgroup is no longer live, or -EOPNOTSUPP for threaded cgroups.
 */
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	ssize_t ret = 0;
	int kill;
	struct cgroup *cgrp;

	ret = kstrtoint(strstrip(buf), 0, &kill);
	if (ret)
		return ret;

	if (kill != 1)
		return -ERANGE;

	/* Pin a live cgroup (takes cgroup_mutex); fails if it is being removed. */
	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	/*
	 * Killing is a process directed operation, i.e. the whole thread-group
	 * is taken down so act like we do for cgroup.procs and only make this
	 * writable in non-threaded cgroups.
	 */
	if (cgroup_is_threaded(cgrp))
		ret = -EOPNOTSUPP;
	else
		cgroup_kill(cgrp);

	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

static int cgroup_file_open(struct kernfs_open_file *of)
{
	struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

		WARN_ON(cft->ss || cft->kf_ops);

		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
@@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
		.seq_show = cgroup_freeze_show,
		.write = cgroup_freeze_write,
	},
	{
		.name = "cgroup.kill",
		.flags = CFTYPE_NOT_ON_ROOT,
		.write = cgroup_kill_write,
	},
	{
		.name = "cpu.stat",
		.seq_show = cpu_stat_show,
@@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
#ifdef CONFIG_PSI
	{
		.name = "io.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
	},
	{
		.name = "memory.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
	},
	{
		.name = "cpu.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	unsigned long cgrp_flags = 0;
	bool kill = false;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		if (kargs->cgrp)
			cgrp_flags = kargs->cgrp->flags;
		else
			cgrp_flags = cset->dfl_cgrp->flags;

		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
@@ -6099,12 +6221,13 @@ void cgroup_post_fork(struct task_struct *child,
		cset = NULL;
	}

	if (!(child->flags & PF_KTHREAD)) {
		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
			/*
	 * If the cgroup has to be frozen, the new task has too.  Let's set
	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
	 * frozen state.
			 * If the cgroup has to be frozen, the new task has
			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
			 * get the task into the frozen state.
			 */
	if (unlikely(cgroup_task_freeze(child))) {
			spin_lock(&child->sighand->siglock);
			WARN_ON_ONCE(child->frozen);
			child->jobctl |= JOBCTL_TRAP_FREEZE;
@@ -6113,11 +6236,19 @@ void cgroup_post_fork(struct task_struct *child,
			/*
			 * Calling cgroup_update_frozen() isn't required here,
			 * because it will be called anyway a bit later from
		 * do_freezer_trap(). So we avoid cgroup's transient switch
		 * from the frozen state and back.
			 * do_freezer_trap(). So we avoid cgroup's transient
			 * switch from the frozen state and back.
			 */
		}

		/*
		 * If the cgroup is to be killed notice it now and take the
		 * child down right after we finished preparing it for
		 * userspace.
		 */
		kill = test_bit(CGRP_KILL, &cgrp_flags);
	}

	spin_unlock_irq(&css_set_lock);

	/*
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
		put_css_set(rcset);
	}

	/* Cgroup has to be killed so take down child immediately. */
	if (unlikely(kill))
		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

	cgroup_css_set_put_fork(kargs);
}

@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
	cset->nr_tasks--;

	WARN_ON_ONCE(cgroup_task_frozen(tsk));
	if (unlikely(cgroup_task_freeze(tsk)))
	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
			pr_info("Disabling %s control group subsystem\n",
				ss->name);
		}

		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
			if (strcmp(token, cgroup_opt_feature_names[i]))
				continue;
			cgroup_feature_disable_mask |= 1 << i;
			pr_info("Disabling %s control group feature\n",
				cgroup_opt_feature_names[i]);
			break;
		}
	}
	return 1;
}
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

Loading