Merge branch 'for-5.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup (d1587f7b) · Commits · EulixOS / Software / Kernel

kernel/cgroup/cgroup-internal.h

+19 −0

Original line number	Diff line number	Diff line
		@@ -65,6 +65,25 @@ static inline struct cgroup_fs_context cgroup_fc2context(struct fs_context fc)
		return container_of(kfc, struct cgroup_fs_context, kfc);
		}

		struct cgroup_pidlist;

		struct cgroup_file_ctx {
		struct cgroup_namespace *ns;

		struct {
		void *trigger;
		} psi;

		struct {
		bool started;
		struct css_task_iter iter;
		} procs;

		struct {
		struct cgroup_pidlist *pidlist;
		} procs1;
		};

		/*
		* A cgroup can be associated with multiple css_sets as different tasks may
		* belong to different cgroups on different hierarchies. In the other

kernel/cgroup/cgroup-v1.c

+18 −15

Original line number	Diff line number	Diff line
		@@ -394,6 +394,7 @@ static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
		* next pid to display, if any
		*/
		struct kernfs_open_file *of = s->private;
		struct cgroup_file_ctx *ctx = of->priv;
		struct cgroup *cgrp = seq_css(s)->cgroup;
		struct cgroup_pidlist *l;
		enum cgroup_filetype type = seq_cft(s)->private;
		@@ -403,25 +404,24 @@ static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
		mutex_lock(&cgrp->pidlist_mutex);

		/*
		* !NULL @of->priv indicates that this isn't the first start()
		* after open. If the matching pidlist is around, we can use that.
		* Look for it. Note that @of->priv can't be used directly. It
		* could already have been destroyed.
		* !NULL @ctx->procs1.pidlist indicates that this isn't the first
		* start() after open. If the matching pidlist is around, we can use
		* that. Look for it. Note that @ctx->procs1.pidlist can't be used
		* directly. It could already have been destroyed.
		*/
		if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);
		if (ctx->procs1.pidlist)
		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);

		/*
		* Either this is the first start() after open or the matching
		* pidlist has been destroyed inbetween. Create a new one.
		*/
		if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
		(struct cgroup_pidlist **)&of->priv);
		if (!ctx->procs1.pidlist) {
		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
		if (ret)
		return ERR_PTR(ret);
		}
		l = of->priv;
		l = ctx->procs1.pidlist;

		if (pid) {
		int end = l->length;
		@@ -449,7 +449,8 @@ static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
		static void cgroup_pidlist_stop(struct seq_file s, void v)
		{
		struct kernfs_open_file *of = s->private;
		struct cgroup_pidlist *l = of->priv;
		struct cgroup_file_ctx *ctx = of->priv;
		struct cgroup_pidlist *l = ctx->procs1.pidlist;

		if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
		@@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file s, void v)
		static void cgroup_pidlist_next(struct seq_file s, void v, loff_t pos)
		{
		struct kernfs_open_file *of = s->private;
		struct cgroup_pidlist *l = of->priv;
		struct cgroup_file_ctx *ctx = of->priv;
		struct cgroup_pidlist *l = ctx->procs1.pidlist;
		pid_t *p = v;
		pid_t *end = l->list + l->length;
		/*
		@@ -504,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
		goto out_unlock;

		/*
		* Even if we're attaching all tasks in the thread group, we only
		* need to check permissions on one of them.
		* Even if we're attaching all tasks in the thread group, we only need
		* to check permissions on one of them. Check permissions using the
		* credentials from file open to protect against inherited fd attacks.
		*/
		cred = current_cred();
		cred = of->file->f_cred;
		tcred = get_task_cred(task);
		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
		!uid_eq(cred->euid, tcred->uid) &&

kernel/cgroup/cgroup.c

+60 −28

Original line number	Diff line number	Diff line
		@@ -3630,6 +3630,7 @@ static int cgroup_cpu_pressure_show(struct seq_file seq, void v)
		static ssize_t cgroup_pressure_write(struct kernfs_open_file of, char buf,
		size_t nbytes, enum psi_res res)
		{
		struct cgroup_file_ctx *ctx = of->priv;
		struct psi_trigger *new;
		struct cgroup *cgrp;
		struct psi_group *psi;
		@@ -3648,7 +3649,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file of, char buf,
		return PTR_ERR(new);
		}

		psi_trigger_replace(&of->priv, new);
		psi_trigger_replace(&ctx->psi.trigger, new);

		cgroup_put(cgrp);

		@@ -3679,12 +3680,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
		static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
		poll_table *pt)
		{
		return psi_trigger_poll(&of->priv, of->file, pt);
		struct cgroup_file_ctx *ctx = of->priv;

		return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
		}

		static void cgroup_pressure_release(struct kernfs_open_file *of)
		{
		psi_trigger_replace(&of->priv, NULL);
		struct cgroup_file_ctx *ctx = of->priv;

		psi_trigger_replace(&ctx->psi.trigger, NULL);
		}

		bool cgroup_psi_enabled(void)
		@@ -3811,24 +3816,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file of, char buf,
		static int cgroup_file_open(struct kernfs_open_file *of)
		{
		struct cftype *cft = of_cft(of);
		struct cgroup_file_ctx *ctx;
		int ret;

		if (cft->open)
		return cft->open(of);
		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
		return -ENOMEM;

		ctx->ns = current->nsproxy->cgroup_ns;
		get_cgroup_ns(ctx->ns);
		of->priv = ctx;

		if (!cft->open)
		return 0;

		ret = cft->open(of);
		if (ret) {
		put_cgroup_ns(ctx->ns);
		kfree(ctx);
		}
		return ret;
		}

		static void cgroup_file_release(struct kernfs_open_file *of)
		{
		struct cftype *cft = of_cft(of);
		struct cgroup_file_ctx *ctx = of->priv;

		if (cft->release)
		cft->release(of);
		put_cgroup_ns(ctx->ns);
		kfree(ctx);
		}

		static ssize_t cgroup_file_write(struct kernfs_open_file of, char buf,
		size_t nbytes, loff_t off)
		{
		struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
		struct cgroup_file_ctx *ctx = of->priv;
		struct cgroup *cgrp = of->kn->parent->priv;
		struct cftype *cft = of_cft(of);
		struct cgroup_subsys_state *css;
		@@ -3845,7 +3869,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file of, char buf,
		*/
		if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
		!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
		ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
		ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
		return -EPERM;

		if (cft->write)
		@@ -4751,21 +4775,21 @@ void css_task_iter_end(struct css_task_iter *it)

		static void cgroup_procs_release(struct kernfs_open_file *of)
		{
		if (of->priv) {
		css_task_iter_end(of->priv);
		kfree(of->priv);
		}
		struct cgroup_file_ctx *ctx = of->priv;

		if (ctx->procs.started)
		css_task_iter_end(&ctx->procs.iter);
		}

		static void cgroup_procs_next(struct seq_file s, void v, loff_t pos)
		{
		struct kernfs_open_file *of = s->private;
		struct css_task_iter *it = of->priv;
		struct cgroup_file_ctx *ctx = of->priv;

		if (pos)
		(*pos)++;

		return css_task_iter_next(it);
		return css_task_iter_next(&ctx->procs.iter);
		}

		static void __cgroup_procs_start(struct seq_file s, loff_t *pos,
		@@ -4773,21 +4797,18 @@ static void __cgroup_procs_start(struct seq_file s, loff_t *pos,
		{
		struct kernfs_open_file *of = s->private;
		struct cgroup *cgrp = seq_css(s)->cgroup;
		struct css_task_iter *it = of->priv;
		struct cgroup_file_ctx *ctx = of->priv;
		struct css_task_iter *it = &ctx->procs.iter;

		/*
		* When a seq_file is seeked, it's always traversed sequentially
		* from position 0, so we can simply keep iterating on !0 *pos.
		*/
		if (!it) {
		if (!ctx->procs.started) {
		if (WARN_ON_ONCE((*pos)))
		return ERR_PTR(-EINVAL);

		it = kzalloc(sizeof(*it), GFP_KERNEL);
		if (!it)
		return ERR_PTR(-ENOMEM);
		of->priv = it;
		css_task_iter_start(&cgrp->self, iter_flags, it);
		ctx->procs.started = true;
		} else if (!(*pos)) {
		css_task_iter_end(it);
		css_task_iter_start(&cgrp->self, iter_flags, it);
		@@ -4838,9 +4859,9 @@ static int cgroup_may_write(const struct cgroup cgrp, struct super_block sb)

		static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
		struct cgroup *dst_cgrp,
		struct super_block *sb)
		struct super_block *sb,
		struct cgroup_namespace *ns)
		{
		struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
		struct cgroup *com_cgrp = src_cgrp;
		int ret;

		@@ -4869,11 +4890,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,

		static int cgroup_attach_permissions(struct cgroup *src_cgrp,
		struct cgroup *dst_cgrp,
		struct super_block *sb, bool threadgroup)
		struct super_block *sb, bool threadgroup,
		struct cgroup_namespace *ns)
		{
		int ret = 0;

		ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
		ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
		if (ret)
		return ret;

		@@ -4890,8 +4912,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
		static ssize_t __cgroup_procs_write(struct kernfs_open_file of, char buf,
		bool threadgroup)
		{
		struct cgroup_file_ctx *ctx = of->priv;
		struct cgroup src_cgrp, dst_cgrp;
		struct task_struct *task;
		const struct cred *saved_cred;
		ssize_t ret;
		bool locked;

		@@ -4909,9 +4933,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file of, char buf,
		src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
		spin_unlock_irq(&css_set_lock);

		/* process and thread migrations follow same delegation rule */
		/*
		* Process and thread migrations follow same delegation rule. Check
		* permissions using the credentials from file open to protect against
		* inherited fd attacks.
		*/
		saved_cred = override_creds(of->file->f_cred);
		ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
		of->file->f_path.dentry->d_sb, threadgroup);
		of->file->f_path.dentry->d_sb,
		threadgroup, ctx->ns);
		revert_creds(saved_cred);
		if (ret)
		goto out_finish;

		@@ -6130,7 +6161,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
		goto err;

		ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
		!(kargs->flags & CLONE_THREAD));
		!(kargs->flags & CLONE_THREAD),
		current->nsproxy->cgroup_ns);
		if (ret)
		goto err;

tools/testing/selftests/cgroup/cgroup_util.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -221,7 +221,7 @@ int cg_find_unified_root(char *root, size_t len)

		int cg_create(const char *cgroup)
		{
		return mkdir(cgroup, 0644);
		return mkdir(cgroup, 0755);
		}

		int cg_wait_for_proc_count(const char *cgroup, int count)

tools/testing/selftests/cgroup/test_core.c

+165 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */

		#define _GNU_SOURCE
		#include <linux/limits.h>
		#include <linux/sched.h>
		#include <sys/types.h>
		#include <sys/mman.h>
		#include <sys/wait.h>
		#include <unistd.h>
		#include <fcntl.h>
		#include <sched.h>
		#include <stdio.h>
		#include <errno.h>
		#include <signal.h>
		@@ -674,6 +677,166 @@ static int test_cgcore_thread_migration(const char *root)
		return ret;
		}

		/*
		* cgroup migration permission check should be performed based on the
		* credentials at the time of open instead of write.
		*/
		static int test_cgcore_lesser_euid_open(const char *root)
		{
		const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
		int ret = KSFT_FAIL;
		char cg_test_a = NULL, cg_test_b = NULL;
		char cg_test_a_procs = NULL, cg_test_b_procs = NULL;
		int cg_test_b_procs_fd = -1;
		uid_t saved_uid;

		cg_test_a = cg_name(root, "cg_test_a");
		cg_test_b = cg_name(root, "cg_test_b");

		if (!cg_test_a \|\| !cg_test_b)
		goto cleanup;

		cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
		cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");

		if (!cg_test_a_procs \|\| !cg_test_b_procs)
		goto cleanup;

		if (cg_create(cg_test_a) \|\| cg_create(cg_test_b))
		goto cleanup;

		if (cg_enter_current(cg_test_a))
		goto cleanup;

		if (chown(cg_test_a_procs, test_euid, -1) \|\|
		chown(cg_test_b_procs, test_euid, -1))
		goto cleanup;

		saved_uid = geteuid();
		if (seteuid(test_euid))
		goto cleanup;

		cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);

		if (seteuid(saved_uid))
		goto cleanup;

		if (cg_test_b_procs_fd < 0)
		goto cleanup;

		if (write(cg_test_b_procs_fd, "0", 1) >= 0 \|\| errno != EACCES)
		goto cleanup;

		ret = KSFT_PASS;

		cleanup:
		cg_enter_current(root);
		if (cg_test_b_procs_fd >= 0)
		close(cg_test_b_procs_fd);
		if (cg_test_b)
		cg_destroy(cg_test_b);
		if (cg_test_a)
		cg_destroy(cg_test_a);
		free(cg_test_b_procs);
		free(cg_test_a_procs);
		free(cg_test_b);
		free(cg_test_a);
		return ret;
		}

		struct lesser_ns_open_thread_arg {
		const char *path;
		int fd;
		int err;
		};

		static int lesser_ns_open_thread_fn(void *arg)
		{
		struct lesser_ns_open_thread_arg *targ = arg;

		targ->fd = open(targ->path, O_RDWR);
		targ->err = errno;
		return 0;
		}

		/*
		* cgroup migration permission check should be performed based on the cgroup
		* namespace at the time of open instead of write.
		*/
		static int test_cgcore_lesser_ns_open(const char *root)
		{
		static char stack[65536];
		const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
		int ret = KSFT_FAIL;
		char cg_test_a = NULL, cg_test_b = NULL;
		char cg_test_a_procs = NULL, cg_test_b_procs = NULL;
		int cg_test_b_procs_fd = -1;
		struct lesser_ns_open_thread_arg targ = { .fd = -1 };
		pid_t pid;
		int status;

		cg_test_a = cg_name(root, "cg_test_a");
		cg_test_b = cg_name(root, "cg_test_b");

		if (!cg_test_a \|\| !cg_test_b)
		goto cleanup;

		cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
		cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");

		if (!cg_test_a_procs \|\| !cg_test_b_procs)
		goto cleanup;

		if (cg_create(cg_test_a) \|\| cg_create(cg_test_b))
		goto cleanup;

		if (cg_enter_current(cg_test_b))
		goto cleanup;

		if (chown(cg_test_a_procs, test_euid, -1) \|\|
		chown(cg_test_b_procs, test_euid, -1))
		goto cleanup;

		targ.path = cg_test_b_procs;
		pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
		CLONE_NEWCGROUP \| CLONE_FILES \| CLONE_VM \| SIGCHLD,
		&targ);
		if (pid < 0)
		goto cleanup;

		if (waitpid(pid, &status, 0) < 0)
		goto cleanup;

		if (!WIFEXITED(status))
		goto cleanup;

		cg_test_b_procs_fd = targ.fd;
		if (cg_test_b_procs_fd < 0)
		goto cleanup;

		if (cg_enter_current(cg_test_a))
		goto cleanup;

		if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 \|\| errno != ENOENT)
		goto cleanup;

		ret = KSFT_PASS;

		cleanup:
		cg_enter_current(root);
		if (cg_test_b_procs_fd >= 0)
		close(cg_test_b_procs_fd);
		if (cg_test_b)
		cg_destroy(cg_test_b);
		if (cg_test_a)
		cg_destroy(cg_test_a);
		free(cg_test_b_procs);
		free(cg_test_a_procs);
		free(cg_test_b);
		free(cg_test_a);
		return ret;
		}

		#define T(x) { x, #x }
		struct corecg_test {
		int (fn)(const char root);
		@@ -689,6 +852,8 @@ struct corecg_test {
		T(test_cgcore_proc_migration),
		T(test_cgcore_thread_migration),
		T(test_cgcore_destroy),
		T(test_cgcore_lesser_euid_open),
		T(test_cgcore_lesser_ns_open),
		};
		#undef T