Commit 6d05ce43 authored by Suren Baghdasaryan, committed by Lu Jialin
Browse files

sched/psi: use kernfs polling functions for PSI trigger polling

mainline inclusion
from mainline-v6.5-rc2
commit aff03707
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7DHPO

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aff037078ecaecf34a7c2afab1341815f90fba5e

-------------------------------------------------

Destroying a psi trigger in cgroup_file_release() causes use-after-free
(UAF) issues when a cgroup is removed from under a polling process. This
happens because cgroup removal causes a call to cgroup_file_release()
while the actual file is still alive. Destroying the trigger at this point
also destroys its waitqueue head, and if there is still a polling process
on that file accessing the waitqueue, it will step on the freed pointer:

do_select
  vfs_poll
                           do_rmdir
                             cgroup_rmdir
                               kernfs_drain_open_files
                                 cgroup_file_release
                                   cgroup_pressure_release
                                     psi_trigger_destroy
                                       wake_up_pollfree(&t->event_wait)
// vfs_poll is unblocked
                                       synchronize_rcu
                                       kfree(t)
  poll_freewait -> UAF access to the trigger's waitqueue head

Patch [1] fixed this issue for the epoll() case using wake_up_pollfree();
however, the same issue exists for the synchronous poll() case.
The root cause of this issue is that the lifecycles of the psi trigger's
waitqueue and of the file associated with the trigger are different. Fix
this by using the kernfs_generic_poll() function when polling on
cgroup-specific psi triggers. It internally uses the
kernfs_open_node->poll waitqueue head, whose lifecycle is tied to the
file's lifecycle. This also renders the fix in [1] obsolete, so revert it.

[1] commit c2dbe32d ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")

Fixes: 0e94682b ("psi: introduce psi monitor")
Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/


Reported-by: Lu Jialin <lujialin4@huawei.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230630005612.1014540-1-surenb@google.com


Conflict:
	include/linux/psi.h
	kernel/cgroup/cgroup.c
	kernel/sched/psi.c
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
parent 646f8670
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -32,8 +32,9 @@ int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);

struct psi_trigger *psi_trigger_create(struct psi_group *group,
			char *buf, size_t nbytes, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
				       size_t nbytes, enum psi_res res,
				       struct kernfs_open_file *of);
void psi_trigger_destroy(struct psi_trigger *t);

__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
+3 −0
Original line number Diff line number Diff line
@@ -173,6 +173,9 @@ struct psi_trigger {
	/* Wait queue for polling */
	wait_queue_head_t event_wait;

	/* Kernfs file for cgroup triggers */
	struct kernfs_open_file *of;

	/* Pending event flag */
	int event;

+1 −1
Original line number Diff line number Diff line
@@ -3676,7 +3676,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
	}

	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
	new = psi_trigger_create(psi, buf, nbytes, res);
	new = psi_trigger_create(psi, buf, nbytes, res, of);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
		return PTR_ERR(new);
+21 −8
Original line number Diff line number Diff line
@@ -553,8 +553,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
			continue;

		/* Generate an event */
		if (cmpxchg(&t->event, 0, 1) == 0)
		if (cmpxchg(&t->event, 0, 1) == 0) {
			if (t->of)
				kernfs_notify(t->of->kn);
			else
				wake_up_interruptible(&t->event_wait);
		}
		t->last_event_time = now;
	}

@@ -1132,8 +1136,9 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
	return single_open(file, psi_cpu_show, NULL);
}

struct psi_trigger *psi_trigger_create(struct psi_group *group,
			char *buf, size_t nbytes, enum psi_res res)
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
				       size_t nbytes, enum psi_res res,
				       struct kernfs_open_file *of)
{
	struct psi_trigger *t;
	enum psi_states state;
@@ -1173,6 +1178,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,

	t->event = 0;
	t->last_event_time = 0;
	t->of = of;
	if (!of)
		init_waitqueue_head(&t->event_wait);

	mutex_lock(&group->trigger_lock);
@@ -1219,6 +1226,9 @@ void psi_trigger_destroy(struct psi_trigger *t)
	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
	 * from under a polling process.
	 */
	if (t->of)
		kernfs_notify(t->of->kn);
	else
		wake_up_interruptible(&t->event_wait);

	mutex_lock(&group->trigger_lock);
@@ -1282,6 +1292,9 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
	if (!t)
		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

	if (t->of)
		kernfs_generic_poll(t->of, wait);
	else
		poll_wait(file, &t->event_wait, wait);

	if (cmpxchg(&t->event, 1, 0) == 1)
@@ -1321,7 +1334,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
		return -EBUSY;
	}

	new = psi_trigger_create(&psi_system, buf, nbytes, res);
	new = psi_trigger_create(&psi_system, buf, nbytes, res, NULL);
	if (IS_ERR(new)) {
		mutex_unlock(&seq->lock);
		return PTR_ERR(new);