Commit cb0e52b7 authored by Brian Chen's avatar Brian Chen Committed by Peter Zijlstra
Browse files

psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim



We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: default avatarBrian Chen <brianchen118@gmail.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
parent 4feee7d1
Loading
Loading
Loading
Loading
+12 −1
Original line number Diff line number Diff line
@@ -22,7 +22,17 @@ enum psi_task_count {
	 * don't have to special case any state tracking for it.
	 */
	NR_ONCPU,
	NR_PSI_TASK_COUNTS = 4,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
	 * For memory it's not so simple because of page reclaimers:
	 * they are running/oncpu while representing a stall. To tell
	 * whether a domain has productivity left or not, we need to
	 * distinguish between regular running (i.e. productive)
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	NR_PSI_TASK_COUNTS = 5,
};

/* Task state bitmasks */
@@ -30,6 +40,7 @@ enum psi_task_count {
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_ONCPU	(1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)

/* Resources that workloads could be stalled on */
enum psi_res {
+28 −17
Original line number Diff line number Diff line
@@ -35,13 +35,19 @@
 * delayed on that resource such that nobody is advancing and the CPU
 * goes idle. This leaves both workload and CPU unproductive.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level, means all non-idle tasks
 * in a cgroup are delayed on the CPU resource which used by others outside
 * of the cgroup or throttled by the cgroup cpu.max configuration.
 *
 *	SOME = nr_delayed_tasks != 0
 *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
 *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
 *
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
 * CPU, productive means an oncpu task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level. At the cgroup level,
 * FULL means all non-idle tasks in the cgroup are delayed on the CPU
 * resource which is being used by others outside of the cgroup or
 * throttled by the cgroup cpu.max configuration.
 *
 * The percentage of wallclock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
@@ -82,13 +88,13 @@
 *
 *	threads = min(nr_nonidle_tasks, nr_cpus)
 *	   SOME = min(nr_delayed_tasks / threads, 1)
 *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
 *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
 *
 * For the 257 number crunchers on 256 CPUs, this yields:
 *
 *	threads = min(257, 256)
 *	   SOME = min(1 / 256, 1)             = 0.4%
 *	   FULL = (256 - min(257, 256)) / 256 = 0%
 *	   FULL = (256 - min(256, 256)) / 256 = 0%
 *
 * For the 1 out of 4 memory-delayed tasks, this yields:
 *
@@ -113,7 +119,7 @@
 * For each runqueue, we track:
 *
 *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
 *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
 *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
 *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
 *
 * and then periodically aggregate:
@@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
	case PSI_MEM_SOME:
		return unlikely(tasks[NR_MEMSTALL]);
	case PSI_MEM_FULL:
		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
		return unlikely(tasks[NR_MEMSTALL] &&
			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
	case PSI_CPU_FULL:
@@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
					groupc->tasks[3], clear, set);
					groupc->tasks[3], groupc->tasks[4],
					clear, set);
			psi_bug = 1;
		}
	}
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		int clear = TSK_ONCPU, set = 0;

		/*
		 * When we're going to sleep, psi_dequeue() lets us handle
		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
		 * with TSK_ONCPU and save walking common ancestors twice.
		 * When we're going to sleep, psi_dequeue() lets us
		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
		 * TSK_IOWAIT here, where we can combine it with
		 * TSK_ONCPU and save walking common ancestors twice.
		 */
		if (sleep) {
			clear |= TSK_RUNNING;
			if (prev->in_memstall)
				clear |= TSK_MEMSTALL_RUNNING;
			if (prev->in_iowait)
				set |= TSK_IOWAIT;
		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);

	current->in_memstall = 1;
	psi_task_change(current, 0, TSK_MEMSTALL);
	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

	rq_unlock_irq(rq, &rf);
}
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);

	current->in_memstall = 0;
	psi_task_change(current, TSK_MEMSTALL, 0);
	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);

	rq_unlock_irq(rq, &rf);
}
+4 −1
Original line number Diff line number Diff line
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
	if (static_branch_likely(&psi_disabled))
		return;

	if (p->in_memstall)
		set |= TSK_MEMSTALL_RUNNING;

	if (!wakeup || p->sched_psi_wake_requeue) {
		if (p->in_memstall)
			set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
		return;

	if (p->in_memstall)
		clear |= TSK_MEMSTALL;
		clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

	psi_task_change(p, clear, 0);
}