psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim (cb0e52b7) · Commits · EulixOS / Software / Kernel

include/linux/psi_types.h

+12 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,17 @@ enum psi_task_count {
		* don't have to special case any state tracking for it.
		*/
		NR_ONCPU,
		NR_PSI_TASK_COUNTS = 4,
		/*
		* For IO and CPU stalls the presence of running/oncpu tasks
		* in the domain means a partial rather than a full stall.
		* For memory it's not so simple because of page reclaimers:
		* they are running/oncpu while representing a stall. To tell
		* whether a domain has productivity left or not, we need to
		* distinguish between regular running (i.e. productive)
		* threads and memstall ones.
		*/
		NR_MEMSTALL_RUNNING,
		NR_PSI_TASK_COUNTS = 5,
		};

		/* Task state bitmasks */
		@@ -30,6 +40,7 @@ enum psi_task_count {
		#define TSK_MEMSTALL (1 << NR_MEMSTALL)
		#define TSK_RUNNING (1 << NR_RUNNING)
		#define TSK_ONCPU (1 << NR_ONCPU)
		#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)

		/* Resources that workloads could be stalled on */
		enum psi_res {

kernel/sched/psi.c

+28 −17

Original line number	Diff line number	Diff line
		@@ -35,13 +35,19 @@
		* delayed on that resource such that nobody is advancing and the CPU
		* goes idle. This leaves both workload and CPU unproductive.
		*
		* Naturally, the FULL state doesn't exist for the CPU resource at the
		* system level, but exist at the cgroup level, means all non-idle tasks
		* in a cgroup are delayed on the CPU resource which used by others outside
		* of the cgroup or throttled by the cgroup cpu.max configuration.
		*
		* SOME = nr_delayed_tasks != 0
		* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
		* FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
		*
		* What it means for a task to be productive is defined differently
		* for each resource. For IO, productive means a running task. For
		* memory, productive means a running task that isn't a reclaimer. For
		* CPU, productive means an oncpu task.
		*
		* Naturally, the FULL state doesn't exist for the CPU resource at the
		* system level, but exist at the cgroup level. At the cgroup level,
		* FULL means all non-idle tasks in the cgroup are delayed on the CPU
		* resource which is being used by others outside of the cgroup or
		* throttled by the cgroup cpu.max configuration.
		*
		* The percentage of wallclock time spent in those compound stall
		* states gives pressure numbers between 0 and 100 for each resource,
		@@ -82,13 +88,13 @@
		*
		* threads = min(nr_nonidle_tasks, nr_cpus)
		* SOME = min(nr_delayed_tasks / threads, 1)
		* FULL = (threads - min(nr_running_tasks, threads)) / threads
		* FULL = (threads - min(nr_productive_tasks, threads)) / threads
		*
		* For the 257 number crunchers on 256 CPUs, this yields:
		*
		* threads = min(257, 256)
		* SOME = min(1 / 256, 1) = 0.4%
		* FULL = (256 - min(257, 256)) / 256 = 0%
		* FULL = (256 - min(256, 256)) / 256 = 0%
		*
		* For the 1 out of 4 memory-delayed tasks, this yields:
		*
		@@ -113,7 +119,7 @@
		* For each runqueue, we track:
		*
		* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
		* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
		* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
		* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
		*
		* and then periodically aggregate:
		@@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
		case PSI_MEM_SOME:
		return unlikely(tasks[NR_MEMSTALL]);
		case PSI_MEM_FULL:
		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
		return unlikely(tasks[NR_MEMSTALL] &&
		tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
		case PSI_CPU_SOME:
		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
		case PSI_CPU_FULL:
		@@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (groupc->tasks[t]) {
		groupc->tasks[t]--;
		} else if (!psi_bug) {
		printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
		printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
		cpu, t, groupc->tasks[0],
		groupc->tasks[1], groupc->tasks[2],
		groupc->tasks[3], clear, set);
		groupc->tasks[3], groupc->tasks[4],
		clear, set);
		psi_bug = 1;
		}
		}
		@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct prev, struct task_struct next,
		int clear = TSK_ONCPU, set = 0;

		/*
		* When we're going to sleep, psi_dequeue() lets us handle
		* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
		* with TSK_ONCPU and save walking common ancestors twice.
		* When we're going to sleep, psi_dequeue() lets us
		* handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
		* TSK_IOWAIT here, where we can combine it with
		* TSK_ONCPU and save walking common ancestors twice.
		*/
		if (sleep) {
		clear \|= TSK_RUNNING;
		if (prev->in_memstall)
		clear \|= TSK_MEMSTALL_RUNNING;
		if (prev->in_iowait)
		set \|= TSK_IOWAIT;
		}
		@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
		rq = this_rq_lock_irq(&rf);

		current->in_memstall = 1;
		psi_task_change(current, 0, TSK_MEMSTALL);
		psi_task_change(current, 0, TSK_MEMSTALL \| TSK_MEMSTALL_RUNNING);

		rq_unlock_irq(rq, &rf);
		}
		@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
		rq = this_rq_lock_irq(&rf);

		current->in_memstall = 0;
		psi_task_change(current, TSK_MEMSTALL, 0);
		psi_task_change(current, TSK_MEMSTALL \| TSK_MEMSTALL_RUNNING, 0);

		rq_unlock_irq(rq, &rf);
		}

kernel/sched/stats.h

+4 −1

Original line number	Diff line number	Diff line
		@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
		if (static_branch_likely(&psi_disabled))
		return;

		if (p->in_memstall)
		set \|= TSK_MEMSTALL_RUNNING;

		if (!wakeup \|\| p->sched_psi_wake_requeue) {
		if (p->in_memstall)
		set \|= TSK_MEMSTALL;
		@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
		return;

		if (p->in_memstall)
		clear \|= TSK_MEMSTALL;
		clear \|= (TSK_MEMSTALL \| TSK_MEMSTALL_RUNNING);

		psi_task_change(p, clear, 0);
		}