Commit 78f517b8 authored by Johannes Weiner, committed by Lu Jialin
Browse files

sched/psi: Remove NR_ONCPU task accounting

mainline inclusion
from mainline-v6.1-rc1
commit 71dbdde7
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCV4

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9



--------------------------------

We put all fields updated by the scheduler in the first cacheline of
struct psi_group_cpu for performance.

Since we want to add another state, PSI_IRQ_FULL, to track IRQ/SOFTIRQ
pressure, we need to reclaim space first. This patch removes the NR_ONCPU
task accounting in struct psi_group_cpu and uses one bit in state_mask to
track it instead.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com


Conflict:
	include/linux/psi_types.h
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
parent 224bd268
Loading
Loading
Loading
Loading
+5 −9
Original line number Diff line number Diff line
@@ -36,13 +36,6 @@ enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	/*
	 * This can't have values other than 0 or 1 and could be
	 * implemented as a bit flag. But for now we still have room
	 * in the first cacheline of psi_group_cpu, and this way we
	 * don't have to special case any state tracking for it.
	 */
	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
@@ -53,7 +46,7 @@ enum psi_task_count {
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	NR_PSI_TASK_COUNTS = 5,
	NR_PSI_TASK_COUNTS = 4,
};
#endif

@@ -61,8 +54,9 @@ enum psi_task_count {
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_ONCPU	(1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
/* Only one task can be scheduled, no corresponding task count */
#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)

/* Resources that workloads could be stalled on */
enum psi_res {
@@ -110,6 +104,8 @@ enum psi_states {
};
#endif

/* Use one bit in the state mask to track TSK_ONCPU */
#define PSI_ONCPU	(1 << NR_PSI_STATES)

enum psi_aggregators {
	PSI_AVGS = 0,
+30 −11
Original line number Diff line number Diff line
@@ -228,7 +228,7 @@ void __init psi_init(void)
	group_init(&psi_system);
}

static bool test_state(unsigned int *tasks, enum psi_states state)
static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
	switch (state) {
	case PSI_IO_SOME:
@@ -241,9 +241,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
		return unlikely(tasks[NR_MEMSTALL] &&
			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
		return unlikely(tasks[NR_RUNNING] > oncpu);
	case PSI_CPU_FULL:
		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
		return unlikely(tasks[NR_RUNNING] && !oncpu);
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
@@ -696,9 +696,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
			     bool wake_clock)
{
	struct psi_group_cpu *groupc;
	u32 state_mask = 0;
	unsigned int t, m;
	enum psi_states s;
	u32 state_mask;

	groupc = per_cpu_ptr(group->pcpu, cpu);

@@ -714,17 +714,36 @@ static void psi_group_change(struct psi_group *group, int cpu,

	record_times(groupc, now);

	/*
	 * Start with TSK_ONCPU, which doesn't have a corresponding
	 * task count - it's just a boolean flag directly encoded in
	 * the state mask. Clear, set, or carry the current state if
	 * no changes are requested.
	 */
	if (unlikely(clear & TSK_ONCPU)) {
		state_mask = 0;
		clear &= ~TSK_ONCPU;
	} else if (unlikely(set & TSK_ONCPU)) {
		state_mask = PSI_ONCPU;
		set &= ~TSK_ONCPU;
	} else {
		state_mask = groupc->state_mask & PSI_ONCPU;
	}

	/*
	 * The rest of the state mask is calculated based on the task
	 * counts. Update those first, then construct the mask.
	 */
	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
					groupc->tasks[3], groupc->tasks[4],
					clear, set);
					groupc->tasks[3], clear, set);
			psi_bug = 1;
		}
	}
@@ -733,9 +752,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (set & (1 << t))
			groupc->tasks[t]++;

	/* Calculate state mask representing active states */
	for (s = 0; s < NR_PSI_STATES; s++) {
		if (test_state(groupc->tasks, s))
		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
			state_mask |= (1 << s);
	}

@@ -747,7 +765,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
	 * task in a cgroup is in_memstall, the corresponding groupc
	 * on that cpu is in PSI_MEM_FULL state.
	 */
	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
	if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
		state_mask |= (1 << PSI_MEM_FULL);

	groupc->state_mask = state_mask;
@@ -849,7 +867,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		 */
		iter = NULL;
		while ((group = iterate_groups(next, &iter))) {
			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
			    PSI_ONCPU) {
				common = group;
				break;
			}