Commit cc71a821 authored by Lu Jialin's avatar Lu Jialin
Browse files

PSI: Introduce fine grained stall time collect for cgroup reclaim

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCV4



-------------------------------

PSI tracks pressure stalls for memory, cpu, io and irq. However, there
are different pressure types which can cause memory pressure, and
memory.pressure cannot show the type of pressure effectively. The
same applies to cpu.pressure.
Introduce pressure.stat in psi, which monitors the specific causes
of memory.pressure and cpu.pressure, such as global/cgroup memory
reclaim, memory compaction, cpu cfs bandwidth and so on. Therefore,
userland can choose the right solution to reduce the pressure based
on the specific pressure causes.
This patch introduces fine grained memory stall time collection for
cgroup reclaim.

Signed-off-by: default avatarLu Jialin <lujialin4@huawei.com>
parent 65479800
Loading
Loading
Loading
Loading
+34 −0
Original line number Diff line number Diff line
@@ -232,8 +232,29 @@ struct psi_group {
};

#ifdef CONFIG_PSI_FINE_GRAINED

/*
 * Fine grained pressure states, used as bit indices in
 * psi_group_stat_cpu::state_mask and as slots in its times[] array.
 */
enum psi_stat_states {
	PSI_MEMCG_RECLAIM_SOME,	/* at least one task stalled in memcg reclaim */
	PSI_MEMCG_RECLAIM_FULL,	/* all runnable tasks stalled in memcg reclaim */
	NR_PSI_STAT_STATES,
};

/*
 * Per-cpu counters of tasks contributing to fine grained states; see the
 * psi_memstall_type comment for the memstall/running pairing scheme.
 */
enum psi_stat_task_count {
	NR_MEMCG_RECLAIM,		/* tasks in a memcg-reclaim memstall */
	NR_MEMCG_RECLAIM_RUNNING,	/* of those, the ones also running */
	NR_PSI_STAT_TASK_COUNTS,
};

/* Per-cpu fine grained stall accounting, companion to struct psi_group_cpu. */
struct psi_group_stat_cpu {
	u32 state_mask;		/* bitmask of currently active psi_stat_states */
	u32 times[NR_PSI_STAT_STATES];	/* accumulated stall time per state */
	u32 psi_delta;		/* elapsed time since groupc->state_start */
	unsigned int tasks[NR_PSI_STAT_TASK_COUNTS];	/* per-type task counts */
};

/*
 * Wraps a psi_group together with its fine grained per-cpu state.
 * NOTE(review): psi is presumably expected to be the first member so
 * to_psi_group_ext() can map psi_group -> psi_group_ext — its body is
 * not visible here; confirm against the definition.
 */
struct psi_group_ext {
	struct psi_group psi;
	struct psi_group_stat_cpu __percpu *pcpu;	/* fine grained stats */
};
#else
struct psi_group_ext { };
@@ -245,4 +266,17 @@ struct psi_group { };

#endif /* CONFIG_PSI */

/*
 * Each memstall type owns two task counters in psi_stat_task_count:
 * regular running threads and memstall threads (same rationale as
 * NR_MEMSTALL_RUNNING). Because psi_memstall_type starts at 1, the
 * mapping between psi_memstall_type and psi_stat_task_count is:
 *
 * memstall : psi_memstall_type * 2 - 2;
 * running  : psi_memstall_type * 2 - 1;
 */
enum psi_memstall_type {
	PSI_MEMCG_RECLAIM = 1,	/* stall caused by cgroup memory reclaim */
};

#endif /* _LINUX_PSI_TYPES_H */
+4 −0
Original line number Diff line number Diff line
@@ -1459,7 +1459,11 @@ struct task_struct {
#else
	KABI_RESERVE(12)
#endif
#ifdef CONFIG_PSI_FINE_GRAINED
	KABI_USE(13, int memstall_type)
#else
	KABI_RESERVE(13)
#endif
	KABI_RESERVE(14)
	KABI_RESERVE(15)
	KABI_RESERVE(16)
+151 −6
Original line number Diff line number Diff line
@@ -194,7 +194,10 @@ struct psi_group psi_system = {

#ifdef CONFIG_PSI_FINE_GRAINED
/* System-level fine grained pressure and stall tracking */
struct psi_group_ext psi_stat_system = { };
static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu);
struct psi_group_ext psi_stat_system = {
	.pcpu = &system_stat_group_pcpu,
};

struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
{
@@ -334,6 +337,109 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
	avg[2] = calc_load(avg[2], EXP_300s, pct);
}

#ifdef CONFIG_PSI_FINE_GRAINED

/*
 * Fold the elapsed interval (psi_delta) into the per-cpu fine grained
 * stall time counters for every state active in state_mask.
 */
static void record_stat_times(struct psi_group_ext *psi_ext, int cpu)
{
	struct psi_group_stat_cpu *cpu_stat = per_cpu_ptr(psi_ext->pcpu, cpu);
	u32 delta = cpu_stat->psi_delta;

	/* FULL implies SOME, so nothing to record unless SOME is set. */
	if (!(cpu_stat->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)))
		return;

	cpu_stat->times[PSI_MEMCG_RECLAIM_SOME] += delta;
	if (cpu_stat->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL))
		cpu_stat->times[PSI_MEMCG_RECLAIM_FULL] += delta;
}

/*
 * Decide whether a fine grained state is currently active on this cpu.
 * SOME: any task is stalled in memcg reclaim.
 * FULL: tasks are stalled in memcg reclaim and every running task is one
 *       of the reclaim-running ones (nobody makes productive progress).
 */
static bool test_fine_grained_stat(unsigned int *stat_tasks,
				   unsigned int nr_running,
				   enum psi_stat_states state)
{
	if (state == PSI_MEMCG_RECLAIM_SOME)
		return unlikely(stat_tasks[NR_MEMCG_RECLAIM]);

	if (state == PSI_MEMCG_RECLAIM_FULL)
		return unlikely(stat_tasks[NR_MEMCG_RECLAIM] &&
				stat_tasks[NR_MEMCG_RECLAIM_RUNNING] == nr_running);

	return false;
}

/*
 * Apply task count changes for this group/cpu and recompute the fine
 * grained state mask, under the same seqcount as the regular psi state
 * so readers see a consistent snapshot.
 * clear/set are bitmasks of psi_stat_task_count indices.
 */
static void psi_group_stat_change(struct psi_group *group, int cpu,
				  int clear, int set)
{
	int t;
	u32 state_mask = 0;
	enum psi_stat_states s;
	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);

	write_seqcount_begin(&groupc->seq);
	/* Bank the time spent in the outgoing states before changing them. */
	record_stat_times(psi_ext, cpu);

	/* Decrement counts for cleared task types, increment for set ones. */
	for (t = 0; clear; clear &= ~(1 << t), t++)
		if (clear & (1 << t))
			ext_groupc->tasks[t]--;
	for (t = 0; set; set &= ~(1 << t), t++)
		if (set & (1 << t))
			ext_groupc->tasks[t]++;
	/* Re-derive the state mask from the updated task counts. */
	for (s = 0; s < NR_PSI_STAT_STATES; s++)
		if (test_fine_grained_stat(ext_groupc->tasks,
					   groupc->tasks[NR_RUNNING], s))
			state_mask |= (1 << s);
	/*
	 * If the task currently on this cpu is in a memstall, mark its
	 * type's FULL-side bit (type * 2 - 1, per the psi_memstall_type
	 * mapping) as active.
	 */
	if (unlikely(groupc->state_mask & PSI_ONCPU) &&
		     cpu_curr(cpu)->memstall_type)
		state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1));

	ext_groupc->state_mask = state_mask;
	write_seqcount_end(&groupc->seq);
}

/*
 * Snapshot the time elapsed since the group's last state change so a
 * subsequent record_stat_times() can credit it to the active states.
 */
static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now)
{
	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	struct psi_group_stat_cpu *stat_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);

	stat_groupc->psi_delta = now - groupc->state_start;
}

/*
 * Translate generic psi flag changes (TSK_MEMSTALL / TSK_MEMSTALL_RUNNING)
 * into fine grained task-count bits for this task's memstall type, using
 * the psi_memstall_type mapping (memstall: 2*type-2, running: 2*type-1).
 * Resets memstall_type once the task has left its memstall.
 */
static void psi_stat_flags_change(struct task_struct *task, int *stat_set,
				  int *stat_clear, int set, int clear)
{
	int type = task->memstall_type;

	if (!type)
		return;

	if (clear & TSK_MEMSTALL)
		*stat_clear |= 1 << (2 * type - 2);
	if (clear & TSK_MEMSTALL_RUNNING)
		*stat_clear |= 1 << (2 * type - 1);

	if (set & TSK_MEMSTALL)
		*stat_set |= 1 << (2 * type - 2);
	if (set & TSK_MEMSTALL_RUNNING)
		*stat_set |= 1 << (2 * type - 1);

	/* The stall is over; forget its cause. */
	if (!task->in_memstall)
		task->memstall_type = 0;
}

#else
/* !CONFIG_PSI_FINE_GRAINED: fine grained tracking compiles to nothing. */
static inline void psi_group_stat_change(struct psi_group *group, int cpu,
					 int clear, int set) {}
static inline void update_psi_stat_delta(struct psi_group *group, int cpu,
					 u64 now) {}
static inline void psi_stat_flags_change(struct task_struct *task,
					 int *stat_set, int *stat_clear,
					 int set, int clear) {}
static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {}
#endif

static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
@@ -857,16 +963,22 @@ void psi_task_change(struct task_struct *task, int clear, int set)
	struct psi_group *group;
	void *iter = NULL;
	u64 now;
	int stat_set = 0;
	int stat_clear = 0;

	if (!task->pid)
		return;

	psi_flags_change(task, clear, set);
	psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear);

	now = cpu_clock(cpu);

	while ((group = iterate_groups(task, &iter)))
	while ((group = iterate_groups(task, &iter))) {
		update_psi_stat_delta(group, cpu, now);
		psi_group_change(group, cpu, clear, set, now, true);
		psi_group_stat_change(group, cpu, stat_clear, stat_set);
	}
}

void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -892,13 +1004,18 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
				break;
			}

			update_psi_stat_delta(group, cpu, now);
			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
			psi_group_stat_change(group, cpu, 0, 0);
		}
	}

	if (prev->pid) {
		int clear = TSK_ONCPU, set = 0;
		bool wake_clock = true;
		int stat_set = 0;
		int stat_clear = 0;
		bool memstall_type_change = false;

		/*
		 * When we're going to sleep, psi_dequeue() lets us
@@ -925,21 +1042,33 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		}

		psi_flags_change(prev, clear, set);
		psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear);

		iter = NULL;
		while ((group = iterate_groups(prev, &iter)) && group != common)
		while ((group = iterate_groups(prev, &iter)) && group != common) {
			update_psi_stat_delta(group, cpu, now);
			psi_group_change(group, cpu, clear, set, now, wake_clock);

			psi_group_stat_change(group, cpu, stat_clear, stat_set);
		}
#ifdef CONFIG_PSI_FINE_GRAINED
		if (next->memstall_type != prev->memstall_type)
			memstall_type_change = true;
#endif
		/*
		 * TSK_ONCPU is handled up to the common ancestor. If there are
		 * any other differences between the two tasks (e.g. prev goes
		 * to sleep, or only one task is memstall), finish propagating
		 * those differences all the way up to the root.
		 */
		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU ||
		     memstall_type_change) {
			clear &= ~TSK_ONCPU;
			for (; group; group = iterate_groups(prev, &iter))
			for (; group; group = iterate_groups(prev, &iter)) {
				update_psi_stat_delta(group, cpu, now);
				psi_group_change(group, cpu, clear, set, now, wake_clock);
				psi_group_stat_change(group, cpu, stat_clear,
						      stat_set);
			}
		}
	}
}
@@ -966,6 +1095,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)

		write_seqcount_begin(&groupc->seq);

		update_psi_stat_delta(group, cpu, now);
		record_stat_times(to_psi_group_ext(group), cpu);
		record_times(groupc, now);
		groupc->times[PSI_IRQ_FULL] += delta;

@@ -988,6 +1119,9 @@ void psi_memstall_enter(unsigned long *flags)
{
	struct rq_flags rf;
	struct rq *rq;
#ifdef CONFIG_PSI_FINE_GRAINED
	unsigned long stat_flags = *flags;
#endif

	if (static_branch_likely(&psi_disabled))
		return;
@@ -1005,6 +1139,10 @@ void psi_memstall_enter(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);

	current->in_memstall = 1;
#ifdef CONFIG_PSI_FINE_GRAINED
	if (stat_flags)
		current->memstall_type = stat_flags;
#endif
	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

	rq_unlock_irq(rq, &rf);
@@ -1056,6 +1194,11 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
	psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL);
	if (!psi_ext)
		return -ENOMEM;
	psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu);
	if (!psi_ext->pcpu) {
		kfree(psi_ext);
		return -ENOMEM;
	}
	cgroup->psi = &psi_ext->psi;
#else
	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -1066,6 +1209,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
	cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
	if (!cgroup->psi->pcpu) {
#ifdef CONFIG_PSI_FINE_GRAINED
		free_percpu(psi_ext->pcpu);
		kfree(psi_ext);
#else
		kfree(cgroup->psi);
@@ -1086,6 +1230,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
	/* All triggers must be removed by now */
	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
#ifdef CONFIG_PSI_FINE_GRAINED
	free_percpu(to_psi_group_ext(cgroup->psi)->pcpu);
	kfree(to_psi_group_ext(cgroup->psi));
#else
	kfree(cgroup->psi);
+9 −1
Original line number Diff line number Diff line
@@ -2378,6 +2378,9 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,

		memcg_memory_event(memcg, MEMCG_HIGH);

#ifdef CONFIG_PSI_FINE_GRAINED
		pflags = PSI_MEMCG_RECLAIM;
#endif
		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							gfp_mask,
@@ -2645,6 +2648,9 @@ void mem_cgroup_handle_over_high(void)
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
#ifdef CONFIG_PSI_FINE_GRAINED
	pflags = PSI_MEMCG_RECLAIM;
#endif
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);
@@ -2715,7 +2721,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		goto nomem;

	memcg_memory_event(mem_over_limit, MEMCG_MAX);

#ifdef CONFIG_PSI_FINE_GRAINED
	pflags = PSI_MEMCG_RECLAIM;
#endif
	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, reclaim_options);