Commit 90ef6931 authored by Hui Tang, committed by Yipeng Zou
Browse files

sched: Fix possible deadlock in tg_set_dynamic_affinity_mode

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGD0


CVE: NA

----------------------------------------

Deadlock occurs in two situations as follows:

The first case:

tg_set_dynamic_affinity_mode    --- raw_spin_lock_irq(&auto_affi->lock);
	->start_auto_affinity   --- trigger timer
		->tg_update_task_prefer_cpus
			->css_task_iter_next
				->raw_spin_unlock_irq

hrtimer_run_queues
  ->sched_auto_affi_period_timer --- try spin lock (&auto_affi->lock)

The second case as follows:

[  291.470810] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[  291.472715] rcu:     1-...0: (0 ticks this GP) idle=a6a/1/0x4000000000000002 softirq=78516/78516 fqs=5249
[  291.475268] rcu:     (detected by 6, t=21006 jiffies, g=202169, q=9862)
[  291.477038] Sending NMI from CPU 6 to CPUs 1:
[  291.481268] NMI backtrace for cpu 1
[  291.481273] CPU: 1 PID: 1923 Comm: sh Kdump: loaded Not tainted 4.19.90+ #150
[  291.481278] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
[  291.481281] RIP: 0010:queued_spin_lock_slowpath+0x136/0x9a0
[  291.481289] Code: c0 74 3f 49 89 dd 48 89 dd 48 b8 00 00 00 00 00 fc ff df 49 c1 ed 03 83 e5 07 49 01 c5 83 c5 03 48 83 05 c4 66 b9 05 01 f3 90 <41> 0f b6 45 00 40 38 c5 7c 08 84 c0 0f 85 ad 07 00 00 0
[  291.481292] RSP: 0018:ffff88801de87cd8 EFLAGS: 00000002
[  291.481297] RAX: 0000000000000101 RBX: ffff888001be0a28 RCX: ffffffffb8090f7d
[  291.481301] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888001be0a28
[  291.481304] RBP: 0000000000000003 R08: ffffed100037c146 R09: ffffed100037c146
[  291.481307] R10: 000000001106b143 R11: ffffed100037c145 R12: 1ffff11003bd0f9c
[  291.481311] R13: ffffed100037c145 R14: fffffbfff7a38dee R15: dffffc0000000000
[  291.481315] FS:  00007fac4f306740(0000) GS:ffff88801de80000(0000) knlGS:0000000000000000
[  291.481318] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  291.481321] CR2: 00007fac4f4bb650 CR3: 00000000046b6000 CR4: 00000000000006e0
[  291.481323] Call Trace:
[  291.481324]  <IRQ>
[  291.481326]  ? osq_unlock+0x2a0/0x2a0
[  291.481329]  ? check_preemption_disabled+0x4c/0x290
[  291.481331]  ? rcu_accelerate_cbs+0x33/0xed0
[  291.481333]  _raw_spin_lock_irqsave+0x83/0xa0
[  291.481336]  sched_auto_affi_period_timer+0x251/0x820
[  291.481338]  ? __remove_hrtimer+0x151/0x200
[  291.481340]  __hrtimer_run_queues+0x39d/0xa50
[  291.481343]  ? tg_update_affinity_domain_down+0x460/0x460
[  291.481345]  ? enqueue_hrtimer+0x2e0/0x2e0
[  291.481348]  ? ktime_get_update_offsets_now+0x1d7/0x2c0
[  291.481350]  hrtimer_run_queues+0x243/0x470
[  291.481352]  run_local_timers+0x5e/0x150
[  291.481354]  update_process_times+0x36/0xb0
[  291.481357]  tick_sched_handle.isra.4+0x7c/0x180
[  291.481359]  tick_nohz_handler+0xd1/0x1d0
[  291.481365]  smp_apic_timer_interrupt+0x12c/0x4e0
[  291.481368]  apic_timer_interrupt+0xf/0x20
[  291.481370]  </IRQ>
[  291.481372]  ? smp_call_function_many+0x68c/0x840
[  291.481375]  ? smp_call_function_many+0x6ab/0x840
[  291.481377]  ? arch_unregister_cpu+0x60/0x60
[  291.481379]  ? native_set_fixmap+0x100/0x180
[  291.481381]  ? arch_unregister_cpu+0x60/0x60
[  291.481384]  ? set_task_select_cpus+0x116/0x940
[  291.481386]  ? smp_call_function+0x53/0xc0
[  291.481388]  ? arch_unregister_cpu+0x60/0x60
[  291.481390]  ? on_each_cpu+0x49/0xf0
[  291.481393]  ? set_task_select_cpus+0x115/0x940
[  291.481395]  ? text_poke_bp+0xff/0x180
[  291.481397]  ? poke_int3_handler+0xc0/0xc0
[  291.481400]  ? __set_prefer_cpus_ptr.constprop.4+0x1cd/0x900
[  291.481402]  ? hrtick+0x1b0/0x1b0
[  291.481404]  ? set_task_select_cpus+0x115/0x940
[  291.481407]  ? __jump_label_transform.isra.0+0x3a1/0x470
[  291.481409]  ? kernel_init+0x280/0x280
[  291.481411]  ? kasan_check_read+0x1d/0x30
[  291.481413]  ? mutex_lock+0x96/0x100
[  291.481415]  ? __mutex_lock_slowpath+0x30/0x30
[  291.481418]  ? arch_jump_label_transform+0x52/0x80
[  291.481420]  ? set_task_select_cpus+0x115/0x940
[  291.481422]  ? __jump_label_update+0x1a1/0x1e0
[  291.481424]  ? jump_label_update+0x2ee/0x3b0
[  291.481427]  ? static_key_slow_inc_cpuslocked+0x1c8/0x2d0
[  291.481430]  ? start_auto_affinity+0x190/0x200
[  291.481432]  ? tg_set_dynamic_affinity_mode+0xad/0xf0
[  291.481435]  ? cpu_affinity_mode_write_u64+0x22/0x30
[  291.481437]  ? cgroup_file_write+0x46f/0x660
[  291.481439]  ? cgroup_init_cftypes+0x300/0x300
[  291.481441]  ? __mutex_lock_slowpath+0x30/0x30

Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
parent 11e18709
Loading
Loading
Loading
Loading
+12 −0
Original line number Original line Diff line number Diff line
@@ -2,6 +2,7 @@
#ifndef _LINUX_SCHED_GRID_QOS_H
#ifndef _LINUX_SCHED_GRID_QOS_H
#define _LINUX_SCHED_GRID_QOS_H
#define _LINUX_SCHED_GRID_QOS_H
#include <linux/nodemask.h>
#include <linux/nodemask.h>
#include <linux/sched.h>


#ifdef CONFIG_QOS_SCHED_SMART_GRID
#ifdef CONFIG_QOS_SCHED_SMART_GRID
enum sched_grid_qos_class {
enum sched_grid_qos_class {
@@ -61,6 +62,7 @@ struct sched_grid_qos_power {


struct sched_grid_qos_affinity {
struct sched_grid_qos_affinity {
	nodemask_t mem_preferred_node_mask;
	nodemask_t mem_preferred_node_mask;
	const struct cpumask *prefer_cpus;
};
};


struct task_struct;
struct task_struct;
@@ -72,6 +74,11 @@ struct sched_grid_qos {
	int (*affinity_set)(struct task_struct *p);
	int (*affinity_set)(struct task_struct *p);
};
};


static inline int sched_qos_affinity_set(struct task_struct *p)
{
	return p->grid_qos->affinity_set(p);
}

int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
void sched_grid_qos_free(struct task_struct *p);
void sched_grid_qos_free(struct task_struct *p);


@@ -88,5 +95,10 @@ sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
{
{
	return preferred_nid;
	return preferred_nid;
}
}

static inline int sched_qos_affinity_set(struct task_struct *p)
{
	return 0;
}
#endif
#endif
#endif
#endif
+1 −8
Original line number Original line Diff line number Diff line
@@ -9530,9 +9530,6 @@ static inline s64 cpu_smt_expell_read(struct cgroup_subsys_state *css,
int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
{
{
	struct auto_affinity *auto_affi = tg->auto_affinity;
	struct auto_affinity *auto_affi = tg->auto_affinity;
	int ret = 0;

	raw_spin_lock_irq(&auto_affi->lock);


	/* auto mode*/
	/* auto mode*/
	if (mode == 1) {
	if (mode == 1) {
@@ -9540,14 +9537,10 @@ int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
	} else if (mode == 0) {
	} else if (mode == 0) {
		stop_auto_affinity(auto_affi);
		stop_auto_affinity(auto_affi);
	} else {
	} else {
		raw_spin_unlock_irq(&auto_affi->lock);
		return -EINVAL;
		return -EINVAL;
	}
	}


	auto_affi->mode = mode;
	return 0;
	raw_spin_unlock_irq(&auto_affi->lock);

	return ret;
}
}


static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
+65 −42
Original line number Original line Diff line number Diff line
@@ -28,9 +28,7 @@
#include <linux/delay.h>
#include <linux/delay.h>
#include <linux/tracehook.h>
#include <linux/tracehook.h>
#endif
#endif
#ifdef CONFIG_QOS_SCHED_SMART_GRID
#include <linux/sched/grid_qos.h>
#include <linux/sched/grid_qos.h>
#endif
#include <linux/bpf_sched.h>
#include <linux/bpf_sched.h>


/*
/*
@@ -5821,6 +5819,7 @@ static inline unsigned long cpu_util(int cpu);
static unsigned long capacity_of(int cpu);
static unsigned long capacity_of(int cpu);
static int sched_idle_cpu(int cpu);
static int sched_idle_cpu(int cpu);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static inline bool prefer_cpus_valid(struct task_struct *p);


int sysctl_affinity_adjust_delay_ms = 5000;
int sysctl_affinity_adjust_delay_ms = 5000;


@@ -5836,22 +5835,29 @@ static void smart_grid_usage_dec(void)
	static_key_slow_dec(&__smart_grid_used);
	static_key_slow_dec(&__smart_grid_used);
}
}


static void tg_update_task_prefer_cpus(struct task_group *tg)
static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
{
{
	struct affinity_domain *ad = &tg->auto_affinity->ad;
	struct affinity_domain *ad;
	struct task_struct *task;
	struct css_task_iter it;


	css_task_iter_start(&tg->css, 0, &it);
	if (!smart_grid_used())
	while ((task = css_task_iter_next(&it))) {
		return p->prefer_cpus;
		if (tg == &root_task_group && !task->mm)
			continue;


		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
	if (task_group(p)->auto_affinity->mode == 0)
		/* grid_qos must not be NULL */
		return (void *)p->cpus_ptr;
		task->grid_qos->affinity_set(task);

	ad = &task_group(p)->auto_affinity->ad;
	return ad->domains[ad->curr_level];
}
}
	css_task_iter_end(&it);

static inline int dynamic_affinity_mode(struct task_struct *p)
{
	if (!prefer_cpus_valid(p))
		return -1;

	if (smart_grid_used())
		return task_group(p)->auto_affinity->mode == 0 ? -1 : 1;

	return 0;
}
}


static void affinity_domain_up(struct task_group *tg)
static void affinity_domain_up(struct task_group *tg)
@@ -5872,8 +5878,6 @@ static void affinity_domain_up(struct task_group *tg)


	if (level == ad->dcount)
	if (level == ad->dcount)
		return;
		return;

	tg_update_task_prefer_cpus(tg);
}
}


static void affinity_domain_down(struct task_group *tg)
static void affinity_domain_down(struct task_group *tg)
@@ -5894,8 +5898,6 @@ static void affinity_domain_down(struct task_group *tg)


	if (!level)
	if (!level)
		return;
		return;

	tg_update_task_prefer_cpus(tg);
}
}


static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer)
static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer)
@@ -5961,8 +5963,6 @@ static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
	if (!smart_grid_used())
	if (!smart_grid_used())
		return 0;
		return 0;


	if (auto_affi->mode)
		tg_update_task_prefer_cpus(tg);
	return 0;
	return 0;
}
}


@@ -5980,35 +5980,41 @@ void tg_update_affinity_domains(int cpu, int online)


void start_auto_affinity(struct auto_affinity *auto_affi)
void start_auto_affinity(struct auto_affinity *auto_affi)
{
{
	struct task_group *tg = auto_affi->tg;
	ktime_t delay_ms;
	ktime_t delay_ms;


	if (auto_affi->period_active == 1)
	raw_spin_lock_irq(&auto_affi->lock);
	if (auto_affi->period_active == 1) {
		raw_spin_unlock_irq(&auto_affi->lock);
		return;
		return;

	}
	tg_update_task_prefer_cpus(tg);


	auto_affi->period_active = 1;
	auto_affi->period_active = 1;
	auto_affi->mode = 1;
	delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
	delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
	hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
	hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
	hrtimer_start_expires(&auto_affi->period_timer,
	hrtimer_start_expires(&auto_affi->period_timer,
				HRTIMER_MODE_ABS_PINNED);
				HRTIMER_MODE_ABS_PINNED);
	raw_spin_unlock_irq(&auto_affi->lock);

	smart_grid_usage_inc();
	smart_grid_usage_inc();
}
}


void stop_auto_affinity(struct auto_affinity *auto_affi)
void stop_auto_affinity(struct auto_affinity *auto_affi)
{
{
	struct task_group *tg = auto_affi->tg;
	struct affinity_domain *ad = &auto_affi->ad;
	struct affinity_domain *ad = &auto_affi->ad;


	if (auto_affi->period_active == 0)
	raw_spin_lock_irq(&auto_affi->lock);
	if (auto_affi->period_active == 0) {
		raw_spin_unlock_irq(&auto_affi->lock);
		return;
		return;
	}


	hrtimer_cancel(&auto_affi->period_timer);
	hrtimer_cancel(&auto_affi->period_timer);
	auto_affi->period_active = 0;
	auto_affi->period_active = 0;
	auto_affi->mode = 0;
	ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0;
	ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0;
	raw_spin_unlock_irq(&auto_affi->lock);


	tg_update_task_prefer_cpus(tg);
	smart_grid_usage_dec();
	smart_grid_usage_dec();
}
}


@@ -6226,6 +6232,19 @@ static void destroy_auto_affinity(struct task_group *tg)
}
}
#else
#else
static void destroy_auto_affinity(struct task_group *tg) {}
static void destroy_auto_affinity(struct task_group *tg) {}

#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
{
	return p->prefer_cpus;
}
#endif

static inline int dynamic_affinity_mode(struct task_struct *p)
{
	return 0;
}

#endif
#endif


/**************************************************
/**************************************************
@@ -7748,10 +7767,11 @@ int sysctl_sched_util_low_pct = 85;


static inline bool prefer_cpus_valid(struct task_struct *p)
static inline bool prefer_cpus_valid(struct task_struct *p)
{
{
	return p->prefer_cpus &&
	struct cpumask *prefer_cpus = task_prefer_cpus(p);
	       !cpumask_empty(p->prefer_cpus) &&

	       !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
	return !cpumask_empty(prefer_cpus) &&
	       cpumask_subset(p->prefer_cpus, p->cpus_ptr);
	       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
	       cpumask_subset(prefer_cpus, p->cpus_ptr);
}
}


static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -7786,20 +7806,23 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
	long min_util = INT_MIN;
	long min_util = INT_MIN;
	struct task_group *tg;
	struct task_group *tg;
	long spare;
	long spare;
	int cpu;
	int cpu, mode;


	p->select_cpus = p->cpus_ptr;
	rcu_read_lock();
	if (!prefer_cpus_valid(p))
	mode = dynamic_affinity_mode(p);
	if (mode == -1) {
		rcu_read_unlock();
		return;
		return;

	} else if (mode == 1) {
	if (smart_grid_used()) {
		p->select_cpus = task_prefer_cpus(p);
		p->select_cpus = p->prefer_cpus;
		if (idlest_cpu)
		if (idlest_cpu)
			*idlest_cpu = cpumask_first(p->select_cpus);
			*idlest_cpu = cpumask_first(p->select_cpus);
		sched_qos_affinity_set(p);
		rcu_read_unlock();
		return;
		return;
	}
	}


	rcu_read_lock();
	/* manual mode */
	tg = task_group(p);
	tg = task_group(p);
	for_each_cpu(cpu, p->prefer_cpus) {
	for_each_cpu(cpu, p->prefer_cpus) {
		if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) {
		if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) {
@@ -7867,13 +7890,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
	time = schedstat_start_time();
	time = schedstat_start_time();


	/*
	/*
	 * required for stable ->cpus_allowed
	 * required for stable ->cpus_ptr
	 */
	 */
	lockdep_assert_held(&p->pi_lock);
	lockdep_assert_held(&p->pi_lock);


#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
	p->select_cpus = p->cpus_ptr;
	p->select_cpus = p->cpus_ptr;
	if (dynamic_affinity_used())
	if (dynamic_affinity_used() || smart_grid_used())
		set_task_select_cpus(p, &idlest_cpu, sd_flag);
		set_task_select_cpus(p, &idlest_cpu, sd_flag);
#endif
#endif


@@ -9464,7 +9487,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)


#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
	p->select_cpus = p->cpus_ptr;
	p->select_cpus = p->cpus_ptr;
	if (dynamic_affinity_used())
	if (dynamic_affinity_used() || smart_grid_used())
		set_task_select_cpus(p, NULL, 0);
		set_task_select_cpus(p, NULL, 0);
	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
#else
#else
+10 −4
Original line number Original line Diff line number Diff line
@@ -24,20 +24,26 @@
#include <linux/sched/grid_qos.h>
#include <linux/sched/grid_qos.h>
#include "internal.h"
#include "internal.h"


static int qos_affinity_set(struct task_struct *p)
static inline int qos_affinity_set(struct task_struct *p)
{
{
	int n;
	int n;
	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;
	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;


	nodes_clear(affinity->mem_preferred_node_mask);
	if (likely(affinity->prefer_cpus == p->select_cpus))
		return 0;

	/*
	/*
	 * We want the memory allocation to be as close to the CPU
	 * We want the memory allocation to be as close to the CPU
	 * as possible, and adjust after getting memory bandwidth usage.
	 * as possible, and adjust after getting memory bandwidth usage.
	 */
	 */
	for (n = 0; n < nr_node_ids; n++)
	for (n = 0; n < nr_node_ids; n++) {
		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
		if (cpumask_intersects(cpumask_of_node(n), p->select_cpus))
			node_set(n, affinity->mem_preferred_node_mask);
			node_set(n, affinity->mem_preferred_node_mask);
		else
			node_clear(n, affinity->mem_preferred_node_mask);
	}


	affinity->prefer_cpus = p->select_cpus;
	return 0;
	return 0;
}
}