Commit 24c56ee0 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Correct the marking of kthreads which are supposed to run on a
   specific, single CPU vs such which are affine to only one CPU, mark
   per-cpu workqueue threads as such and make sure that marking
   "survives" CPU hotplug. Fix CPU hotplug issues with such kthreads.

 - A fix to not push away tasks on CPUs coming online.

 - Have workqueue CPU hotplug code use cpu_possible_mask when breaking
   affinity on CPU offlining so that pending workers can finish on newly
   arrived onlined CPUs too.

 - Dump tasks which haven't vacated a CPU which is currently being
   unplugged.

 - Register a special scale invariance callback which gets called on
   resume from RAM to read out APERF/MPERF after resume and thus make
   the schedutil scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Relax the set_cpus_allowed_ptr() semantics
  sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
  sched: Prepare to use balance_push in ttwu()
  workqueue: Restrict affinity change to rescuer
  workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
  kthread: Extract KTHREAD_IS_PER_CPU
  sched: Don't run cpu-online with balance_push() enabled
  workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
  sched/core: Print out straggler tasks in sched_cpu_dying()
  x86: PM: Register syscore_ops for scale invariance
parents 025929f4 741ba80f
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/syscore_ops.h>

#include <asm/acpi.h>
#include <asm/desc.h>
@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
	this_cpu_write(arch_prev_mperf, mperf);
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	/* Bail out if registered already. */
	if (freq_invariance_syscore_ops.node.prev)
		return;

	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void init_freq_invariance(bool secondary, bool cppc_ready)
{
	bool ret = false;
@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
	if (ret) {
		init_counter_refs();
		static_branch_enable(&arch_scale_freq_key);
		register_freq_invariance_syscore_ops();
		pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
	} else {
		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+3 −0
Original line number Diff line number Diff line
@@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
					  unsigned int cpu,
					  const char *namefmt);

void kthread_set_per_cpu(struct task_struct *k, int cpu);
bool kthread_is_per_cpu(struct task_struct *k);

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
+26 −1
Original line number Diff line number Diff line
@@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
		return p;
	kthread_bind(p, cpu);
	/* CPU hotplug need to bind once again when unparking the thread. */
	set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
	to_kthread(p)->cpu = cpu;
	return p;
}

void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
	struct kthread *kthread = to_kthread(k);
	if (!kthread)
		return;

	WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

	if (cpu < 0) {
		clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
		return;
	}

	kthread->cpu = cpu;
	set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}

bool kthread_is_per_cpu(struct task_struct *k)
{
	struct kthread *kthread = to_kthread(k);
	if (!kthread)
		return false;

	return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:		thread created by kthread_create().
+88 −23
Original line number Diff line number Diff line
@@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	/* When not in the task's cpumask, no point in looking further. */
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
	/* migrate_disabled() must be allowed to finish. */
	if (is_migration_disabled(p))
		return cpu_online(cpu);

	/* Non kernel threads are not allowed during either online or offline. */
	if (!(p->flags & PF_KTHREAD))
		return cpu_active(cpu);

	/* KTHREAD_IS_PER_CPU is always allowed. */
	if (kthread_is_per_cpu(p))
		return cpu_online(cpu);

	/* Regular kernel threads don't get to stay during offline. */
	if (cpu_rq(cpu)->balance_push)
		return false;

	/* But are allowed during online. */
	return cpu_online(cpu);
}

/*
@@ -2327,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,

	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
		/*
		 * Kernel threads are allowed on online && !active CPUs.
		 * Kernel threads are allowed on online && !active CPUs,
		 * however, during cpu-hot-unplug, even these might get pushed
		 * away if not KTHREAD_IS_PER_CPU.
		 *
		 * Specifically, migration_disabled() tasks must not fail the
		 * cpumask_any_and_distribute() pick below, esp. so on
@@ -2371,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,

	__do_set_cpus_allowed(p, new_mask, flags);

	if (p->flags & PF_KTHREAD) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	return affine_move_task(rq, p, &rf, dest_cpu, flags);

out:
@@ -3121,6 +3128,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)

static inline bool ttwu_queue_cond(int cpu, int wake_flags)
{
	/*
	 * Do not complicate things with the async wake_list while the CPU is
	 * in hotplug state.
	 */
	if (!cpu_active(cpu))
		return false;

	/*
	 * If the CPU does not share cache, then queue the task on the
	 * remote rqs wakelist to avoid accessing remote data.
@@ -7276,8 +7290,14 @@ static void balance_push(struct rq *rq)
	/*
	 * Both the cpu-hotplug and stop task are in this case and are
	 * required to complete the hotplug process.
	 *
	 * XXX: the idle task does not match kthread_is_per_cpu() due to
	 * histerical raisins.
	 */
	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
	if (rq->idle == push_task ||
	    ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
	    is_migration_disabled(push_task)) {

		/*
		 * If this is the idle task on the outgoing CPU try to wake
		 * up the hotplug control thread which might wait for the
@@ -7309,7 +7329,7 @@ static void balance_push(struct rq *rq)
	/*
	 * At this point need_resched() is true and we'll take the loop in
	 * schedule(). The next pick is obviously going to be the stop task
	 * which is_per_cpu_kthread() and will push this task away.
	 * which kthread_is_per_cpu() and will push this task away.
	 */
	raw_spin_lock(&rq->lock);
}
@@ -7320,10 +7340,13 @@ static void balance_push_set(int cpu, bool on)
	struct rq_flags rf;

	rq_lock_irqsave(rq, &rf);
	if (on)
	rq->balance_push = on;
	if (on) {
		WARN_ON_ONCE(rq->balance_callback);
		rq->balance_callback = &balance_push_callback;
	else
	} else if (rq->balance_callback == &balance_push_callback) {
		rq->balance_callback = NULL;
	}
	rq_unlock_irqrestore(rq, &rf);
}

@@ -7441,6 +7464,10 @@ int sched_cpu_activate(unsigned int cpu)
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	/*
	 * Make sure that when the hotplug state machine does a roll-back
	 * we clear balance_push. Ideally that would happen earlier...
	 */
	balance_push_set(cpu, false);

#ifdef CONFIG_SCHED_SMT
@@ -7483,17 +7510,27 @@ int sched_cpu_deactivate(unsigned int cpu)
	int ret;

	set_cpu_active(cpu, false);

	/*
	 * From this point forward, this CPU will refuse to run any task that
	 * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
	 * push those tasks away until this gets cleared, see
	 * sched_cpu_dying().
	 */
	balance_push_set(cpu, true);

	/*
	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
	 * users of this state to go away such that all new such users will
	 * observe it.
	 * We've cleared cpu_active_mask / set balance_push, wait for all
	 * preempt-disabled and RCU users of this state to go away such that
	 * all new such users will observe it.
	 *
	 * Specifically, we rely on ttwu to no longer target this CPU, see
	 * ttwu_queue_cond() and is_cpu_allowed().
	 *
	 * Do sync before park smpboot threads to take care the rcu boost case.
	 */
	synchronize_rcu();

	balance_push_set(cpu, true);

	rq_lock_irqsave(rq, &rf);
	if (rq->rd) {
		update_rq_clock(rq);
@@ -7574,6 +7611,25 @@ static void calc_load_migrate(struct rq *rq)
		atomic_long_add(delta, &calc_load_tasks);
}

static void dump_rq_tasks(struct rq *rq, const char *loglvl)
{
	struct task_struct *g, *p;
	int cpu = cpu_of(rq);

	lockdep_assert_held(&rq->lock);

	printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
	for_each_process_thread(g, p) {
		if (task_cpu(p) != cpu)
			continue;

		if (!task_on_rq_queued(p))
			continue;

		printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
	}
}

int sched_cpu_dying(unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);
@@ -7583,9 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
	sched_tick_stop(cpu);

	rq_lock_irqsave(rq, &rf);
	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
	if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
		WARN(true, "Dying CPU not properly vacated!");
		dump_rq_tasks(rq, KERN_WARNING);
	}
	rq_unlock_irqrestore(rq, &rf);

	/*
	 * Now that the CPU is offline, make sure we're welcome
	 * to new tasks once we come back up.
	 */
	balance_push_set(cpu, false);

	calc_load_migrate(rq);
	update_max_interval();
	nohz_balance_exit_idle(rq);
+1 −0
Original line number Diff line number Diff line
@@ -975,6 +975,7 @@ struct rq {
	unsigned long		cpu_capacity_orig;

	struct callback_head	*balance_callback;
	unsigned char		balance_push;

	unsigned char		nohz_idle_balance;
	unsigned char		idle_balance;
Loading