Commit b167fdff authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Load-balancing improvements:

   - Improve NUMA balancing on AMD Zen systems for affine workloads.

   - Improve the handling of reduced-capacity CPUs in load-balancing.

   - Energy Model improvements: fix & refine all the energy fairness
     metrics (PELT), and remove the conservative threshold requiring 6%
     energy savings to migrate a task. Doing this improves power
     efficiency for most workloads, and also increases the reliability
     of energy-efficiency scheduling.

   - Optimize/tweak select_idle_cpu() to spend (much) less time
     searching for an idle CPU on overloaded systems. There are reports
     of several milliseconds spent there on large systems with large
     workloads ...

     [ Since the search logic changed, there might be behavioral side
       effects. ]

   - Improve NUMA imbalance behavior. On certain systems with spare
     capacity, initial placement of tasks is non-deterministic, and such
     an artificial placement imbalance can persist for a long time,
     hurting (and sometimes helping) performance.

     The fix is to make fork-time task placement consistent with runtime
     NUMA balancing placement.

     Note that some performance regressions were reported against this,
     caused by workloads that are not memory bandwidth limited, which
     benefit from the artificial locality of the placement bug(s). Mel
     Gorman's conclusion, with which we concur, was that consistency is
     better than random workload benefits from non-deterministic bugs:

        "Given there is no crystal ball and it's a tradeoff, I think
         it's better to be consistent and use similar logic at both fork
         time and runtime even if it doesn't have universal benefit."

   - Improve core scheduling by fixing a bug in
     sched_core_update_cookie() that caused unnecessary forced idling.

   - Improve wakeup-balancing by allowing same-LLC wakeup of idle CPUs
     for newly woken tasks.

   - Fix a newidle balancing bug that introduced unnecessary wakeup
     latencies.

  ABI improvements/fixes:

   - Do not check capabilities and do not issue capability check denial
     messages when a scheduler syscall doesn't require privileges. (Such
     as increasing niceness.)

   - Add forced-idle accounting to cgroups too.

   - Fix/improve the RSEQ ABI to not just silently accept unknown flags.
     (No existing tooling is known to have learned to rely on the
     previous behavior.)

   - Deprecate the (unused) RSEQ_CS_FLAG_NO_RESTART_ON_* flags.

  Optimizations:

   - Optimize & simplify leaf_cfs_rq_list()

   - Micro-optimize set_nr_{and_not,if}_polling() via try_cmpxchg().

  Misc fixes & cleanups:

   - Fix the RSEQ self-tests on RISC-V and Glibc 2.35 systems.

   - Fix a full-NOHZ bug that can in some cases result in the tick not
     being re-enabled when the last SCHED_RT task is gone from a
     runqueue but there are still SCHED_OTHER tasks around.

   - Various PREEMPT_RT related fixes.

   - Misc cleanups & smaller fixes"

* tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  rseq: Kill process when unknown flags are encountered in ABI structures
  rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags
  sched/core: Fix the bug that task won't enqueue into core tree when update cookie
  nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt()
  sched/core: Always flush pending blk_plug
  sched/fair: fix case with reduced capacity CPU
  sched/core: Use try_cmpxchg in set_nr_{and_not,if}_polling
  sched/core: add forced idle accounting for cgroups
  sched/fair: Remove the energy margin in feec()
  sched/fair: Remove task_util from effective utilization in feec()
  sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu()
  sched/fair: Rename select_idle_mask to select_rq_mask
  sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util()
  sched/fair: Decay task PELT values during wakeup migration
  sched/fair: Provide u64 read for 32-bits arch helper
  sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
  sched: only perform capability check on privileged operation
  sched: Remove unused function group_first_cpu()
  sched/fair: Remove redundant word " *"
  selftests/rseq: check if libc rseq support is registered
  ...
parents 0dd1cabe c17a6ff9
Loading
Loading
Loading
Loading
+9 −24
Original line number Diff line number Diff line
@@ -71,34 +71,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)

static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
{
	unsigned long max = 0, sum_util = 0;
	unsigned long max, sum_util = 0;
	int cpu;

	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {

	/*
	 * The capacity is the same for all CPUs belonging to
		 * the same perf domain, so a single call to
		 * arch_scale_cpu_capacity() is enough. However, we
		 * need the CPU parameter to be initialized by the
		 * loop, so the call ends up in this block.
		 *
		 * We can initialize 'max' with a cpumask_first() call
		 * before the loop but the bits computation is not
		 * worth given the arch_scale_cpu_capacity() just
		 * returns a value where the resulting assembly code
		 * will be optimized by the compiler.
	 * the same perf domain.
	 */
		max = arch_scale_cpu_capacity(cpu);
		sum_util += sched_cpu_util(cpu, max);
	}
	max = arch_scale_cpu_capacity(cpumask_first(pd_mask));

	/*
	 * In the improbable case where all the CPUs of the perf
	 * domain are offline, 'max' will be zero and will lead to an
	 * illegal operation with a zero division.
	 */
	return max ? (power * ((sum_util << 10) / max)) >> 10 : 0;
	for_each_cpu_and(cpu, pd_mask, cpu_online_mask)
		sum_util += sched_cpu_util(cpu);

	return (power * ((sum_util << 10) / max)) >> 10;
}

static u64 get_pd_power_uw(struct dtpm *dtpm)
+2 −4
Original line number Diff line number Diff line
@@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
		    int cpu_idx)
{
	unsigned long max = arch_scale_cpu_capacity(cpu);
	unsigned long util;
	unsigned long util = sched_cpu_util(cpu);

	util = sched_cpu_util(cpu, max);
	return (util * 100) / max;
	return (util * 100) / arch_scale_cpu_capacity(cpu);
}
#else /* !CONFIG_SMP */
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
+4 −0
Original line number Diff line number Diff line
@@ -288,6 +288,10 @@ struct css_set {

struct cgroup_base_stat {
	struct task_cputime cputime;

#ifdef CONFIG_SCHED_CORE
	u64 forceidle_sum;
#endif
};

/*
+7 −0
Original line number Diff line number Diff line
@@ -28,6 +28,9 @@ enum cpu_usage_stat {
	CPUTIME_STEAL,
	CPUTIME_GUEST,
	CPUTIME_GUEST_NICE,
#ifdef CONFIG_SCHED_CORE
	CPUTIME_FORCEIDLE,
#endif
	NR_STATS,
};

@@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user);

extern void account_idle_ticks(unsigned long ticks);

#ifdef CONFIG_SCHED_CORE
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif

#endif /* _LINUX_KERNEL_STAT_H */
+1 −1
Original line number Diff line number Diff line
@@ -2257,7 +2257,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
}

/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu, unsigned long max);
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */

#ifdef CONFIG_RSEQ
Loading