Commit dfad78e0 authored by Rafael J. Wysocki's avatar Rafael J. Wysocki
Browse files

Merge branches 'pm-sleep', 'pm-domains' and 'pm-docs'

Merge changes related to system sleep, PM domains changes and power
management documentation changes for 5.18-rc1:

 - Fix load_image_and_restore() error path (Ye Bin).

 - Fix typos in comments in the system wakeup handling code (Tom Rix).

 - Clean up non-kernel-doc comments in hibernation code (Jiapeng
   Chong).

 - Fix __setup handler error handling in system-wide suspend and
   hibernation core code (Randy Dunlap).

 - Add device name to suspend_report_result() (Youngjin Jang).

 - Make virtual guests honour ACPI S4 hardware signature by
   default (David Woodhouse).

 - Block power off of a parent PM domain unless child is in deepest
   state (Ulf Hansson).

 - Use dev_err_probe() to simplify error handling for generic PM
   domains (Ahmad Fatoum).

 - Fix sleep-in-atomic bug caused by genpd_debug_remove() (Shawn Guo).

 - Document Intel uncore frequency scaling (Srinivas Pandruvada).

* pm-sleep:
  PM: hibernate: Honour ACPI hardware signature by default for virtual guests
  PM: sleep: Add device name to suspend_report_result()
  PM: suspend: fix return value of __setup handler
  PM: hibernate: fix __setup handler error handling
  PM: hibernate: Clean up non-kernel-doc comments
  PM: sleep: wakeup: Fix typos in comments
  PM: hibernate: fix load_image_and_restore() error path

* pm-domains:
  PM: domains: Fix sleep-in-atomic bug caused by genpd_debug_remove()
  PM: domains: use dev_err_probe() to simplify error handling
  PM: domains: Prevent power off for parent unless child is in deepest state

* pm-docs:
  Documentation: admin-guide: pm: Document uncore frequency scaling
Loading
Loading
Loading
Loading
+60 −0
Original line number Diff line number Diff line
.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>

==============================
Intel Uncore Frequency Scaling
==============================

:Copyright: |copy| 2022 Intel Corporation

:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>

Introduction
------------

The uncore can consume a significant amount of power in Intel's Xeon servers
based on the workload characteristics. To optimize the total power and improve
overall performance, SoCs have internal algorithms for scaling uncore frequency.
These algorithms monitor workload usage of uncore and set a desirable frequency.

It is possible that users have different expectations of uncore performance and
want to have control over it. The objective is similar to allowing users to set
the scaling min/max frequencies via cpufreq sysfs to improve CPU performance.
Users may have some latency sensitive workloads where they do not want any
change to uncore frequency. Also, users may have workloads which require
different core and uncore performance at distinct phases and they may want to
use both cpufreq and the uncore scaling interface to distribute power and
improve overall performance.

Sysfs Interface
---------------

To control uncore frequency, a sysfs interface is provided in the directory:
`/sys/devices/system/cpu/intel_uncore_frequency/`.

There is one directory for each package and die combination as the scope of
uncore scaling control is per die in multiple die/package SoCs or per
package for single die per package SoCs. The name represents the
scope of control. For example: 'package_00_die_00' is for package id 0 and
die 0.

Each package_*_die_* contains the following attributes:

``initial_max_freq_khz``
	Out of reset, this attribute represents the maximum possible frequency.
	This is a read-only attribute. If users adjust max_freq_khz,
	they can always go back to maximum using the value from this attribute.

``initial_min_freq_khz``
	Out of reset, this attribute represents the minimum possible frequency.
	This is a read-only attribute. If users adjust min_freq_khz,
	they can always go back to minimum using the value from this attribute.

``max_freq_khz``
	This attribute is used to set the maximum uncore frequency.

``min_freq_khz``
	This attribute is used to set the minimum uncore frequency.

``current_freq_khz``
	This attribute is used to get the current uncore frequency.
+1 −0
Original line number Diff line number Diff line
@@ -15,3 +15,4 @@ Working-State Power Management
   cpufreq_drivers
   intel_epb
   intel-speed-select
   intel_uncore_frequency_scaling
+21 −2
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <asm/desc.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>
#include <asm/hypervisor.h>

#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
@@ -140,9 +141,9 @@ static int __init acpi_sleep_setup(char *str)
			acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
		if (strncmp(str, "s4_hwsig", 8) == 0)
			acpi_check_s4_hw_signature(1);
			acpi_check_s4_hw_signature = 1;
		if (strncmp(str, "s4_nohwsig", 10) == 0)
			acpi_check_s4_hw_signature(0);
			acpi_check_s4_hw_signature = 0;
#endif
		if (strncmp(str, "nonvs", 5) == 0)
			acpi_nvs_nosave();
@@ -160,3 +161,21 @@ static int __init acpi_sleep_setup(char *str)
}

__setup("acpi_sleep=", acpi_sleep_setup);

#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST)
static int __init init_s4_sigcheck(void)
{
	/*
	 * Anything other than -1 means the setting has already been chosen
	 * (e.g. by the "s4_hwsig"/"s4_nohwsig" acpi_sleep= options), so
	 * leave it alone.
	 */
	if (acpi_check_s4_hw_signature != -1)
		return 0;

	/*
	 * Still at the default: when running as a guest under a hypervisor,
	 * follow the ACPI specification and enable the FACS hardware
	 * signature check, so that a changed signature after hibernation
	 * triggers a clean reboot.
	 */
	if (!hypervisor_is_type(X86_HYPER_NATIVE))
		acpi_check_s4_hw_signature = 1;

	return 0;
}
/* This must happen before acpi_init() which is a subsys initcall */
arch_initcall(init_s4_sigcheck);
#endif
+3 −8
Original line number Diff line number Diff line
@@ -869,12 +869,7 @@ static inline void acpi_sleep_syscore_init(void) {}
#ifdef CONFIG_HIBERNATION
static unsigned long s4_hardware_signature;
static struct acpi_table_facs *facs;
static int sigcheck = -1; /* Default behaviour is just to warn */

void __init acpi_check_s4_hw_signature(int check)
{
	sigcheck = check;
}
int acpi_check_s4_hw_signature = -1; /* Default behaviour is just to warn */

static int acpi_hibernation_begin(pm_message_t stage)
{
@@ -999,7 +994,7 @@ static void acpi_sleep_hibernate_setup(void)
	hibernation_set_ops(old_suspend_ordering ?
			&acpi_hibernation_ops_old : &acpi_hibernation_ops);
	sleep_states[ACPI_STATE_S4] = 1;
	if (!sigcheck)
	if (!acpi_check_s4_hw_signature)
		return;

	acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs);
@@ -1011,7 +1006,7 @@ static void acpi_sleep_hibernate_setup(void)
		 */
		s4_hardware_signature = facs->hardware_signature;

		if (sigcheck > 0) {
		if (acpi_check_s4_hw_signature > 0) {
			/*
			 * If we're actually obeying the ACPI specification
			 * then the signature is written out as part of the
+26 −16
Original line number Diff line number Diff line
@@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
			atomic_read(&genpd->sd_count) > 0)
		return -EBUSY;

	/*
	 * The children must be in their deepest (powered-off) states to allow
	 * the parent to be powered off. Note that there's no need for
	 * additional locking, as powering on a child requires the parent's
	 * lock to be acquired first.
	 */
	list_for_each_entry(link, &genpd->parent_links, parent_node) {
		struct generic_pm_domain *child = link->child;
		if (child->state_idx < child->state_count - 1)
			return -EBUSY;
	}

	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
		enum pm_qos_flags_status stat;

@@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
	    || atomic_read(&genpd->sd_count) > 0)
		return;

	/* Check that the children are in their deepest (powered-off) state. */
	list_for_each_entry(link, &genpd->parent_links, parent_node) {
		struct generic_pm_domain *child = link->child;
		if (child->state_idx < child->state_count - 1)
			return;
	}

	/* Choose the deepest state when suspending */
	genpd->state_idx = genpd->state_count - 1;
	if (_genpd_power_off(genpd, false))
@@ -2058,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd)
		kfree(link);
	}

	genpd_debug_remove(genpd);
	list_del(&genpd->gpd_list_node);
	genpd_unlock(genpd);
	genpd_debug_remove(genpd);
	cancel_work_sync(&genpd->power_off_work);
	if (genpd_is_cpu_domain(genpd))
		free_cpumask_var(genpd->cpus);
@@ -2248,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np,
	/* Parse genpd OPP table */
	if (genpd->set_performance_state) {
		ret = dev_pm_opp_of_add_table(&genpd->dev);
		if (ret) {
			if (ret != -EPROBE_DEFER)
				dev_err(&genpd->dev, "Failed to add OPP table: %d\n",
					ret);
			return ret;
		}
		if (ret)
			return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n");

		/*
		 * Save table for faster processing while setting performance
@@ -2312,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np,
		if (genpd->set_performance_state) {
			ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i);
			if (ret) {
				if (ret != -EPROBE_DEFER)
					dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n",
						i, ret);
				dev_err_probe(&genpd->dev, ret,
					      "Failed to add OPP table for index %d\n", i);
				goto error;
			}

@@ -2672,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
	ret = genpd_add_device(pd, dev, base_dev);
	mutex_unlock(&gpd_list_lock);

	if (ret < 0) {
		if (ret != -EPROBE_DEFER)
			dev_err(dev, "failed to add to PM domain %s: %d",
				pd->name, ret);
		return ret;
	}
	if (ret < 0)
		return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name);

	dev->pm_domain->detach = genpd_dev_pm_detach;
	dev->pm_domain->sync = genpd_dev_pm_sync;
Loading