drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation (b75efe88) · Commits · EulixOS / Software / Kernel

drivers/gpu/drm/amd/amdgpu/amdgpu.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -286,6 +286,9 @@ extern int amdgpu_user_partt_mode;
		#define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
		#define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)

		/*
		 * Extra delay (in ms) applied before a SW CTF event is acted upon, so that
		 * a momentary temperature fluctuation does not trigger a shutdown.
		 */
		#define AMDGPU_SWCTF_EXTRA_DELAY 50

		struct amdgpu_xcp_mgr;
		struct amdgpu_device;
		struct amdgpu_irq_src;

drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c

+48 −0

Original line number	Diff line number	Diff line
		@@ -26,6 +26,7 @@
		#include <linux/gfp.h>
		#include <linux/slab.h>
		#include <linux/firmware.h>
		#include <linux/reboot.h>
		#include "amd_shared.h"
		#include "amd_powerplay.h"
		#include "power_state.h"
		@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
		return 0;
		}

		/*
		 * pp_swctf_delayed_work_handler - deferred re-check of a SW CTF event.
		 * @work: work item embedded in pp_hwmgr::swctf_delayed_work.
		 *
		 * Re-reads the GPU temperature and compares it against the SW CTF
		 * threshold. If the reading is confirmed to be below the threshold the
		 * event is treated as a momentary fluctuation and nothing happens. In
		 * every other case (no threshold configured, no read_sensor callback,
		 * sensor read failure, or temperature still at/above the threshold) an
		 * orderly poweroff is requested.
		 */
		static void pp_swctf_delayed_work_handler(struct work_struct *work)
		{
			struct pp_hwmgr *hwmgr =
				container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
			struct amdgpu_device *adev = hwmgr->adev;
			struct amdgpu_dpm_thermal *range = &adev->pm.dpm.thermal;
			uint32_t gpu_temperature;
			/*
			 * read_sensor() receives the size by address. Hand it the real
			 * capacity of the output buffer rather than an indeterminate
			 * value, so a callback that validates the size before writing
			 * cannot fail spuriously and force an unwanted shutdown.
			 */
			uint32_t size = sizeof(gpu_temperature);
			int ret;

			/*
			 * If the hotspot/edge temperature is confirmed as below SW CTF setting point
			 * after the delay enforced, nothing will be done.
			 * Otherwise, a graceful shutdown will be performed to prevent further damage.
			 */
			if (range->sw_ctf_threshold &&
			    hwmgr->hwmgr_func->read_sensor) {
				ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
								     AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
								     &gpu_temperature,
								     &size);
				/*
				 * For some legacy ASICs, hotspot temperature retrieving might be not
				 * supported. Check the edge temperature instead then.
				 */
				if (ret == -EOPNOTSUPP)
					ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
									     AMDGPU_PP_SENSOR_EDGE_TEMP,
									     &gpu_temperature,
									     &size);
				/*
				 * The reading is scaled down by 1000 before the comparison
				 * (presumably millidegrees vs. degrees - confirm against the
				 * sensor implementation).
				 */
				if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
					return;
			}

			/* Fail safe: anything short of a confirmed safe reading shuts down. */
			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
			orderly_poweroff(true);
		}

		static int pp_sw_init(void *handle)
		{
		struct amdgpu_device *adev = handle;
		@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)

		pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");

		if (!ret)
		INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
		pp_swctf_delayed_work_handler);

		return ret;
		}

		@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
		struct amdgpu_device *adev = handle;
		struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;

		cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

		hwmgr_hw_fini(hwmgr);

		return 0;
		@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
		struct amdgpu_device *adev = handle;
		struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;

		cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

		return hwmgr_suspend(hwmgr);
		}

drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c

+9 −18

Original line number	Diff line number	Diff line
		@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
		{
		struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
		uint32_t client_id = entry->client_id;
		uint32_t src_id = entry->src_id;

		if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
		if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
		dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
		/*
		* SW CTF just occurred.
		* Try to do a graceful shutdown to prevent further damage.
		*/
		dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
		orderly_poweroff(true);
		} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
		schedule_delayed_work(&hwmgr->swctf_delayed_work,
		msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
		} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
		dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
		else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
		} else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
		dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
		/*
		* HW CTF just occurred. Shutdown to prevent further damage.
		@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
		orderly_poweroff(true);
		}
		} else if (client_id == SOC15_IH_CLIENTID_THM) {
		if (src_id == 0) {
		dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
		/*
		* SW CTF just occurred.
		* Try to do a graceful shutdown to prevent further damage.
		*/
		dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
		orderly_poweroff(true);
		} else
		if (src_id == 0)
		schedule_delayed_work(&hwmgr->swctf_delayed_work,
		msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
		else
		dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
		} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
		dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");

drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -811,6 +811,8 @@ struct pp_hwmgr {
		bool gfxoff_state_changed_by_workload;
		uint32_t pstate_sclk_peak;
		uint32_t pstate_mclk_peak;

		struct delayed_work swctf_delayed_work;
		};

		int hwmgr_early_init(struct pp_hwmgr *hwmgr);

drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c

+34 −0

Original line number	Diff line number	Diff line
		@@ -24,6 +24,7 @@

		#include <linux/firmware.h>
		#include <linux/pci.h>
		#include <linux/reboot.h>

		#include "amdgpu.h"
		#include "amdgpu_smu.h"
		@@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
		smu->ppt_funcs->interrupt_work(smu);
		}

		/*
		 * smu_swctf_delayed_work_handler - deferred re-check of a SW CTF event.
		 * @work: work item embedded in smu_context::swctf_delayed_work.
		 *
		 * Re-reads the hotspot temperature and compares it against the software
		 * shutdown temperature. Only a successful read that is confirmed to be
		 * below that limit cancels the shutdown; a missing limit, a missing
		 * read_sensor callback, a failed read, or a reading at/above the limit
		 * all lead to an orderly poweroff.
		 */
		static void smu_swctf_delayed_work_handler(struct work_struct *work)
		{
			struct smu_context *smu =
				container_of(work, struct smu_context, swctf_delayed_work.work);
			struct smu_temperature_range *range = &smu->thermal_range;
			struct amdgpu_device *adev = smu->adev;
			uint32_t hotspot_tmp;
			/*
			 * read_sensor() receives the size by address. Pass the real
			 * capacity of the output buffer instead of an indeterminate
			 * value, in case the callback inspects it before writing.
			 */
			uint32_t size = sizeof(hotspot_tmp);

			/*
			 * If the hotspot temperature is confirmed as below SW CTF setting point
			 * after the delay enforced, nothing will be done.
			 * Otherwise, a graceful shutdown will be performed to prevent further damage.
			 */
			if (range->software_shutdown_temp &&
			    smu->ppt_funcs->read_sensor &&
			    !smu->ppt_funcs->read_sensor(smu,
							 AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
							 &hotspot_tmp,
							 &size) &&
			    hotspot_tmp / 1000 < range->software_shutdown_temp)
				return;

			/* Fail safe: anything short of a confirmed safe reading shuts down. */
			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
			orderly_poweroff(true);
		}

		static int smu_sw_init(void *handle)
		{
		struct amdgpu_device *adev = (struct amdgpu_device *)handle;
		@@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
		smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
		smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;

		INIT_DELAYED_WORK(&smu->swctf_delayed_work,
		smu_swctf_delayed_work_handler);

		ret = smu_smc_table_sw_init(smu);
		if (ret) {
		dev_err(adev->dev, "Failed to sw init smc table!\n");
		@@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
		return ret;
		}

		cancel_delayed_work_sync(&smu->swctf_delayed_work);

		ret = smu_disable_dpms(smu);
		if (ret) {
		dev_err(adev->dev, "Fail to disable dpm features!\n");