Commit 88f8575b authored by Dennis Li's avatar Dennis Li Committed by Alex Deucher
Browse files

drm/amdgpu: enable watchdog feature for SQ of aldebaran



SQ's watchdog timer monitors forward progress, a mask of which waves
caused the watchdog timeout is recorded into ras status registers and
then trigger a system fatal error event.

v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.

v3:
1. remove unused parameters of *enable_watchdog_timer

Signed-off-by: default avatarDennis Li <Dennis.Li@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 4abc2567
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -126,6 +126,12 @@ struct amdgpu_mgpu_info
	uint32_t			num_apu;
};

struct amdgpu_watchdog_timer
{
	bool timeout_fatal_disable;
	uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
};

#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256

/*
@@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
extern int amdgpu_bad_page_threshold;
extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
extern int amdgpu_async_gfx_ring;
extern int amdgpu_mcbp;
extern int amdgpu_discovery;
+18 −0
Original line number Diff line number Diff line
@@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = {
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = 100;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
	.timeout_fatal_disable = false,
	.period = 0x3f, /* about 8s */
};

/**
 * DOC: vramlimit (int)
@@ -530,6 +534,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);

/**
 * DOC: timeout_fatal_disable (bool)
 * Disable Watchdog timeout fatal error event
 */
MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);

/**
 * DOC: timeout_period (uint)
 * Modify the watchdog timeout max_cycles as (1 << period)
 */
MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)");
module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);

/**
 * DOC: si_support (int)
 * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
+2 −0
Original line number Diff line number Diff line
@@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
	void (*init_spm_golden)(struct amdgpu_device *adev);
	void (*query_ras_error_status) (struct amdgpu_device *adev);
	void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
	void (*query_sq_timeout_status)(struct amdgpu_device *adev);
};

struct sq_work {
+3 −0
Original line number Diff line number Diff line
@@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_status)
			adev->gfx.funcs->query_ras_error_status(adev);

		if (adev->gfx.funcs->query_sq_timeout_status)
			adev->gfx.funcs->query_sq_timeout_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_status)
+5 −0
Original line number Diff line number Diff line
@@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
	.query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
};

static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle)
	if (adev->asic_type == CHIP_ALDEBARAN)
		gfx_v9_4_2_set_power_brake_sequence(adev);

	if (adev->gfx.funcs->enable_watchdog_timer)
		adev->gfx.funcs->enable_watchdog_timer(adev);

	return r;
}

Loading