Commit 6475ae2b authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)



Add helper functions to query and reset the RAS UTCL2 poison status.

v2: implement it on amdgpu side and kfd only calls it.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9d8a8d78
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
	else if (reset)
		amdgpu_amdkfd_gpu_reset(adev);
}

/**
 * amdgpu_amdkfd_ras_query_utcl2_poison_status - query UTCL2 RAS poison status
 * @adev: amdgpu device to query
 *
 * Forwards the query to the GFX RAS backend's query_utcl2_poison_status
 * callback when one is registered.
 *
 * Return: the value reported by the backend callback, or false when the
 * device has no GFX RAS object or the backend does not implement the query.
 */
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
{
	/*
	 * gfx.ras is a pointer and may be left unset on devices without a
	 * GFX RAS backend, so check it before looking at the callback.
	 */
	if (!adev->gfx.ras || !adev->gfx.ras->query_utcl2_poison_status)
		return false;

	return adev->gfx.ras->query_utcl2_poison_status(adev);
}
+1 −0
Original line number Diff line number Diff line
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
/*
 * Query the UTCL2 RAS poison status through the GFX RAS backend callback;
 * returns false when the backend provides no such callback.
 */
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);

#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
+1 −0
Original line number Diff line number Diff line
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
/* GFX RAS descriptor: a RAS block object plus GFX-specific callbacks. */
struct amdgpu_gfx_ras {
	/* embedded RAS block object for this GFX IP */
	struct amdgpu_ras_block_object  ras_block;
	/* optional backend callback; NOTE(review): semantics not visible here */
	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
	/*
	 * Optional backend callback returning the UTCL2 poison status.
	 * May be NULL; amdgpu_amdkfd_ras_query_utcl2_poison_status() then
	 * reports false.
	 */
	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
};

struct amdgpu_gfx_funcs {
+14 −0
Original line number Diff line number Diff line
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
	mutex_unlock(&adev->grbm_idx_mutex);
}

/*
 * Read the GFX hub's L2 protection fault status, reset the page fault
 * status, and report whether the FED field was set in the value read.
 */
static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
{
	struct amdgpu_vmhub *gfxhub = &adev->vmhub[AMDGPU_GFXHUB_0];
	/* sample the status first: the write below resets it */
	u32 fault_status = RREG32(gfxhub->vm_l2_pro_fault_status);

	/* reset page fault status */
	WREG32_P(gfxhub->vm_l2_pro_fault_cntl, 1, ~1);

	return REG_GET_FIELD(fault_status, VM_L2_PROTECTION_FAULT_STATUS, FED);
}

struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
		.hw_ops = &gfx_v9_4_2_ras_ops,
	},
	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
	.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
};