Commit ceb47e0d authored by Mukul Joshi, committed by Alex Deucher
Browse files

drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran



Fix the following issues with SDMA RAS error reporting:
1. Read the EDC_COUNTER2 register also to fetch error counts
   for all sub-blocks in SDMA.
2. SDMA RAS on Aldebaran supports single-bit uncorrectable errors
   only. So, report error count in UE count instead of CE count.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 1f0d8e37
Loading
Loading
Loading
Loading
+28 −7
Original line number Diff line number Diff line
@@ -160,6 +160,7 @@ static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
};

static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
					  uint32_t reg_offset,
					  uint32_t value,
					  uint32_t instance,
					  uint32_t *sec_count)
@@ -169,6 +170,9 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,

	/* double bits error (multiple bits) error detection is not supported */
	for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) {
		if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset)
			continue;

		/* the SDMA_EDC_COUNTER register in each sdma instance
		 * shares the same sed shift_mask
		 * */
@@ -197,13 +201,30 @@ static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,
	reg_value = RREG32(reg_offset);
	/* double bit error is not supported */
	if (reg_value)
		sdma_v4_4_get_ras_error_count(adev, reg_value, instance, &sec_count);
	/* err_data->ce_count should be initialized to 0
	 * before calling into this function */
	err_data->ce_count += sec_count;
	/* double bit error is not supported
	 * set ue count to 0 */
	err_data->ue_count = 0;
		sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
					      instance, &sec_count);

	reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
	reg_value = RREG32(reg_offset);
	/* double bit error is not supported */
	if (reg_value)
		sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
					      instance, &sec_count);

	/*
	 * err_data->ue_count should be initialized to 0
	 * before calling into this function
	 *
	 * SDMA RAS supports single bit uncorrectable error detection.
	 * So, increment uncorrectable error count.
	 */
	err_data->ue_count += sec_count;

	/*
	 * SDMA RAS does not support correctable errors.
	 * Set ce count to 0.
	 */
	err_data->ce_count = 0;

	return 0;
};