Commit 2a460963 authored by Candice Li's avatar Candice Li Committed by Alex Deucher
Browse files

drm/amdgpu: Resolve RAS GFX error count issue after cold boot on Arcturus



Adjust the sequence for ras late init and separate ras reset error status
from query status.

v2: squash in fix from Candice

Signed-off-by: default avatarCandice Li <candice.li@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 28caf8c4
Loading
Loading
Loading
Loading
+6 −3
Original line number Diff line number Diff line
@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;
	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		if (!amdgpu_persistent_edc_harvesting_supported(adev))
			amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);

		r = amdgpu_ras_block_late_init(adev, ras_block);
		if (r)
			return r;

		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
		if (r)
			goto late_fini;
	} else {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
	}

	return 0;
+22 −5
Original line number Diff line number Diff line
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			DRM_WARN("Failed to reset error counter and error status");
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
		}
	}

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		amdgpu_ras_reset_error_status(adev, info->head.block);

	return 0;
}

@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		if (res)
			return res;

		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}

		ce += info.ce_count;
		ue += info.ue_count;
	}
@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
			continue;

		amdgpu_ras_query_error_status(adev, &info);

		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}
	}
}