Commit a46751fb authored by Luben Tuikov, committed by Alex Deucher
Browse files

drm/amdgpu: Fix RAS function interface



The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2871e101
Loading
Loading
Loading
Loading
+15 −8
Original line number Diff line number Diff line
@@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}

/*
 * Get the total error counts on all IPs: walk every ras_manager object on
 * con->head, query its error status, and sum the correctable (ce) and
 * uncorrectable (ue) counts.
 *
 * NOTE: this is a rendered diff without +/- markers, so removed and added
 * lines are interleaved. The variant returning unsigned long and taking
 * "bool is_ce" (with "data" and "return 0") is the removed code; the void
 * variant taking ce_count/ue_count (with "ce"/"ue" and bare "return") is
 * the added code.
 *
 * In the added variant the totals are stored through ce_count and ue_count;
 * either pointer may be NULL. On the early-return paths nothing is stored,
 * so the caller's values are left unchanged.
 */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				  unsigned long *ce_count,
				  unsigned long *ue_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};
	unsigned long ce, ue;

	/* Bail out when ras_enabled is unset or there is no RAS context. */
	if (!adev->ras_enabled || !con)
		return 0;
		return;

	ce = 0;
	ue = 0;
	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/* A failed query for any object abandons the whole sum. */
		if (amdgpu_ras_query_error_status(adev, &info))
			return 0;
			return;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
		ce += info.ce_count;
		ue += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
	/* Store only the totals the caller asked for (non-NULL pointers). */
	if (ce_count)
		*ce_count = ce;

	if (ue_count)
		*ue_count = ue;
}
/* query/inject/cure end */

+3 −2
Original line number Diff line number Diff line
@@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);

/*
 * Diff view without +/- markers: the two-line prototype returning
 * unsigned long is the removed declaration; the three-line void prototype
 * is its replacement, taking ce_count and ue_count output pointers.
 */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce);
void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				  unsigned long *ce_count,
				  unsigned long *ue_count);

/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,