Commit fec8c524 authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: save error count in RAS poison handler



Otherwise the RAS error count couldn't be queried from sysfs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 45e3d1db
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo

	/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
	if (!adev->gmc.xgmi.connected_to_cpu)
		amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
		amdgpu_umc_poison_handler(adev, &err_data, reset);
	else if (reset)
		amdgpu_amdkfd_gpu_reset(adev);
}
+95 −73
Original line number Diff line number Diff line
@@ -23,79 +23,7 @@

#include "amdgpu_ras.h"

static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}

/*
 * amdgpu_umc_ras_late_init - register the UMC block with the RAS core.
 *
 * Allocates adev->umc.ras_if on first use, then registers the
 * "umc_err_count" sysfs node and the UMC interrupt callback through
 * amdgpu_ras_late_init().  When RAS is supported for the block it also
 * calls amdgpu_irq_get() on adev->gmc.ecc_irq and runs the optional
 * version-specific err_cnt_init hook.
 *
 * Returns 0 on success.  0 is also returned when RAS is not supported for
 * the UMC block; in that case adev->umc.ras_if is freed and reset to NULL.
 * Returns -ENOMEM if the allocation fails, otherwise the error code from
 * amdgpu_ras_late_init() or amdgpu_irq_get().
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	if (!adev->umc.ras_if) {
		/*
		 * Zero the allocation: the struct is copied by value into
		 * fs_info.head / ih_info.head below, so any field that is
		 * not explicitly assigned here must not carry heap garbage.
		 */
		adev->umc.ras_if =
			kzalloc(sizeof(*adev->umc.ras_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
	}
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* Not an error: RAS is simply unavailable for this block. */
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras_funcs &&
	    adev->umc.ras_funcs->err_cnt_init)
		adev->umc.ras_funcs->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}

void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
			adev->umc.ras_if) {
		struct ras_common_if *ras_if = adev->umc.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_umc_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset)
@@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
	return AMDGPU_RAS_SUCCESS;
}

/*
 * amdgpu_umc_poison_handler - page retirement for a poison event, with
 * the error counts saved in the UMC RAS object.
 *
 * @adev:             device the event belongs to
 * @ras_error_status: points to a struct ras_err_data that is handed to
 *                    amdgpu_umc_do_page_retirement() and read afterwards
 * @reset:            forwarded unchanged to amdgpu_umc_do_page_retirement()
 *
 * Runs amdgpu_umc_do_page_retirement() without an interrupt vector entry.
 * If that returns AMDGPU_RAS_SUCCESS and a RAS object exists for the UMC
 * block, the ue/ce counts from @ras_error_status are added to the object's
 * err_data.  Per the commit description, this is what lets the RAS error
 * count be queried from sysfs.
 *
 * Returns the value returned by amdgpu_umc_do_page_retirement().
 */
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
		void *ras_error_status,
		bool reset)
{
	struct ras_common_if umc_head = {
		.block = AMDGPU_RAS_BLOCK__UMC,
	};
	struct ras_err_data *counts = ras_error_status;
	struct ras_manager *umc_obj = amdgpu_ras_find_obj(adev, &umc_head);
	int status;

	status = amdgpu_umc_do_page_retirement(adev, ras_error_status,
					       NULL, reset);

	/* Nothing to record on failure or when no UMC RAS object exists. */
	if (status != AMDGPU_RAS_SUCCESS || !umc_obj)
		return status;

	umc_obj->err_data.ue_count += counts->ue_count;
	umc_obj->err_data.ce_count += counts->ce_count;

	return status;
}

static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}

/*
 * amdgpu_umc_ras_late_init - register the UMC block with the RAS core.
 *
 * Allocates adev->umc.ras_if on first use, then registers the
 * "umc_err_count" sysfs node and the UMC interrupt callback through
 * amdgpu_ras_late_init().  When RAS is supported for the block it also
 * calls amdgpu_irq_get() on adev->gmc.ecc_irq and runs the optional
 * version-specific err_cnt_init hook.
 *
 * Returns 0 on success.  0 is also returned when RAS is not supported for
 * the UMC block; in that case adev->umc.ras_if is freed and reset to NULL.
 * Returns -ENOMEM if the allocation fails, otherwise the error code from
 * amdgpu_ras_late_init() or amdgpu_irq_get().
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	if (!adev->umc.ras_if) {
		/*
		 * Zero the allocation: the struct is copied by value into
		 * fs_info.head / ih_info.head below, so any field that is
		 * not explicitly assigned here must not carry heap garbage.
		 */
		adev->umc.ras_if =
			kzalloc(sizeof(*adev->umc.ras_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
	}
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* Not an error: RAS is simply unavailable for this block. */
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras_funcs &&
	    adev->umc.ras_funcs->err_cnt_init)
		adev->umc.ras_funcs->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}

void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
			adev->umc.ras_if) {
		struct ras_common_if *ras_if = adev->umc.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_umc_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
+1 −2
Original line number Diff line number Diff line
@@ -78,9 +78,8 @@ struct amdgpu_umc {

int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,