Commit cbe4d43e authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: add RAS page retirement functions for MCA



Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
    let MCA notifier do page retirement.

v3: remove specific poison handler for MCA to simplify code.

Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 25135748
Loading
Loading
Loading
Loading
+53 −0
Original line number Diff line number Diff line
@@ -22,6 +22,59 @@
 */

#include "amdgpu.h"
#include "umc_v6_7.h"

static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
				    struct ras_err_data *err_data, uint64_t err_addr,
				    uint32_t ch_inst, uint32_t umc_inst)
{
	switch (adev->ip_versions[UMC_HWIP][0]) {
	case IP_VERSION(6, 7, 0):
		umc_v6_7_convert_error_address(adev,
				err_data, err_addr, ch_inst, umc_inst);
		break;
	default:
		dev_warn(adev->dev,
			 "UMC address to Physical address translation is not supported\n");
		return AMDGPU_RAS_FAIL;
	}

	return AMDGPU_RAS_SUCCESS;
}

int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int ret = AMDGPU_RAS_FAIL;

	err_data.err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
			sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			"Failed to alloc memory for umc error record in MCA notifier!\n");
		return AMDGPU_RAS_FAIL;
	}

	/*
	 * Translate UMC channel address to Physical address
	 */
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					ch_inst, umc_inst);
	if (ret)
		goto out;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
						err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev);
	}

out:
	kfree(err_data.err_addr);
	return ret;
}

static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
+2 −0
Original line number Diff line number Diff line
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry);
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
#endif