Commit 3f975d0f authored by Stanley.Yang's avatar Stanley.Yang Committed by Alex Deucher
Browse files

drm/amdgpu: update athub interrupt harvesting handle



GCEA/MMHUB EA error should not result to DF freeze, this is
fixed in next generation, but for some reasons the GCEA/MMHUB
EA error will result to DF freeze in previous generation,
diver should avoid to indicate GCEA/MMHUB EA error as hw fatal
error in kernel message by read GCEA/MMHUB err status registers.

Changed from V1:
    make query_ras_error_status function more general
    make read mmhub er status register more friendly

Changed from V2:
    move ras error status query function into do_recovery workqueue

Changed from V3:
    remove useless code from V2, print GCEA error status
    instance number

Signed-off-by: default avatarStanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent d117413f
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs {
	int (*query_ras_error_count) (struct amdgpu_device *adev, void *ras_error_status);
	void (*reset_ras_error_count) (struct amdgpu_device *adev);
	void (*init_spm_golden)(struct amdgpu_device *adev);
	void (*query_ras_error_status) (struct amdgpu_device *adev);
};

struct sq_work {
+1 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs {
				uint64_t page_table_base);
	void (*update_power_gating)(struct amdgpu_device *adev,
                                bool enable);
	void (*query_ras_error_status)(struct amdgpu_device *adev);
};

struct amdgpu_mmhub {
+42 −1
Original line number Diff line number Diff line
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
	}
}

/* Parse RdRspStatus and WrRspStatus */
void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	/*
	 * Only two block need to query read/write
	 * RspStatus at current state
	 */
	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_status)
			adev->gfx.funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_status)
			adev->mmhub.funcs->query_ras_error_status(adev);
		break;
	default:
		break;
	}
}

static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		amdgpu_ras_error_status_query(adev, &info);
	}
}

/* recovery begin */

/* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
		}

		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head)
				device_list_handle, gmc.xgmi.head) {
			amdgpu_ras_query_err_status(remote_adev);
			amdgpu_ras_log_on_err_counter(remote_adev);
		}

		amdgpu_put_xgmi_hive(hive);
	}
+1 −0
Original line number Diff line number Diff line
@@ -2075,6 +2075,7 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
	.ras_error_inject = &gfx_v9_4_ras_error_inject,
	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
	.query_ras_error_status = &gfx_v9_4_query_ras_error_status,
};

static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
+29 −0
Original line number Diff line number Diff line
@@ -992,3 +992,32 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if)

	return ret;
}

static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
	{ SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };

void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
{
	uint32_t i, j;
	uint32_t reg_value;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
		return;

	mutex_lock(&adev->grbm_idx_mutex);

	for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
		for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
		     j++) {
			gfx_v9_4_select_se_sh(adev, i, 0, j);
			reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
				gfx_v9_4_rdrsp_status_regs));
			if (reg_value)
				dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
						j, reg_value);
		}
	}

	gfx_v9_4_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
	mutex_unlock(&adev->grbm_idx_mutex);
}
Loading