Commit 7692e1ee authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: add RAS fatal error handler for NBIO v7.9



Register RAS fatal error interrupt and add handler.

v2: only register NBIO RAS for dGPU platform.
    change nbio_v7_9_set_ras_controller_irq_state and nbio_v7_9_set_ras_err_event_athub_irq_state
    to dummy functions.

Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 657db07b
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
#include "nbio_v7_9.h"
#include "atom.h"
#include "amdgpu_reset.h"

@@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
			 * check DF RAS */
			adev->nbio.ras = &nbio_v4_3_ras;
		break;
	case IP_VERSION(7, 9, 0):
		if (!adev->gmc.is_app_apu)
			adev->nbio.ras = &nbio_v7_9_ras;
		break;
	default:
		/* nbio ras is not available */
		break;
+187 −0
Original line number Diff line number Diff line
@@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
	.init_registers = nbio_v7_9_init_registers,
	.get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count,
};

static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	return;
}

static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
{
	uint32_t bif_doorbell_intr_cntl;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);

	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
		/* driver has to clear the interrupt status when bif ring is disabled */
		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
						BIF_BX0_BIF_DOORBELL_INT_CNTL,
						RAS_CNTLR_INTERRUPT_CLEAR, 1);
		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);

		if (!ras->disable_ras_err_cnt_harvest) {
			/*
			 * clear error status after ras_controller_intr
			 * according to hw team and count ue number
			 * for query
			 */
			nbio_v7_9_query_ras_error_count(adev, &err_data);

			/* logging on error cnt and printing for awareness */
			obj->err_data.ue_count += err_data.ue_count;
			obj->err_data.ce_count += err_data.ce_count;

			if (err_data.ce_count)
				dev_info(adev->dev, "%ld correctable hardware "
						"errors detected in %s block, "
						"no user action is needed.\n",
						obj->err_data.ce_count,
						get_ras_block_str(adev->nbio.ras_if));

			if (err_data.ue_count)
				dev_info(adev->dev, "%ld uncorrectable hardware "
						"errors detected in %s block\n",
						obj->err_data.ue_count,
						get_ras_block_str(adev->nbio.ras_if));
		}

		dev_info(adev->dev, "RAS controller interrupt triggered "
					"by NBIF error\n");

		/* ras_controller_int is dedicated for nbif ras error,
		 * not the global interrupt for sync flood
		 */
		amdgpu_ras_reset_gpu(adev);
	}
}

static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
{
	uint32_t bif_doorbell_intr_cntl;

	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);

	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
		/* driver has to clear the interrupt status when bif ring is disabled */
		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
						BIF_BX0_BIF_DOORBELL_INT_CNTL,
						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);

		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);

		amdgpu_ras_global_ras_isr(adev);
	}
}

static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,
						  struct amdgpu_irq_src *src,
						  unsigned type,
						  enum amdgpu_interrupt_state state)
{
	/* Dummy function, there is no initialization operation in driver */

	return 0;
}

static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
						struct amdgpu_irq_src *source,
						struct amdgpu_iv_entry *entry)
{
	/* By design, the ih cookie for ras_controller_irq should be written
	 * to BIFring instead of general iv ring. However, due to known bif ring
	 * hw bug, it has to be disabled. There is no chance the process function
	 * will be involked. Just left it as a dummy one.
	 */
	return 0;
}

static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
						       struct amdgpu_irq_src *src,
						       unsigned type,
						       enum amdgpu_interrupt_state state)
{
	/* Dummy function, there is no initialization operation in driver */

	return 0;
}

static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
						 struct amdgpu_irq_src *source,
						 struct amdgpu_iv_entry *entry)
{
	/* By design, the ih cookie for err_event_athub_irq should be written
	 * to BIFring instead of general iv ring. However, due to known bif ring
	 * hw bug, it has to be disabled. There is no chance the process function
	 * will be involked. Just left it as a dummy one.
	 */
	return 0;
}

static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
	.set = nbio_v7_9_set_ras_controller_irq_state,
	.process = nbio_v7_9_process_ras_controller_irq,
};

static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
	.set = nbio_v7_9_set_ras_err_event_athub_irq_state,
	.process = nbio_v7_9_process_err_event_athub_irq,
};

static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device *adev)
{
	int r;

	/* init the irq funcs */
	adev->nbio.ras_controller_irq.funcs =
		&nbio_v7_9_ras_controller_irq_funcs;
	adev->nbio.ras_controller_irq.num_types = 1;

	/* register ras controller interrupt */
	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
			      NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT,
			      &adev->nbio.ras_controller_irq);

	return r;
}

static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device *adev)
{

	int r;

	/* init the irq funcs */
	adev->nbio.ras_err_event_athub_irq.funcs =
		&nbio_v7_9_ras_err_event_athub_irq_funcs;
	adev->nbio.ras_err_event_athub_irq.num_types = 1;

	/* register ras err event athub interrupt */
	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
			      NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
			      &adev->nbio.ras_err_event_athub_irq);

	return r;
}

const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
	.query_ras_error_count = nbio_v7_9_query_ras_error_count,
};

struct amdgpu_nbio_ras nbio_v7_9_ras = {
	.ras_block = {
		.ras_comm = {
			.name = "pcie_bif",
			.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		},
		.hw_ops = &nbio_v7_9_ras_hw_ops,
		.ras_late_init = amdgpu_nbio_ras_late_init,
	},
	.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
	.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
	.init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt,
	.init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt,
};
+1 −0
Original line number Diff line number Diff line
@@ -28,5 +28,6 @@

extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg;
extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs;
extern struct amdgpu_nbio_ras nbio_v7_9_ras;

#endif