Commit 9af357bc authored by Hawking Zhang's avatar Hawking Zhang Committed by Alex Deucher
Browse files

drm/amdgpu: Add fatal error handling in nbio v4_3



GPU will stop working once fatal error is detected.
it will inform driver to do reset to recover from
the fatal error.

v2: squash in logic fix (Srinivasan)
v3: squash in logic fix (Dan)

Signed-off-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: default avatarCandice Li <candice.li@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 62f03dad
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
#include "atom.h"
#include "amdgpu_reset.h"

@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
		if (!adev->gmc.xgmi.connected_to_cpu)
			adev->nbio.ras = &nbio_v7_4_ras;
		break;
	case IP_VERSION(4, 3, 0):
		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
			/* unlike other generation of nbio ras,
			 * nbio v4_3 only support fatal error interrupt
			 * to inform software that DF is freezed due to
			 * system fatal error event. driver should not
			 * enable nbio ras in such case. Instead,
			 * check DF RAS */
			adev->nbio.ras = &nbio_v4_3_ras;
		break;
	default:
		/* nbio ras is not available */
		break;
+79 −0
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@

#include "nbio/nbio_4_3_0_offset.h"
#include "nbio/nbio_4_3_0_sh_mask.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include <uapi/linux/kfd_ioctl.h>

static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
	.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
	.get_rom_offset = nbio_v4_3_get_rom_offset,
};

static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
						       struct amdgpu_irq_src *src,
						       unsigned type,
						       enum amdgpu_interrupt_state state)
{
	/* The ras_controller_irq enablement should be done in psp bl when it
	 * tries to enable ras feature. Driver only need to set the correct interrupt
	 * vector for bare-metal and sriov use case respectively
	 */
	uint32_t bif_doorbell_int_cntl;

	bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
	bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
					      BIF_BX0_BIF_DOORBELL_INT_CNTL,
					      RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
					      (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
	WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);

	return 0;
}

static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
						 struct amdgpu_irq_src *source,
						 struct amdgpu_iv_entry *entry)
{
	/* By design, the ih cookie for err_event_athub_irq should be written
	 * to bif ring. since bif ring is not enabled, just leave process callback
	 * as a dummy one.
	 */
	return 0;
}

static const struct amdgpu_irq_src_funcs nbio_v4_3_ras_err_event_athub_irq_funcs = {
	.set = nbio_v4_3_set_ras_err_event_athub_irq_state,
	.process = nbio_v4_3_process_err_event_athub_irq,
};

static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
{
	uint32_t bif_doorbell_int_cntl;

	bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
	if (REG_GET_FIELD(bif_doorbell_int_cntl,
			  BIF_DOORBELL_INT_CNTL,
			  RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
		/* driver has to clear the interrupt status when bif ring is disabled */
		bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
						BIF_DOORBELL_INT_CNTL,
						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
		amdgpu_ras_global_ras_isr(adev);
	}
}

static int nbio_v4_3_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev)
{

	int r;

	/* init the irq funcs */
	adev->nbio.ras_err_event_athub_irq.funcs =
		&nbio_v4_3_ras_err_event_athub_irq_funcs;
	adev->nbio.ras_err_event_athub_irq.num_types = 1;

	/* register ras err event athub interrupt
	 * nbio v4_3 uses the same irq source as nbio v7_4 */
	r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
			      NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
			      &adev->nbio.ras_err_event_athub_irq);

	return r;
}

struct amdgpu_nbio_ras nbio_v4_3_ras = {
	.handle_ras_err_event_athub_intr_no_bifring = nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring,
	.init_ras_err_event_athub_interrupt = nbio_v4_3_init_ras_err_event_athub_interrupt,
};
+1 −0
Original line number Diff line number Diff line
@@ -29,5 +29,6 @@
extern const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg;
extern const struct amdgpu_nbio_funcs nbio_v4_3_funcs;
extern const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs;
extern struct amdgpu_nbio_ras nbio_v4_3_ras;

#endif
+14 −1
Original line number Diff line number Diff line
@@ -754,6 +754,14 @@ static int soc21_common_late_init(void *handle)
							     sriov_vcn_4_0_0_video_codecs_decode_array_vcn0,
							     ARRAY_SIZE(sriov_vcn_4_0_0_video_codecs_decode_array_vcn0));
		}
	} else {
		if (adev->nbio.ras &&
		    adev->nbio.ras_err_event_athub_irq.funcs)
			/* don't need to fail gpu late init
			 * if enabling athub_err_event interrupt failed
			 * nbio v4_3 only support fatal error hanlding
			 * just enable the interrupt directly */
			amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
	}

	return 0;
@@ -801,8 +809,13 @@ static int soc21_common_hw_fini(void *handle)
	/* disable the doorbell aperture */
	soc21_enable_doorbell_aperture(adev, false);

	if (amdgpu_sriov_vf(adev))
	if (amdgpu_sriov_vf(adev)) {
		xgpu_nv_mailbox_put_irq(adev);
	} else {
		if (adev->nbio.ras &&
		    adev->nbio.ras_err_event_athub_irq.funcs)
			amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
	}

	return 0;
}