Commit 8cf12507 authored by xinhui pan's avatar xinhui pan Committed by Alex Deucher
Browse files

drm/amdgpu: enable ras on sdma4



Register the interrupt handler (IH) and enable RAS features on SDMA.
Create the sysfs and debugfs files for SDMA.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
Signed-off-by: Feifei Xu <Feifei.Xu@amd.com>
Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2be4c4a9
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -30,6 +30,8 @@
/*
 * Interrupt type indices for the SDMA interrupt sources.
 * AMDGPU_SDMA_IRQ_LAST is a sentinel equal to the number of real entries.
 */
enum amdgpu_sdma_irq {
	/* trap interrupts, one entry per SDMA instance */
	AMDGPU_SDMA_IRQ_TRAP0 = 0,
	AMDGPU_SDMA_IRQ_TRAP1 = 1,
	/* ECC interrupts, one entry per SDMA instance */
	AMDGPU_SDMA_IRQ_ECC0 = 2,
	AMDGPU_SDMA_IRQ_ECC1 = 3,

	/* keep last: follows the final real entry automatically */
	AMDGPU_SDMA_IRQ_LAST
};
@@ -49,9 +51,11 @@ struct amdgpu_sdma {
	struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
	struct amdgpu_irq_src	trap_irq;
	struct amdgpu_irq_src	illegal_inst_irq;
	struct amdgpu_irq_src	ecc_irq;
	int			num_instances;
	uint32_t                    srbm_soft_reset;
	bool			has_page_queue;
	struct ras_common_if	*ras_if;
};

/*
+183 −1
Original line number Diff line number Diff line
@@ -41,6 +41,8 @@
#include "ivsrcid/sdma0/irqsrcs_sdma0_4_0.h"
#include "ivsrcid/sdma1/irqsrcs_sdma1_4_0.h"

#include "amdgpu_ras.h"

MODULE_FIRMWARE("amdgpu/vega10_sdma.bin");
MODULE_FIRMWARE("amdgpu/vega10_sdma1.bin");
MODULE_FIRMWARE("amdgpu/vega12_sdma.bin");
@@ -1493,6 +1495,83 @@ static int sdma_v4_0_early_init(void *handle)
	return 0;
}

static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
		struct amdgpu_iv_entry *entry);

/**
 * sdma_v4_0_late_init - set up RAS handling for the SDMA block
 *
 * @handle: amdgpu_device pointer, passed as an opaque handle
 *
 * When RAS is not supported for SDMA, the feature is explicitly disabled
 * and nothing else is set up. Otherwise a heap copy of the RAS block
 * description is stored in adev->sdma.ras_if, the feature is enabled, the
 * RAS interrupt handler and the debugfs/sysfs entries are registered, and
 * a reference is taken on both ECC interrupt types.
 *
 * Returns 0 on success. On failure everything set up so far is undone in
 * reverse order, adev->sdma.ras_if is reset to NULL, and the error code of
 * the step that failed is returned.
 */
static int sdma_v4_0_late_init(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	struct ras_common_if **ras_if = &adev->sdma.ras_if;
	struct ras_ih_if ih_info = {
		.cb = sdma_v4_0_process_ras_data_cb,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "sdma_err_count",
		.debugfs_name = "sdma_err_inject",
	};
	struct ras_common_if ras_block = {
		.block = AMDGPU_RAS_BLOCK__SDMA,
		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		.sub_block_index = 0,
		.name = "sdma",
	};
	int r;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
		/* Not supported: make sure the feature is turned off. */
		amdgpu_ras_feature_enable(adev, &ras_block, 0);
		return 0;
	}

	*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
	if (!*ras_if)
		return -ENOMEM;

	**ras_if = ras_block;

	r = amdgpu_ras_feature_enable(adev, *ras_if, 1);
	if (r)
		goto feature;

	ih_info.head = **ras_if;
	fs_info.head = **ras_if;

	r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
	if (r)
		goto interrupt;

	r = amdgpu_ras_debugfs_create(adev, &fs_info);
	if (r)
		goto debugfs;

	r = amdgpu_ras_sysfs_create(adev, &fs_info);
	if (r)
		goto sysfs;

	r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0);
	if (r)
		goto irq;

	r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC1);
	if (r) {
		/* ECC0 was already acquired; drop it before unwinding. */
		amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0);
		goto irq;
	}

	return 0;

	/* Each label undoes the step that succeeded before the failing one. */
irq:
	amdgpu_ras_sysfs_remove(adev, *ras_if);
sysfs:
	amdgpu_ras_debugfs_remove(adev, *ras_if);
debugfs:
	amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, *ras_if, 0);
feature:
	kfree(*ras_if);
	*ras_if = NULL;
	/* Propagate the real failure code instead of a blanket -EINVAL. */
	return r;
}

static int sdma_v4_0_sw_init(void *handle)
{
	struct amdgpu_ring *ring;
@@ -1511,6 +1590,18 @@ static int sdma_v4_0_sw_init(void *handle)
	if (r)
		return r;

	/* SDMA SRAM ECC event */
	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA0, SDMA0_4_0__SRCID__SDMA_SRAM_ECC,
			&adev->sdma.ecc_irq);
	if (r)
		return r;

	/* SDMA SRAM ECC event */
	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA1, SDMA1_4_0__SRCID__SDMA_SRAM_ECC,
			&adev->sdma.ecc_irq);
	if (r)
		return r;

	for (i = 0; i < adev->sdma.num_instances; i++) {
		ring = &adev->sdma.instance[i].ring;
		ring->ring_obj = NULL;
@@ -1561,6 +1652,22 @@ static int sdma_v4_0_sw_fini(void *handle)
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	int i;

	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA) &&
			adev->sdma.ras_if) {
		struct ras_common_if *ras_if = adev->sdma.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
		};

		/*remove fs first*/
		amdgpu_ras_debugfs_remove(adev, ras_if);
		amdgpu_ras_sysfs_remove(adev, ras_if);
		/*remove the IH*/
		amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
		amdgpu_ras_feature_enable(adev, ras_if, 0);
		kfree(ras_if);
	}

	for (i = 0; i < adev->sdma.num_instances; i++) {
		amdgpu_ring_fini(&adev->sdma.instance[i].ring);
		if (adev->sdma.has_page_queue)
@@ -1598,6 +1705,9 @@ static int sdma_v4_0_hw_fini(void *handle)
	if (amdgpu_sriov_vf(adev))
		return 0;

	amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0);
	amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC1);

	sdma_v4_0_ctx_switch_enable(adev, false);
	sdma_v4_0_enable(adev, false);

@@ -1714,6 +1824,50 @@ static int sdma_v4_0_process_trap_irq(struct amdgpu_device *adev,
	return 0;
}

/**
 * sdma_v4_0_process_ras_data_cb - RAS callback for SDMA ECC interrupts
 *
 * @adev: amdgpu_device pointer
 * @entry: interrupt vector entry being processed
 *
 * Entries whose client id is not SDMA0/SDMA1, or whose source id is not one
 * of the two SDMA ECC sources, are ignored and 0 is returned. For any other
 * entry a GPU reset is requested through amdgpu_ras_reset_gpu() and
 * AMDGPU_RAS_UE is returned.
 */
static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
		struct amdgpu_iv_entry *entry)
{
	/* Only interrupts raised by one of the SDMA IH clients are handled. */
	switch (entry->client_id) {
	case SOC15_IH_CLIENTID_SDMA0:
	case SOC15_IH_CLIENTID_SDMA1:
		break;
	default:
		return 0;
	}

	/* Only the ECC interrupt sources are handled. */
	switch (entry->src_id) {
	case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
	case SDMA0_4_0__SRCID__SDMA_ECC:
		break;
	default:
		return 0;
	}

	amdgpu_ras_reset_gpu(adev, 0);

	return AMDGPU_RAS_UE;
}

static int sdma_v4_0_process_ecc_irq(struct amdgpu_device *adev,
				      struct amdgpu_irq_src *source,
				      struct amdgpu_iv_entry *entry)
{
	struct ras_dispatch_if ih_data = {
		.head = *adev->sdma.ras_if,
		.entry = entry,
	};
	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
					      struct amdgpu_irq_src *source,
					      struct amdgpu_iv_entry *entry)
@@ -1741,6 +1895,25 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
	return 0;
}

/**
 * sdma_v4_0_set_ecc_irq_state - enable or disable the SDMA ECC interrupt
 *
 * @adev: amdgpu_device pointer
 * @source: interrupt source (unused)
 * @type: interrupt type; AMDGPU_SDMA_IRQ_ECC0 selects instance 0, every
 *        other value selects instance 1
 * @state: requested interrupt state
 *
 * Read-modify-writes the ECC_INT_ENABLE field of the selected instance's
 * EDC_CONFIG register. Always returns 0.
 */
static int sdma_v4_0_set_ecc_irq_state(struct amdgpu_device *adev,
					struct amdgpu_irq_src *source,
					unsigned type,
					enum amdgpu_interrupt_state state)
{
	u32 edc_config_reg;
	u32 val;

	if (type == AMDGPU_SDMA_IRQ_ECC0)
		edc_config_reg = sdma_v4_0_get_reg_offset(adev, 0, mmSDMA0_EDC_CONFIG);
	else
		edc_config_reg = sdma_v4_0_get_reg_offset(adev, 1, mmSDMA0_EDC_CONFIG);

	val = RREG32(edc_config_reg);
	val = REG_SET_FIELD(val, SDMA0_EDC_CONFIG, ECC_INT_ENABLE,
			    state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
	WREG32(edc_config_reg, val);

	return 0;
}

static void sdma_v4_0_update_medium_grain_clock_gating(
		struct amdgpu_device *adev,
		bool enable)
@@ -1906,7 +2079,7 @@ static void sdma_v4_0_get_clockgating_state(void *handle, u32 *flags)
const struct amd_ip_funcs sdma_v4_0_ip_funcs = {
	.name = "sdma_v4_0",
	.early_init = sdma_v4_0_early_init,
	.late_init = NULL,
	.late_init = sdma_v4_0_late_init,
	.sw_init = sdma_v4_0_sw_init,
	.sw_fini = sdma_v4_0_sw_fini,
	.hw_init = sdma_v4_0_hw_init,
@@ -2008,11 +2181,20 @@ static const struct amdgpu_irq_src_funcs sdma_v4_0_illegal_inst_irq_funcs = {
	.process = sdma_v4_0_process_illegal_inst_irq,
};

/*
 * Callbacks for the SDMA ECC interrupt source: .set toggles the interrupt
 * enable bit, .process dispatches a received interrupt to the RAS code.
 */
static const struct amdgpu_irq_src_funcs sdma_v4_0_ecc_irq_funcs = {
	.set = sdma_v4_0_set_ecc_irq_state,
	.process = sdma_v4_0_process_ecc_irq,
};



static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev)
{
	adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_LAST;
	adev->sdma.trap_irq.funcs = &sdma_v4_0_trap_irq_funcs;
	adev->sdma.illegal_inst_irq.funcs = &sdma_v4_0_illegal_inst_irq_funcs;
	adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_LAST;
	adev->sdma.ecc_irq.funcs = &sdma_v4_0_ecc_irq_funcs;
}

/**