Commit 950d6425 authored by Stanley.Yang's avatar Stanley.Yang Committed by Alex Deucher
Browse files

drm/amdgpu: support ras on SRIOV



support umc/gfx/sdma ras on guest side

Changed from V1:
    move sriov judgment in amdgpu_ras_interrupt_fatal_error_handler

Signed-off-by: default avatarStanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 2c270d3e
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -5219,6 +5219,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran supports ras in SRIOV, so need resume ras during reset */
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
		if (r && r == -EAGAIN)
+30 −12
Original line number Diff line number Diff line
@@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));

	if (!amdgpu_ras_intr_triggered()) {
	/* Only enable ras feature operation handle on host side */
	if (!amdgpu_sriov_vf(adev) &&
		!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
@@ -1523,7 +1525,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 */
void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
{
	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
	/* Fatal error events are handled on host side */
	if (amdgpu_sriov_vf(adev) ||
		!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
		return;

	if (adev->nbio.ras &&
@@ -2270,10 +2274,14 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
	if (!adev->is_atom_fw ||
	    !amdgpu_ras_asic_supported(adev))
		return;

	if (!(amdgpu_sriov_vf(adev) &&
		(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
		return;

	if (!adev->gmc.xgmi.connected_to_cpu) {
		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
			dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2285,6 +2293,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)

		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
			dev_info(adev->dev, "SRAM ECC is active.\n");
			if (!amdgpu_sriov_vf(adev)) {
				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
							    1 << AMDGPU_RAS_BLOCK__DF);

@@ -2294,6 +2303,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
				else
					adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
							1 << AMDGPU_RAS_BLOCK__JPEG);
			} else {
				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
								1 << AMDGPU_RAS_BLOCK__SDMA |
								1 << AMDGPU_RAS_BLOCK__GFX);
			}
		} else {
			dev_info(adev->dev, "SRAM ECC is not presented.\n");
		}
@@ -2637,6 +2651,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
	struct amdgpu_ras_block_object *obj;
	int r;

	/* Guest side doesn't need init ras feature */
	if (amdgpu_sriov_vf(adev))
		return 0;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+4 −0
Original line number Diff line number Diff line
@@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
		struct amdgpu_iv_entry *entry)
{
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

	if (amdgpu_sriov_vf(adev))
		return AMDGPU_RAS_SUCCESS;

	amdgpu_ras_reset_gpu(adev);

	return AMDGPU_RAS_SUCCESS;
+6 −3
Original line number Diff line number Diff line
@@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp)
		err = psp_init_sos_microcode(psp, chip_name);
		if (err)
			return err;
		/* It's not necessary to load ras ta on Guest side */
		if (!amdgpu_sriov_vf(adev)) {
			err = psp_init_ta_microcode(&adev->psp, chip_name);
			if (err)
				return err;
		}
		break;
	case IP_VERSION(13, 0, 1):
	case IP_VERSION(13, 0, 3):