Commit dac6b808 authored by Victor Zhao, committed by Alex Deucher
Browse files

drm/amdgpu: let mode2 reset fall back to default on failure



- introduce AMDGPU_SKIP_MODE2_RESET flag
- let mode2 reset fall back to the default reset method if it fails

v2: move this part out of the ASIC-specific code

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 672c0218
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
+6 −1
Original line number Diff line number Diff line
@@ -5148,6 +5148,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

	reset_context->job = job;
	reset_context->hive = hive;

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
@@ -5267,9 +5268,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
		if (r && r == -EAGAIN) {
			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
			adev->asic_reset_res = 0;
			goto retry;
		}
	}

skip_hw_reset:

@@ -5699,6 +5703,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
+1 −0
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
+1 −0
Original line number Diff line number Diff line
@@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
	}
+6 −0
Original line number Diff line number Diff line
@@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
{
	struct amdgpu_reset_handler *reset_handler = NULL;

	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
		return -ENOSYS;

	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
		reset_handler = adev->reset_cntl->get_reset_handler(
			adev->reset_cntl, reset_context);
@@ -90,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
	int ret;
	struct amdgpu_reset_handler *reset_handler = NULL;

	if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
		return -ENOSYS;

	if (adev->reset_cntl)
		reset_handler = adev->reset_cntl->get_reset_handler(
			adev->reset_cntl, reset_context);
Loading