Commit 02599bc7 authored by Andrey Grodzovsky
Browse files

drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.



No need to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Prevent queuing flr_work from mailbox irq if guest
is already executing a reset.

Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Liu Shaoyun <Shaoyun.Liu@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74114.html
parent 54f329cc
Loading
Loading
Loading
Loading
+6 −3
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
	if (amdgpu_device_should_recover_gpu(adev)
		&& (!amdgpu_device_has_job_running(adev) ||
		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
		amdgpu_device_gpu_recover(adev, NULL);
		amdgpu_device_gpu_recover_imp(adev, NULL);
}

static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,

	switch (event) {
		case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev))
			schedule_work(&adev->virt.flr_work);
		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
			WARN_ONCE(!queue_work(adev->reset_domain.wq,
					      &adev->virt.flr_work),
				  "Failed to queue work! at %s",
				  __func__);
		break;
		case IDH_QUERY_ALIVE:
			xgpu_ai_mailbox_send_ack(adev);
+6 −3
Original line number Diff line number Diff line
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
		adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
		amdgpu_device_gpu_recover(adev, NULL);
		amdgpu_device_gpu_recover_imp(adev, NULL);
}

static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,

	switch (event) {
	case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev))
			schedule_work(&adev->virt.flr_work);
		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
			WARN_ONCE(!queue_work(adev->reset_domain.wq,
					      &adev->virt.flr_work),
				  "Failed to queue work! at %s",
				  __func__);
		break;
		/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
		 * it byfar since that polling thread will handle it,
+6 −3
Original line number Diff line number Diff line
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)

	/* Trigger recovery due to world switch failure */
	if (amdgpu_device_should_recover_gpu(adev))
		amdgpu_device_gpu_recover(adev, NULL);
		amdgpu_device_gpu_recover_imp(adev, NULL);
}

static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
		r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);

		/* only handle FLR_NOTIFY now */
		if (!r)
			schedule_work(&adev->virt.flr_work);
		if (!r && !amdgpu_in_reset(adev))
			WARN_ONCE(!queue_work(adev->reset_domain.wq,
					      &adev->virt.flr_work),
				  "Failed to queue work! at %s",
				  __func__);
	}

	return 0;