Commit f287a3c5 authored by Andrey Grodzovsky

drm/amdgpu: Drop concurrent GPU reset protection for device

Now that all GPU resets are serialized, there is no need for this.

This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74119.html
parent 681260df
+7 −82
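Context for the "serialized" premise (not stated in this commit, but established earlier in the same series): reset sources no longer call into recovery directly; they queue work into a single reset domain that executes one recovery at a time. The sketch below is a minimal userspace analogy of that scheme, not the kernel code; reset_work, queue_reset_work and reset_worker are hypothetical names, and pthreads stand in for the kernel's ordered workqueue.

#include <pthread.h>
#include <stdio.h>

/* One queued reset request (hypothetical stand-in). */
struct reset_work {
	int src;			/* which ring/job asked for the reset */
	struct reset_work *next;
};

static struct reset_work *head;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cond = PTHREAD_COND_INITIALIZER;

/* Producers (job timeouts, RAS errors, hotplug) only enqueue work. */
static void queue_reset_work(struct reset_work *w)
{
	pthread_mutex_lock(&q_lock);
	w->next = head;
	head = w;
	pthread_cond_signal(&q_cond);
	pthread_mutex_unlock(&q_lock);
}

/* A single worker drains the queue, so resets run strictly one after
 * another; that ordering is the property the patch relies on. */
static void *reset_worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct reset_work *w;

		pthread_mutex_lock(&q_lock);
		while (!head)
			pthread_cond_wait(&q_cond, &q_lock);
		w = head;
		head = w->next;
		pthread_mutex_unlock(&q_lock);

		printf("recovering GPU, reset source %d\n", w->src);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	static struct reset_work w1 = { .src = 1 }, w2 = { .src = 2 };

	pthread_create(&t, NULL, reset_worker, NULL);
	queue_reset_work(&w1);	/* e.g. two rings timing out at once */
	queue_reset_work(&w2);
	pthread_join(t, NULL);	/* worker loops forever; Ctrl-C to stop */
	return 0;
}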
@@ -4817,11 +4817,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
 				struct amdgpu_hive_info *hive)
 {
-	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
-		return false;
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	if (hive) {
 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
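This first hunk is the heart of the change. The old amdgpu_device_lock_adev() used atomic_cmpxchg() as a trylock on adev->in_gpu_reset and returned false when another reset already owned the flag; since resets are now serialized there can be no competitor, so the flag is set unconditionally and the bool return becomes void. A userspace illustration of the two idioms, assuming C11 <stdatomic.h> (the kernel's atomic_t API is spelled differently, but the semantics shown match):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int in_gpu_reset;

/* Old idiom: a trylock. Fails when a concurrent reset has already
 * claimed the flag, forcing the caller to bail out of recovery. */
static bool lock_adev_trylock(void)
{
	int expected = 0;

	return atomic_compare_exchange_strong(&in_gpu_reset, &expected, 1);
}

/* New idiom: callers are serialized externally, so no competitor can
 * exist; the store cannot fail, hence the bool to void change. */
static void lock_adev_serialized(void)
{
	atomic_store(&in_gpu_reset, 1);
}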
@@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		adev->mp1_state = PP_MP1_STATE_NONE;
 		break;
 	}
-
-	return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	up_write(&adev->reset_sem);
 }
 
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
-	struct amdgpu_device *tmp_adev = NULL;
-
-	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
-		if (!hive) {
-			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
-			return -ENODEV;
-		}
-		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			if (!amdgpu_device_lock_adev(tmp_adev, hive))
-				goto roll_back;
-		}
-	} else if (!amdgpu_device_lock_adev(adev, hive))
-		return -EAGAIN;
-
-	return 0;
-roll_back:
-	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
-		/*
-		 * if the lockup iteration break in the middle of a hive,
-		 * it may means there may has a race issue,
-		 * or a hive device locked up independently.
-		 * we may be in trouble and may not, so will try to roll back
-		 * the lock and give out a warnning.
-		 */
-		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
-		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			amdgpu_device_unlock_adev(tmp_adev);
-		}
-	}
-	return -EAGAIN;
-}
-
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
 	struct pci_dev *p = NULL;
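For readers skimming the deleted helper above: amdgpu_device_lock_hive_adev() implemented lock-all-or-roll-back across every device in an XGMI hive, because any single per-device trylock could fail mid-iteration. A generic userspace rendering of that shape (four pthread mutexes standing in for the per-device locks, lock_hive_or_rollback a hypothetical name) makes it clear why the whole rollback path becomes dead code once only one recovery can run at a time:

#include <pthread.h>
#include <stdbool.h>

#define NODES 4

static pthread_mutex_t node_lock[NODES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Shape of the deleted helper: try to lock every node in the hive,
 * and on the first failure unwind whatever was already taken. With
 * resets serialized, no trylock can ever fail, so both the trylock
 * and the unwind are unnecessary. */
static bool lock_hive_or_rollback(void)
{
	int i;

	for (i = 0; i < NODES; i++) {
		if (pthread_mutex_trylock(&node_lock[i]) != 0) {
			while (--i >= 0)
				pthread_mutex_unlock(&node_lock[i]);
			return false;
		}
	}
	return true;
}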
@@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	reset_context.hive = hive;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
-	/*
-	 * lock the device before we try to operate the linked list
-	 * if didn't get the device lock, don't touch the linked list since
-	 * others may iterating it.
-	 */
-	r = amdgpu_device_lock_hive_adev(adev, hive);
-	if (r) {
-		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-					job ? job->base.id : -1);
-
-		/* even we skipped this reset, still need to set the job to guilty */
-		if (job && job->vm)
-			drm_sched_increase_karma(&job->base);
-		goto skip_recovery;
-	}
-
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+		amdgpu_device_lock_adev(tmp_adev, hive);
+
 		/*
 		 * Try to put the audio codec into suspend state
 		 * before gpu reset started.
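Note where the replacement lock lands: inside the loop over the reset list, so each device is locked just as it is processed, with the matching unlock in a later pass (visible further down the diff). Schematically, reusing the mutex-array analogy from above (recover_all_nodes is a hypothetical name); taking the locks one by one is safe only because recoveries themselves no longer overlap:

#include <pthread.h>

#define NODES 4

static pthread_mutex_t node_lock[NODES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* New shape of the recovery loop: lock each node as it is reached,
 * do the per-node prep/reset work, and unlock in a later pass
 * (mirroring amdgpu_device_unlock_adev further down the diff). */
static void recover_all_nodes(void)
{
	int i;

	for (i = 0; i < NODES; i++) {
		pthread_mutex_lock(&node_lock[i]);
		/* suspend audio, block schedulers, reset node i ... */
	}

	for (i = 0; i < NODES; i++)
		pthread_mutex_unlock(&node_lock[i]);
}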
@@ -5264,13 +5208,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-skip_recovery:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);
 	}
 
-	if (r && r != -EAGAIN)
+	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
@@ -5493,20 +5436,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 	return 0;
 }
 
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
-	int i;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		cancel_delayed_work_sync(&ring->sched.work_tdr);
-	}
-}
-
 /**
  * amdgpu_pci_error_detected - Called when a PCI error is detected.
  * @pdev: PCI device struct
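The helper deleted above was the other half of the old trylock scheme: when amdgpu_pci_error_detected() lost the race for adev->in_gpu_reset, it flushed every ring's timeout (TDR) work via cancel_delayed_work_sync() and retried, guaranteeing forward progress instead of spinning uselessly. The final hunk below collapses that loop into a single lock call. Schematically, with the same C11 atomics as earlier (pci_error_lock_old/new are hypothetical names):

#include <stdatomic.h>

static atomic_int in_gpu_reset;

/* Old PCI-error path: spin until the reset flag is won, flushing the
 * timeout work that holds it on every failed attempt. */
static void pci_error_lock_old(void)
{
	int expected = 0;

	while (!atomic_compare_exchange_strong(&in_gpu_reset, &expected, 1)) {
		expected = 0;	/* the failed exchange rewrote 'expected' */
		/* here the kernel code called amdgpu_cancel_all_tdr() */
	}
}

/* New path: resets are serialized, so the flag is simply taken. */
static void pci_error_lock_new(void)
{
	atomic_store(&in_gpu_reset, 1);
}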
@@ -5537,14 +5466,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	/* Fatal error, prepare for slot reset */
 	case pci_channel_io_frozen:
 		/*
-		 * Cancel and wait for all TDRs in progress if failing to
-		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
-		 *
 		 * Locking adev->reset_sem will prevent any external access
 		 * to GPU during PCI error recovery
 		 */
-		while (!amdgpu_device_lock_adev(adev, NULL))
-			amdgpu_cancel_all_tdr(adev);
+		amdgpu_device_lock_adev(adev, NULL);
 
 		/*
 		 * Block any work scheduling as we do for regular GPU reset