Commit f287a3c5 authored by Andrey Grodzovsky

drm/amdgpu: Drop concurrent GPU reset protection for device

Now that all GPU resets are serialized, there is no need for this.

This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74119.html
parent 681260df
+7 −82
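Context for the "serialized" premise (not stated in this commit, but established earlier in the same series): reset sources no longer call into recovery directly; they queue work into a single reset domain that executes one recovery at a time. The sketch below is a minimal userspace analogy of that scheme, not the kernel code; reset_work, queue_reset_work and reset_worker are hypothetical names, and pthreads stand in for the kernel's ordered workqueue.

#include <pthread.h>
#include <stdio.h>

/* One queued reset request (hypothetical stand-in). */
struct reset_work {
	int src;			/* which ring/job asked for the reset */
	struct reset_work *next;
};

static struct reset_work *head;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cond = PTHREAD_COND_INITIALIZER;

/* Producers (job timeouts, RAS errors, hotplug) only enqueue work. */
static void queue_reset_work(struct reset_work *w)
{
	pthread_mutex_lock(&q_lock);
	w->next = head;
	head = w;
	pthread_cond_signal(&q_cond);
	pthread_mutex_unlock(&q_lock);
}

/* A single worker drains the queue, so resets run strictly one after
 * another; that ordering is the property the patch relies on. */
static void *reset_worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct reset_work *w;

		pthread_mutex_lock(&q_lock);
		while (!head)
			pthread_cond_wait(&q_cond, &q_lock);
		w = head;
		head = w->next;
		pthread_mutex_unlock(&q_lock);

		printf("recovering GPU, reset source %d\n", w->src);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	static struct reset_work w1 = { .src = 1 }, w2 = { .src = 2 };

	pthread_create(&t, NULL, reset_worker, NULL);
	queue_reset_work(&w1);	/* e.g. two rings timing out at once */
	queue_reset_work(&w2);
	pthread_join(t, NULL);	/* worker loops forever; Ctrl-C to stop */
	return 0;
}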
@@ -4817,11 +4817,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
 				struct amdgpu_hive_info *hive)
 {
-	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
-		return false;
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	if (hive) {
 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
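This first hunk is the heart of the change. The old amdgpu_device_lock_adev() used atomic_cmpxchg() as a trylock on adev->in_gpu_reset and returned false when another reset already owned the flag; since resets are now serialized there can be no competitor, so the flag is set unconditionally and the bool return becomes void. A userspace illustration of the two idioms, assuming C11 <stdatomic.h> (the kernel's atomic_t API is spelled differently, but the semantics shown match):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int in_gpu_reset;

/* Old idiom: a trylock. Fails when a concurrent reset has already
 * claimed the flag, forcing the caller to bail out of recovery. */
static bool lock_adev_trylock(void)
{
	int expected = 0;

	return atomic_compare_exchange_strong(&in_gpu_reset, &expected, 1);
}

/* New idiom: callers are serialized externally, so no competitor can
 * exist; the store cannot fail, hence the bool to void change. */
static void lock_adev_serialized(void)
{
	atomic_store(&in_gpu_reset, 1);
}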
@@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		adev->mp1_state = PP_MP1_STATE_NONE;
 		break;
 	}
-
-	return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	up_write(&adev->reset_sem);
 }
 
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
-	struct amdgpu_device *tmp_adev = NULL;
-
-	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
-		if (!hive) {
-			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
-			return -ENODEV;
-		}
-		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			if (!amdgpu_device_lock_adev(tmp_adev, hive))
-				goto roll_back;
-		}
-	} else if (!amdgpu_device_lock_adev(adev, hive))
-		return -EAGAIN;
-
-	return 0;
-roll_back:
-	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
-		/*
-		 * if the lockup iteration break in the middle of a hive,
-		 * it may means there may has a race issue,
-		 * or a hive device locked up independently.
-		 * we may be in trouble and may not, so will try to roll back
-		 * the lock and give out a warnning.
-		 */
-		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
-		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			amdgpu_device_unlock_adev(tmp_adev);
-		}
-	}
-	return -EAGAIN;
-}
-
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
 	struct pci_dev *p = NULL;
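For readers skimming the deleted helper above: amdgpu_device_lock_hive_adev() implemented lock-all-or-roll-back across every device in an XGMI hive, because any single per-device trylock could fail mid-iteration. A generic userspace rendering of that shape (four pthread mutexes standing in for the per-device locks, lock_hive_or_rollback a hypothetical name) makes it clear why the whole rollback path becomes dead code once only one recovery can run at a time:

#include <pthread.h>
#include <stdbool.h>

#define NODES 4

static pthread_mutex_t node_lock[NODES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Shape of the deleted helper: try to lock every node in the hive,
 * and on the first failure unwind whatever was already taken. With
 * resets serialized, no trylock can ever fail, so both the trylock
 * and the unwind are unnecessary. */
static bool lock_hive_or_rollback(void)
{
	int i;

	for (i = 0; i < NODES; i++) {
		if (pthread_mutex_trylock(&node_lock[i]) != 0) {
			while (--i >= 0)
				pthread_mutex_unlock(&node_lock[i]);
			return false;
		}
	}
	return true;
}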
@@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	reset_context.hive = hive;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
-	/*
-	 * lock the device before we try to operate the linked list
-	 * if didn't get the device lock, don't touch the linked list since
-	 * others may iterating it.
-	 */
-	r = amdgpu_device_lock_hive_adev(adev, hive);
-	if (r) {
-		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-					job ? job->base.id : -1);
-
-		/* even we skipped this reset, still need to set the job to guilty */
-		if (job && job->vm)
-			drm_sched_increase_karma(&job->base);
-		goto skip_recovery;
-	}
-
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+		amdgpu_device_lock_adev(tmp_adev, hive);
+
 		/*
 		 * Try to put the audio codec into suspend state
 		 * before gpu reset started.
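Note where the replacement lock lands: inside the loop over the reset list, so each device is locked just as it is processed, with the matching unlock in a later pass (visible further down the diff). Schematically, reusing the mutex-array analogy from above (recover_all_nodes is a hypothetical name); taking the locks one by one is safe only because recoveries themselves no longer overlap:

#include <pthread.h>

#define NODES 4

static pthread_mutex_t node_lock[NODES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* New shape of the recovery loop: lock each node as it is reached,
 * do the per-node prep/reset work, and unlock in a later pass
 * (mirroring amdgpu_device_unlock_adev further down the diff). */
static void recover_all_nodes(void)
{
	int i;

	for (i = 0; i < NODES; i++) {
		pthread_mutex_lock(&node_lock[i]);
		/* suspend audio, block schedulers, reset node i ... */
	}

	for (i = 0; i < NODES; i++)
		pthread_mutex_unlock(&node_lock[i]);
}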
@@ -5264,13 +5208,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-skip_recovery:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);
 	}
 
-	if (r && r != -EAGAIN)
+	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
@@ -5493,20 +5436,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 	return 0;
 }
 
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
-	int i;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		cancel_delayed_work_sync(&ring->sched.work_tdr);
-	}
-}
-
 /**
  * amdgpu_pci_error_detected - Called when a PCI error is detected.
  * @pdev: PCI device struct
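The helper deleted above was the other half of the old trylock scheme: when amdgpu_pci_error_detected() lost the race for adev->in_gpu_reset, it flushed every ring's timeout (TDR) work via cancel_delayed_work_sync() and retried, guaranteeing forward progress instead of spinning uselessly. The final hunk below collapses that loop into a single lock call. Schematically, with the same C11 atomics as earlier (pci_error_lock_old/new are hypothetical names):

#include <stdatomic.h>

static atomic_int in_gpu_reset;

/* Old PCI-error path: spin until the reset flag is won, flushing the
 * timeout work that holds it on every failed attempt. */
static void pci_error_lock_old(void)
{
	int expected = 0;

	while (!atomic_compare_exchange_strong(&in_gpu_reset, &expected, 1)) {
		expected = 0;	/* the failed exchange rewrote 'expected' */
		/* here the kernel code called amdgpu_cancel_all_tdr() */
	}
}

/* New path: resets are serialized, so the flag is simply taken. */
static void pci_error_lock_new(void)
{
	atomic_store(&in_gpu_reset, 1);
}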
@@ -5537,14 +5466,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	/* Fatal error, prepare for slot reset */
 	case pci_channel_io_frozen:
 		/*
-		 * Cancel and wait for all TDRs in progress if failing to
-		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
-		 *
 		 * Locking adev->reset_sem will prevent any external access
 		 * to GPU during PCI error recovery
 		 */
-		while (!amdgpu_device_lock_adev(adev, NULL))
-			amdgpu_cancel_all_tdr(adev);
+		amdgpu_device_lock_adev(adev, NULL);
 
 		/*
 		 * Block any work scheduling as we do for regular GPU reset