Commit d0fb18b5 authored by Andrey Grodzovsky
Browse files

drm/amdgpu: Move reset sem into reset_domain



We want a single instance of the reset sem across all
reset clients because, in the case of XGMI, we should stop
cross-device MMIO access, since any of the devices could be
in reset at that moment.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74117.html
parent cfbb6b00
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -1058,7 +1058,6 @@ struct amdgpu_device {

	atomic_t 			in_gpu_reset;
	enum pp_mp1_state               mp1_state;
	struct rw_semaphore reset_sem;
	struct amdgpu_doorbell_index doorbell_index;

	struct mutex			notifier_lock;
+6 −4
Original line number Diff line number Diff line
@@ -37,6 +37,8 @@
#include "amdgpu_fw_attestation.h"
#include "amdgpu_umr.h"

#include "amdgpu_reset.h"

#if defined(CONFIG_DEBUG_FS)

/**
@@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
	}

	/* Avoid accidently unparking the sched thread during GPU reset */
	r = down_write_killable(&adev->reset_sem);
	r = down_write_killable(&adev->reset_domain->sem);
	if (r)
		return r;

@@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
		kthread_unpark(ring->sched.thread);
	}

	up_write(&adev->reset_sem);
	up_write(&adev->reset_domain->sem);

	pm_runtime_mark_last_busy(dev->dev);
	pm_runtime_put_autosuspend(dev->dev);
@@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
		return -ENOMEM;

	/* Avoid accidently unparking the sched thread during GPU reset */
	r = down_read_killable(&adev->reset_sem);
	r = down_read_killable(&adev->reset_domain->sem);
	if (r)
		goto pro_end;

@@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
	/* restart the scheduler */
	kthread_unpark(ring->sched.thread);

	up_read(&adev->reset_sem);
	up_read(&adev->reset_domain->sem);

	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

+11 −12
Original line number Diff line number Diff line
@@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_sem);
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
@@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
@@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
@@ -3555,7 +3555,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	atomic_set(&adev->in_gpu_reset, 0);
	init_rwsem(&adev->reset_sem);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);

@@ -4833,9 +4832,9 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
	atomic_set(&adev->in_gpu_reset, 1);

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
		down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
		down_write(&adev->reset_domain->sem);
	}

	switch (amdgpu_asic_reset_method(adev)) {
@@ -4856,7 +4855,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
	up_write(&adev->reset_domain->sem);
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -5476,7 +5475,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_sem will prevent any external access
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_adev(adev, NULL);
+10 −8
Original line number Diff line number Diff line
@@ -31,6 +31,8 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>

#include "amdgpu_reset.h"

#define EEPROM_I2C_MADDR_VEGA20         0x0
#define EEPROM_I2C_MADDR_ARCTURUS       0x40000
#define EEPROM_I2C_MADDR_ARCTURUS_D342  0x0
@@ -193,12 +195,12 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
	__encode_table_header_to_buf(&control->tbl_hdr, buf);

	/* i2c may be unstable in gpu reset */
	down_read(&adev->reset_sem);
	down_read(&adev->reset_domain->sem);
	res = amdgpu_eeprom_write(&adev->pm.smu_i2c,
				  control->i2c_address +
				  control->ras_header_offset,
				  buf, RAS_TABLE_HEADER_SIZE);
	up_read(&adev->reset_sem);
	up_read(&adev->reset_domain->sem);

	if (res < 0) {
		DRM_ERROR("Failed to write EEPROM table header:%d", res);
@@ -387,13 +389,13 @@ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
	int res;

	/* i2c may be unstable in gpu reset */
	down_read(&adev->reset_sem);
	down_read(&adev->reset_domain->sem);
	buf_size = num * RAS_TABLE_RECORD_SIZE;
	res = amdgpu_eeprom_write(&adev->pm.smu_i2c,
				  control->i2c_address +
				  RAS_INDEX_TO_OFFSET(control, fri),
				  buf, buf_size);
	up_read(&adev->reset_sem);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("Writing %d EEPROM table records error:%d",
			  num, res);
@@ -547,12 +549,12 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
		goto Out;
	}

	down_read(&adev->reset_sem);
	down_read(&adev->reset_domain->sem);
	res = amdgpu_eeprom_read(&adev->pm.smu_i2c,
				 control->i2c_address +
				 control->ras_record_offset,
				 buf, buf_size);
	up_read(&adev->reset_sem);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("EEPROM failed reading records:%d\n",
			  res);
@@ -642,13 +644,13 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
	int res;

	/* i2c may be unstable in gpu reset */
	down_read(&adev->reset_sem);
	down_read(&adev->reset_domain->sem);
	buf_size = num * RAS_TABLE_RECORD_SIZE;
	res = amdgpu_eeprom_read(&adev->pm.smu_i2c,
				 control->i2c_address +
				 RAS_INDEX_TO_OFFSET(control, fri),
				 buf, buf_size);
	up_read(&adev->reset_sem);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		DRM_ERROR("Reading %d EEPROM table records error:%d",
			  num, res);
+2 −0
Original line number Diff line number Diff line
@@ -131,6 +131,8 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d

	}

	init_rwsem(&reset_domain->sem);

	return reset_domain;
}

Loading