Merge tag 'amd-drm-fixes-6.0-2022-08-17' of... (b1fb6b87) · Commits · EulixOS / Software / Kernel

drivers/gpu/drm/amd/amdgpu/aldebaran.c

+14 −31

Original line number	Diff line number	Diff line
		@@ -148,30 +148,22 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
		struct amdgpu_reset_context *reset_context)
		{
		struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
		struct list_head *reset_device_list = reset_context->reset_device_list;
		struct amdgpu_device *tmp_adev = NULL;
		struct list_head reset_device_list;
		int r = 0;

		dev_dbg(adev->dev, "aldebaran perform hw reset\n");

		if (reset_device_list == NULL)
		return -EINVAL;

		if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
		reset_context->hive == NULL) {
		/* Wrong context, return error */
		return -EINVAL;
		}

		INIT_LIST_HEAD(&reset_device_list);
		if (reset_context->hive) {
		list_for_each_entry (tmp_adev,
		&reset_context->hive->device_list,
		gmc.xgmi.head)
		list_add_tail(&tmp_adev->reset_list,
		&reset_device_list);
		} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
		&reset_device_list);
		}

		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
		}
		@@ -179,7 +171,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
		* Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
		* them together so that they can be completed asynchronously on multiple nodes
		*/
		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		/* For XGMI run all resets in parallel to speed up the process */
		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!queue_work(system_unbound_wq,
		@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
		flush_work(&tmp_adev->reset_cntl->reset_work);
		r = tmp_adev->asic_reset_res;
		@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
		}
		}

		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
		}
		@@ -339,10 +331,13 @@ static int
		aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
		struct amdgpu_reset_context *reset_context)
		{
		struct list_head *reset_device_list = reset_context->reset_device_list;
		struct amdgpu_device *tmp_adev = NULL;
		struct list_head reset_device_list;
		int r;

		if (reset_device_list == NULL)
		return -EINVAL;

		if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
		IP_VERSION(13, 0, 2) &&
		reset_context->hive == NULL) {
		@@ -350,19 +345,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
		return -EINVAL;
		}

		INIT_LIST_HEAD(&reset_device_list);
		if (reset_context->hive) {
		list_for_each_entry (tmp_adev,
		&reset_context->hive->device_list,
		gmc.xgmi.head)
		list_add_tail(&tmp_adev->reset_list,
		&reset_device_list);
		} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
		&reset_device_list);
		}

		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		dev_info(tmp_adev->dev,
		"GPU reset succeeded, trying to resume\n");
		r = aldebaran_mode2_restore_ip(tmp_adev);

drivers/gpu/drm/amd/amdgpu/amdgpu.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -317,7 +317,7 @@ enum amdgpu_kiq_irq {
		AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
		AMDGPU_CP_KIQ_IRQ_LAST
		};

		#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */
		#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */
		#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
		#define MAX_KIQ_REG_TRY 1000

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence {
		struct amdgpu_kfd_dev {
		struct kfd_dev *dev;
		uint64_t vram_used;
		uint64_t vram_used_aligned;
		bool init_complete;
		struct work_struct reset_work;
		};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

+12 −9

Original line number	Diff line number	Diff line
		@@ -40,10 +40,10 @@
		#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1

		/*
		* Align VRAM allocations to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
		* Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
		* BO chunk
		*/
		#define VRAM_ALLOCATION_ALIGN (1 << 21)
		#define VRAM_AVAILABLITY_ALIGN (1 << 21)

		/* Impose limit on how much memory KFD can use */
		static struct {
		@@ -149,7 +149,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
		* to avoid fragmentation caused by 4K allocations in the tail
		* 2M BO chunk.
		*/
		vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
		vram_needed = size;
		} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		system_mem_needed = size;
		} else if (!(alloc_flag &
		@@ -182,8 +182,10 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
		*/
		WARN_ONCE(vram_needed && !adev,
		"adev reference can't be null when vram is used");
		if (adev)
		if (adev) {
		adev->kfd.vram_used += vram_needed;
		adev->kfd.vram_used_aligned += ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
		}
		kfd_mem_limit.system_mem_used += system_mem_needed;
		kfd_mem_limit.ttm_mem_used += ttm_mem_needed;

		@@ -203,8 +205,10 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
		} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
		WARN_ONCE(!adev,
		"adev reference can't be null when alloc mem flags vram is set");
		if (adev)
		adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
		if (adev) {
		adev->kfd.vram_used -= size;
		adev->kfd.vram_used_aligned -= ALIGN(size, VRAM_AVAILABLITY_ALIGN);
		}
		} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		kfd_mem_limit.system_mem_used -= size;
		} else if (!(alloc_flag &
		@@ -1608,15 +1612,14 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
		uint64_t reserved_for_pt =
		ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
		size_t available;

		spin_lock(&kfd_mem_limit.mem_limit_lock);
		available = adev->gmc.real_vram_size
		- adev->kfd.vram_used
		- adev->kfd.vram_used_aligned
		- atomic64_read(&adev->vram_pin_size)
		- reserved_for_pt;
		spin_unlock(&kfd_mem_limit.mem_limit_lock);

		return ALIGN_DOWN(available, VRAM_ALLOCATION_ALIGN);
		return ALIGN_DOWN(available, VRAM_AVAILABLITY_ALIGN);
		}

		int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(

drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -314,7 +314,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev,
		mem_channel_number = vram_info->v30.channel_num;
		mem_channel_width = vram_info->v30.channel_width;
		if (vram_width)
		*vram_width = mem_channel_number * mem_channel_width;
		*vram_width = mem_channel_number * (1 << mem_channel_width);
		break;
		default:
		return -EINVAL;