Commit b1fb6b87 authored by Dave Airlie
Browse files

Merge tag 'amd-drm-fixes-6.0-2022-08-17' of...

Merge tag 'amd-drm-fixes-6.0-2022-08-17' of https://gitlab.freedesktop.org/agd5f/linux into drm-fixes

amd-drm-fixes-6.0-2022-08-17:

amdgpu:
- Revert some DML stack changes
- Rounding fixes in KFD allocations
- atombios vram info table parsing fix
- DCN 3.1.4 fixes
- Clockgating fixes for various new IPs
- SMU 13.0.4 fixes
- DCN 3.1.4 FP fixes
- TMDS fixes for YCbCr420 4k modes
- DCN 3.2.x fixes
- USB 4 fixes
- SMU 13.0 fixes
- SMU driver unload memory leak fixes
- Display orientation fix
- Regression fix for generic fbdev conversion
- SDMA 6.x fixes
- SR-IOV fixes
- IH 6.x fixes
- Use after free fix in bo list handling
- Revert pipe1 support
- XGMI hive reset fix

amdkfd:
- Fix potential crash in kfd_create_indirect_link_prop()

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220818025206.6463-1-alexander.deucher@amd.com
parents 2ae6ab9d 085292c3
Loading
Loading
Loading
Loading
+14 −31
Original line number Diff line number Diff line
@@ -148,30 +148,22 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
			      struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *tmp_adev = NULL;
	struct list_head reset_device_list;
	int r = 0;

	dev_dbg(adev->dev, "aldebaran perform hw reset\n");

	if (reset_device_list == NULL)
		return -EINVAL;

	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
	    reset_context->hive == NULL) {
		/* Wrong context, return error */
		return -EINVAL;
	}

	INIT_LIST_HEAD(&reset_device_list);
	if (reset_context->hive) {
		list_for_each_entry (tmp_adev,
				     &reset_context->hive->device_list,
				     gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list,
				      &reset_device_list);
	} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
			      &reset_device_list);
	}

	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
	}
@@ -179,7 +171,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
	 * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
	 * them together so that they can be completed asynchronously on multiple nodes
	 */
	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		/* For XGMI run all resets in parallel to speed up the process */
		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
			if (!queue_work(system_unbound_wq,
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,

	/* For XGMI wait for all resets to complete before proceed */
	if (!r) {
		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				flush_work(&tmp_adev->reset_cntl->reset_work);
				r = tmp_adev->asic_reset_res;
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
		}
	}

	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
	}
@@ -339,10 +331,13 @@ static int
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
				  struct amdgpu_reset_context *reset_context)
{
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *tmp_adev = NULL;
	struct list_head reset_device_list;
	int r;

	if (reset_device_list == NULL)
		return -EINVAL;

	if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
		    IP_VERSION(13, 0, 2) &&
	    reset_context->hive == NULL) {
@@ -350,19 +345,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
		return -EINVAL;
	}

	INIT_LIST_HEAD(&reset_device_list);
	if (reset_context->hive) {
		list_for_each_entry (tmp_adev,
				     &reset_context->hive->device_list,
				     gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list,
				      &reset_device_list);
	} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
			      &reset_device_list);
	}

	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		dev_info(tmp_adev->dev,
			 "GPU reset succeeded, trying to resume\n");
		r = aldebaran_mode2_restore_ip(tmp_adev);
+1 −1
Original line number Diff line number Diff line
@@ -317,7 +317,7 @@ enum amdgpu_kiq_irq {
	AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
	AMDGPU_CP_KIQ_IRQ_LAST
};

#define SRIOV_USEC_TIMEOUT  1200000 /* wait 12 * 100ms for SRIOV */
#define MAX_KIQ_REG_WAIT       5000 /* in usecs, 5ms */
#define MAX_KIQ_REG_BAILOUT_INTERVAL   5 /* in msecs, 5ms */
#define MAX_KIQ_REG_TRY 1000
+1 −0
Original line number Diff line number Diff line
@@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence {
/* Per-device KFD bookkeeping; reached from an amdgpu device as adev->kfd. */
struct amdgpu_kfd_dev {
	struct kfd_dev *dev;
	/* VRAM reserved through KFD, summed from the exact request sizes */
	uint64_t vram_used;
	/*
	 * The same reservations with each size rounded up to the 2MB
	 * VRAM_AVAILABLITY_ALIGN granule; this is the value subtracted when
	 * the available-memory figure is computed.
	 */
	uint64_t vram_used_aligned;
	bool init_complete;
	struct work_struct reset_work;
};
+12 −9
Original line number Diff line number Diff line
@@ -40,10 +40,10 @@
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1

/*
 * Align VRAM allocations to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
 * Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
 * BO chunk
 */
#define VRAM_ALLOCATION_ALIGN (1 << 21)
#define VRAM_AVAILABLITY_ALIGN (1 << 21)

/* Impose limit on how much memory KFD can use */
static struct {
@@ -149,7 +149,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
		 * to avoid fragmentation caused by 4K allocations in the tail
		 * 2M BO chunk.
		 */
		vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
		vram_needed = size;
	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		system_mem_needed = size;
	} else if (!(alloc_flag &
@@ -182,8 +182,10 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
	 */
	WARN_ONCE(vram_needed && !adev,
		  "adev reference can't be null when vram is used");
	if (adev)
	if (adev) {
		adev->kfd.vram_used += vram_needed;
		adev->kfd.vram_used_aligned += ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
	}
	kfd_mem_limit.system_mem_used += system_mem_needed;
	kfd_mem_limit.ttm_mem_used += ttm_mem_needed;

@@ -203,8 +205,10 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
		WARN_ONCE(!adev,
			  "adev reference can't be null when alloc mem flags vram is set");
		if (adev)
			adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
		if (adev) {
			adev->kfd.vram_used -= size;
			adev->kfd.vram_used_aligned -= ALIGN(size, VRAM_AVAILABLITY_ALIGN);
		}
	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		kfd_mem_limit.system_mem_used -= size;
	} else if (!(alloc_flag &
@@ -1608,15 +1612,14 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
	uint64_t reserved_for_pt =
		ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
	size_t available;

	spin_lock(&kfd_mem_limit.mem_limit_lock);
	available = adev->gmc.real_vram_size
		- adev->kfd.vram_used
		- adev->kfd.vram_used_aligned
		- atomic64_read(&adev->vram_pin_size)
		- reserved_for_pt;
	spin_unlock(&kfd_mem_limit.mem_limit_lock);

	return ALIGN_DOWN(available, VRAM_ALLOCATION_ALIGN);
	return ALIGN_DOWN(available, VRAM_AVAILABLITY_ALIGN);
}

int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
+1 −1
Original line number Diff line number Diff line
@@ -314,7 +314,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev,
					mem_channel_number = vram_info->v30.channel_num;
					mem_channel_width = vram_info->v30.channel_width;
					if (vram_width)
						*vram_width = mem_channel_number * mem_channel_width;
						*vram_width = mem_channel_number * (1 << mem_channel_width);
					break;
				default:
					return -EINVAL;
Loading