Commit 0a178750 authored by Dave Airlie
Browse files

Merge tag 'amd-drm-fixes-5.19-2022-06-08' of https://gitlab.freedesktop.org/agd5f/linux into drm-fixes

amd-drm-fixes-5.19-2022-06-08:

amdgpu:
- DCN 3.1 golden settings fix
- eDP fixes
- DMCUB fixes
- GFX11 fixes and cleanups
- VCN fix for yellow carp
- GMC11 fixes
- RAS fixes
- GPUVM TLB flush fixes
- SMU13 fixes
- VCN3 AV1 regression fix
- VCN2 JPEG fix
- Other misc fixes

amdkfd:
- MMU notifier fix
- Support for more GC 10.3.x families
- Pinned BO handling fix
- Partial migration bug fix

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220608203008.6187-1-alexander.deucher@amd.com
parents f2906aa8 431d0712
Loading
Loading
Loading
Loading
+6 −7
Original line number Diff line number Diff line
@@ -1918,9 +1918,6 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
		return -EINVAL;
	}

	/* delete kgd_mem from kfd_bo_list to avoid re-validating
	 * this BO in BO's restoring after eviction.
	 */
	mutex_lock(&mem->process_info->lock);

	ret = amdgpu_bo_reserve(bo, true);
@@ -1943,7 +1940,6 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,

	amdgpu_amdkfd_remove_eviction_fence(
		bo, mem->process_info->eviction_fence);
	list_del_init(&mem->validate_list.head);

	if (size)
		*size = amdgpu_bo_size(bo);
@@ -2512,12 +2508,15 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
	process_info->eviction_fence = new_fence;
	*ef = dma_fence_get(&new_fence->base);

	/* Attach new eviction fence to all BOs */
	/* Attach new eviction fence to all BOs except pinned ones */
	list_for_each_entry(mem, &process_info->kfd_bo_list,
		validate_list.head)
		validate_list.head) {
		if (mem->bo->tbo.pin_count)
			continue;

		amdgpu_bo_fence(mem->bo,
			&process_info->eviction_fence->base, true);

	}
	/* Attach eviction fence to PD / PT BOs */
	list_for_each_entry(peer_vm, &process_info->vm_list_head,
			    vm_list_node) {
+6 −3
Original line number Diff line number Diff line
@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;
	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		if (!amdgpu_persistent_edc_harvesting_supported(adev))
			amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);

		r = amdgpu_ras_block_late_init(adev, ras_block);
		if (r)
			return r;

		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
		if (r)
			goto late_fini;
	} else {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
	}

	return 0;
+2 −2
Original line number Diff line number Diff line
@@ -518,6 +518,8 @@ void amdgpu_gmc_tmz_set(struct amdgpu_device *adev)
	case IP_VERSION(9, 1, 0):
	/* RENOIR looks like RAVEN */
	case IP_VERSION(9, 3, 0):
	/* GC 10.3.7 */
	case IP_VERSION(10, 3, 7):
		if (amdgpu_tmz == 0) {
			adev->gmc.tmz_enabled = false;
			dev_info(adev->dev,
@@ -540,8 +542,6 @@ void amdgpu_gmc_tmz_set(struct amdgpu_device *adev)
	case IP_VERSION(10, 3, 1):
	/* YELLOW_CARP*/
	case IP_VERSION(10, 3, 3):
	/* GC 10.3.7 */
	case IP_VERSION(10, 3, 7):
		/* Don't enable it by default yet.
		 */
		if (amdgpu_tmz < 1) {
+25 −7
Original line number Diff line number Diff line
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			DRM_WARN("Failed to reset error counter and error status");
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
		}
	}

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		amdgpu_ras_reset_error_status(adev, info->head.block);

	return 0;
}

@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		if (res)
			return res;

		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}

		ce += info.ce_count;
		ue += info.ue_count;
	}
@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
			continue;

		amdgpu_ras_query_error_status(adev, &info);

		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
			if (amdgpu_ras_reset_error_status(adev, info.head.block))
				dev_warn(adev->dev, "Failed to reset error counter and error status");
		}
	}
}

@@ -2278,8 +2295,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
	    !amdgpu_ras_asic_supported(adev))
		return;

	if (!(amdgpu_sriov_vf(adev) &&
		(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
	/* If driver run on sriov guest side, only enable ras for aldebaran */
	if (amdgpu_sriov_vf(adev) &&
		adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
		return;

	if (!adev->gmc.xgmi.connected_to_cpu) {
+11 −2
Original line number Diff line number Diff line
@@ -679,6 +679,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
{
	struct amdgpu_vm_update_params params;
	struct amdgpu_vm_bo_base *entry;
	bool flush_tlb_needed = false;
	int r, idx;

	if (list_empty(&vm->relocated))
@@ -697,6 +698,9 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
		goto error;

	list_for_each_entry(entry, &vm->relocated, vm_status) {
		/* vm_flush_needed after updating moved PDEs */
		flush_tlb_needed |= entry->moved;

		r = amdgpu_vm_pde_update(&params, entry);
		if (r)
			goto error;
@@ -706,7 +710,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
	if (r)
		goto error;

	/* vm_flush_needed after updating PDEs */
	if (flush_tlb_needed)
		atomic64_inc(&vm->tlb_seq);

	while (!list_empty(&vm->relocated)) {
@@ -789,6 +793,11 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
	flush_tlb |= adev->gmc.xgmi.num_physical_nodes &&
		     adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 0);

	/*
	 * On GFX8 and older any 8 PTE block with a valid bit set enters the TLB
	 */
	flush_tlb |= adev->ip_versions[GC_HWIP][0] < IP_VERSION(9, 0, 0);

	memset(&params, 0, sizeof(params));
	params.adev = adev;
	params.vm = vm;
Loading