Commit 7be288cc authored by Jesse.zhang@amd.com, committed by Wen Zhiwei
Browse files

drm/amdkfd: pause autosuspend when creating pdd

stable inclusion
from stable-v6.6.69
commit 60b57dc761d3a82666c6b84baeb20756b83cf5ca
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IBNEPJ

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=60b57dc761d3a82666c6b84baeb20756b83cf5ca



--------------------------------

[ Upstream commit 438b39ac74e2a9dc0a5c9d653b7d8066877e86b1 ]

When using MES, creating a pdd requires talking to the GPU to
set up the relevant context. The code here forgot to wake up the GPU
in case it was suspended; this causes, for example, KVM to EFAULT for
a passthrough GPU. This issue can be masked if the GPU was woken up by
other things (e.g. opening the KMS node) first and has not yet gone back to sleep.

v4: do the allocation of proc_ctx_bo in a lazy fashion
when the first queue is created in a process (Felix)

Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Reviewed-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Wen Zhiwei <wenzhiwei@kylinos.cn>
parent e8aa1623
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -197,6 +197,21 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	if (dqm->is_hws_hang)
		return -EIO;

	if (!pdd->proc_ctx_cpu_ptr) {
		r = amdgpu_amdkfd_alloc_gtt_mem(adev,
				AMDGPU_MES_PROC_CTX_SIZE,
				&pdd->proc_ctx_bo,
				&pdd->proc_ctx_gpu_addr,
				&pdd->proc_ctx_cpu_ptr,
				false);
		if (r) {
			dev_err(adev->dev,
				"failed to allocate process context bo\n");
			return r;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
	queue_input.process_id = qpd->pqm->process->pasid;
	queue_input.page_table_base_addr =  qpd->page_table_base;
+2 −21
Original line number Diff line number Diff line
@@ -1046,7 +1046,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)

		kfd_free_process_doorbells(pdd->dev->kfd, pdd);

		if (pdd->dev->kfd->shared_resources.enable_mes)
		if (pdd->dev->kfd->shared_resources.enable_mes &&
			pdd->proc_ctx_cpu_ptr)
			amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
						   &pdd->proc_ctx_bo);
		/*
@@ -1572,7 +1573,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
							struct kfd_process *p)
{
	struct kfd_process_device *pdd = NULL;
	int retval = 0;

	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
		return NULL;
@@ -1596,21 +1596,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
	pdd->user_gpu_id = dev->id;
	atomic64_set(&pdd->evict_duration_counter, 0);

	if (dev->kfd->shared_resources.enable_mes) {
		retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
						AMDGPU_MES_PROC_CTX_SIZE,
						&pdd->proc_ctx_bo,
						&pdd->proc_ctx_gpu_addr,
						&pdd->proc_ctx_cpu_ptr,
						false);
		if (retval) {
			dev_err(dev->adev->dev,
				"failed to allocate process context bo\n");
			goto err_free_pdd;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	p->pdds[p->n_pdds++] = pdd;
	if (kfd_dbg_is_per_vmid_supported(pdd->dev))
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
@@ -1622,10 +1607,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
	idr_init(&pdd->alloc_idr);

	return pdd;

err_free_pdd:
	kfree(pdd);
	return NULL;
}

/**