Commit f88e295e authored Apr 19, 2023 by Christian König Committed by Alex Deucher Jun 15, 2023

drm/amdgpu: add VM generation token



Instead of using the VRAM lost counter, add a 64-bit token which indicates
whether a context or job is still valid to use.

Should the VRAM be lost or the page tables need re-creation, the token will
change, indicating that userspace needs to act and re-create the contexts
and re-submit the work.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

parent 55bf196f

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -309,7 +309,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
		}
		p->gang_leader = p->jobs[p->gang_leader_idx];

		-if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
		+if (p->ctx->generation != p->gang_leader->generation) {
		ret = -ECANCELED;
		goto free_all_kdata;
		}

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -333,7 +333,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,

		ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
		ctx->reset_counter_query = ctx->reset_counter;
		-ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter);
		+ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm);
		ctx->init_priority = priority;
		ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;

		@@ -586,7 +586,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
		if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter))
		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET;

		-if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
		+if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;

		if (atomic_read(&ctx->guilty))

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ struct amdgpu_ctx {
		struct amdgpu_ctx_mgr *mgr;
		unsigned reset_counter;
		unsigned reset_counter_query;
		-uint32_t vram_lost_counter;
		+uint64_t generation;
		spinlock_t ring_lock;
		struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM];
		bool preamble_presented;

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -109,7 +109,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		(*job)->vm = vm;

		amdgpu_sync_create(&(*job)->explicit_sync);
		-(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
		+(*job)->generation = amdgpu_vm_generation(adev, vm);
		(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

		if (!entity)
		@@ -295,7 +295,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
		trace_amdgpu_sched_run_job(job);

		/* Skip job if VRAM is lost and never resubmit gangs */
		-if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
		+if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
		(job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -61,7 +61,7 @@ struct amdgpu_job {
		uint32_t gds_base, gds_size;
		uint32_t gws_base, gws_size;
		uint32_t oa_base, oa_size;
		-uint32_t vram_lost_counter;
		+uint64_t generation;

		/* user fence handling */
		uint64_t uf_addr;