Commit 56b0989e authored by Christian König's avatar Christian König Committed by Alex Deucher
Browse files

drm/amdgpu: fix GDS/GWS/OA switch handling



Bas pointed out that this isn't working as expected and could cause
crashes. Fix the handling by storing the marker that a switch is needed
inside the job instead.

Reported-by: default avatarBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent e0607c10
Loading
Loading
Loading
Loading
+34 −8
Original line number Diff line number Diff line
@@ -165,6 +165,26 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
		atomic_read(&adev->gpu_reset_counter);
}

/* Check if we need to switch to another set of resources */
static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
					  struct amdgpu_job *job)
{
	return id->gds_base != job->gds_base ||
		id->gds_size != job->gds_size ||
		id->gws_base != job->gws_base ||
		id->gws_size != job->gws_size ||
		id->oa_base != job->oa_base ||
		id->oa_size != job->oa_size;
}

/* Check if the id is compatible with the job */
static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
				   struct amdgpu_job *job)
{
	return  id->pd_gpu_addr == job->vm_pd_addr &&
		!amdgpu_vmid_gds_switch_needed(id, job);
}

/**
 * amdgpu_vmid_grab_idle - grab idle VMID
 *
@@ -265,7 +285,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,

	*id = vm->reserved_vmid[vmhub];
	if ((*id)->owner != vm->immediate.fence_context ||
	    (*id)->pd_gpu_addr != job->vm_pd_addr ||
	    !amdgpu_vmid_compatible(*id, job) ||
	    (*id)->flushed_updates < updates ||
	    !(*id)->last_flush ||
	    ((*id)->last_flush->context != fence_context &&
@@ -294,7 +314,6 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
	if (r)
		return r;

	(*id)->flushed_updates = updates;
	job->vm_needs_flush = needs_flush;
	return 0;
}
@@ -333,7 +352,7 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
		if ((*id)->owner != vm->immediate.fence_context)
			continue;

		if ((*id)->pd_gpu_addr != job->vm_pd_addr)
		if (!amdgpu_vmid_compatible(*id, job))
			continue;

		if (!(*id)->last_flush ||
@@ -355,7 +374,6 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
		if (r)
			return r;

		(*id)->flushed_updates = updates;
		job->vm_needs_flush |= needs_flush;
		return 0;
	}
@@ -408,22 +426,30 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
			if (r)
				goto error;

			id->flushed_updates = amdgpu_vm_tlb_seq(vm);
			job->vm_needs_flush = true;
		}

		list_move_tail(&id->list, &id_mgr->ids_lru);
	}

	id->pd_gpu_addr = job->vm_pd_addr;
	id->owner = vm->immediate.fence_context;

	job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
	if (job->vm_needs_flush) {
		id->flushed_updates = amdgpu_vm_tlb_seq(vm);
		dma_fence_put(id->last_flush);
		id->last_flush = NULL;
	}
	job->vmid = id - id_mgr->ids;
	job->pasid = vm->pasid;

	id->gds_base = job->gds_base;
	id->gds_size = job->gds_size;
	id->gws_base = job->gws_base;
	id->gws_size = job->gws_size;
	id->oa_base = job->oa_base;
	id->oa_size = job->oa_size;
	id->pd_gpu_addr = job->vm_pd_addr;
	id->owner = vm->immediate.fence_context;

	trace_amdgpu_vm_grab_id(vm, ring, job);

error:
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ struct amdgpu_job {
	uint32_t		preamble_status;
	uint32_t                preemption_status;
	bool                    vm_needs_flush;
	bool			gds_switch_needed;
	uint64_t		vm_pd_addr;
	unsigned		vmid;
	unsigned		pasid;
+19 −35
Original line number Diff line number Diff line
@@ -484,25 +484,20 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
	struct amdgpu_device *adev = ring->adev;
	unsigned vmhub = ring->funcs->vmhub;
	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
	struct amdgpu_vmid *id;
	bool gds_switch_needed;
	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;

	if (job->vmid == 0)
		return false;
	id = &id_mgr->ids[job->vmid];
	gds_switch_needed = ring->funcs->emit_gds_switch && (
		id->gds_base != job->gds_base ||
		id->gds_size != job->gds_size ||
		id->gws_base != job->gws_base ||
		id->gws_size != job->gws_size ||
		id->oa_base != job->oa_base ||
		id->oa_size != job->oa_size);

	if (amdgpu_vmid_had_gpu_reset(adev, id))

	if (job->vm_needs_flush || ring->has_compute_vm_bug)
		return true;

	if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
		return true;

	return vm_flush_needed || gds_switch_needed;
	if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
		return true;

	return false;
}

/**
@@ -524,13 +519,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
	unsigned vmhub = ring->funcs->vmhub;
	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
	bool gds_switch_needed = ring->funcs->emit_gds_switch && (
		id->gds_base != job->gds_base ||
		id->gds_size != job->gds_size ||
		id->gws_base != job->gws_base ||
		id->gws_size != job->gws_size ||
		id->oa_base != job->oa_base ||
		id->oa_size != job->oa_size);
	bool gds_switch_needed = ring->funcs->emit_gds_switch &&
		job->gds_switch_needed;
	bool vm_flush_needed = job->vm_needs_flush;
	struct dma_fence *fence = NULL;
	bool pasid_mapping_needed = false;
@@ -577,6 +567,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
	if (pasid_mapping_needed)
		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);

	if (!ring->is_mes_queue && ring->funcs->emit_gds_switch &&
	    gds_switch_needed) {
		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
					    job->gds_size, job->gws_base,
					    job->gws_size, job->oa_base,
					    job->oa_size);
	}

	if (vm_flush_needed || pasid_mapping_needed) {
		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
		if (r)
@@ -601,20 +599,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
	}
	dma_fence_put(fence);

	if (!ring->is_mes_queue && ring->funcs->emit_gds_switch &&
	    gds_switch_needed) {
		id->gds_base = job->gds_base;
		id->gds_size = job->gds_size;
		id->gws_base = job->gws_base;
		id->gws_size = job->gws_size;
		id->oa_base = job->oa_base;
		id->oa_size = job->oa_size;
		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
					    job->gds_size, job->gws_base,
					    job->gws_size, job->oa_base,
					    job->oa_size);
	}

	if (ring->funcs->patch_cond_exec)
		amdgpu_ring_patch_cond_exec(ring, patch_offset);