Commit b41896e3 authored by Felix Kuehling, committed by Alex Deucher

drm/amdkfd: add svm_bo eviction mechanism support



The svm_bo eviction mechanism is different from that of regular BOs.
Every SVM_BO created contains one eviction fence and one worker item
for the eviction process. An SVM_BO can be attached to one or more
pranges. To evict SVM_BOs, TTM calls the enable_signaling callback on
each SVM_BO's eviction fence until enough VRAM space is available.
All the ttm_evict calls here are synchronous, which guarantees that
each eviction has completed and the fence has signaled before the
call returns.
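
For context, the TTM side of this mechanism is not in this diff; it
lands in amdgpu_amdkfd_fence.c in a companion change. A rough sketch
of how the fence's enable_signaling callback dispatches SVM_BO fences
to the helper added below (illustrative, not the authoritative patch):

	static bool amdkfd_fence_enable_signaling(struct dma_fence *f)
	{
		struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);

		if (!fence)
			return false;

		if (dma_fence_is_signaled(f))
			return true;

		if (!fence->svm_bo) {
			/* Process eviction fence: evict the whole process */
			if (!kgd2kfd_schedule_evict_and_restore_process(fence->mm, f))
				return true;
		} else {
			/* SVM_BO fence: schedule the per-BO eviction worker */
			if (!svm_range_schedule_evict_svm_bo(fence))
				return true;
		}
		return false;
	}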

Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f04c79cf
drivers/gpu/drm/amd/amdkfd/kfd_svm.c +159 −42
@@ -35,6 +35,7 @@
 
 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
 
+static void svm_range_evict_svm_bo_worker(struct work_struct *work);
 static bool
 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
 				    const struct mmu_notifier_range *range,
@@ -320,7 +321,15 @@ static void svm_range_bo_release(struct kref *kref)
 		spin_lock(&svm_bo->list_lock);
 	}
 	spin_unlock(&svm_bo->list_lock);
 
+	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
+		/* We're not in the eviction worker.
+		 * Signal the fence and synchronize with any
+		 * pending eviction work.
+		 */
+		dma_fence_signal(&svm_bo->eviction_fence->base);
+		cancel_work_sync(&svm_bo->eviction_work);
+	}
+	dma_fence_put(&svm_bo->eviction_fence->base);
 	amdgpu_bo_unref(&svm_bo->bo);
 	kfree(svm_bo);
 }
@@ -333,6 +342,61 @@ static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
 	kref_put(&svm_bo->kref, svm_range_bo_release);
 }
 
+static bool svm_range_validate_svm_bo(struct svm_range *prange)
+{
+	mutex_lock(&prange->lock);
+	if (!prange->svm_bo) {
+		mutex_unlock(&prange->lock);
+		return false;
+	}
+	if (prange->ttm_res) {
+		/* We still have a reference, all is well */
+		mutex_unlock(&prange->lock);
+		return true;
+	}
+	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
+		if (READ_ONCE(prange->svm_bo->evicting)) {
+			struct dma_fence *f;
+			struct svm_range_bo *svm_bo;
+			/* The BO is getting evicted,
+			 * we need to get a new one
+			 */
+			mutex_unlock(&prange->lock);
+			svm_bo = prange->svm_bo;
+			f = dma_fence_get(&svm_bo->eviction_fence->base);
+			svm_range_bo_unref(prange->svm_bo);
+			/* wait for the fence to avoid long spin-loop
+			 * at list_empty_careful
+			 */
+			dma_fence_wait(f, false);
+			dma_fence_put(f);
+		} else {
+			/* The BO was still around and we got
+			 * a new reference to it
+			 */
+			mutex_unlock(&prange->lock);
+			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
+				 prange->svms, prange->start, prange->last);
+
+			prange->ttm_res = &prange->svm_bo->bo->tbo.mem;
+			return true;
+		}
+
+	} else {
+		mutex_unlock(&prange->lock);
+	}
+
+	/* We need a new svm_bo. Spin-loop to wait for concurrent
+	 * svm_range_bo_release to finish removing this range from
+	 * its range list. After this, it is safe to reuse the
+	 * svm_bo pointer and svm_bo_list head.
+	 */
+	while (!list_empty_careful(&prange->svm_bo_list))
+		;
+
+	return false;
+}
+
 static struct svm_range_bo *svm_range_bo_new(void)
 {
 	struct svm_range_bo *svm_bo;
@@ -352,72 +416,56 @@ int
 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
 			bool clear)
 {
-	struct amdkfd_process_info *process_info;
 	struct amdgpu_bo_param bp;
 	struct svm_range_bo *svm_bo;
 	struct amdgpu_bo_user *ubo;
 	struct amdgpu_bo *bo;
 	struct kfd_process *p;
+	struct mm_struct *mm;
 	int r;
 
-	pr_debug("[0x%lx 0x%lx]\n", prange->start, prange->last);
-	mutex_lock(&prange->lock);
-	if (prange->svm_bo) {
-		if (prange->ttm_res) {
-			/* We still have a reference, all is well */
-			mutex_unlock(&prange->lock);
-			return 0;
-		}
-		if (svm_bo_ref_unless_zero(prange->svm_bo)) {
-			/* The BO was still around and we got
-			 * a new reference to it
-			 */
-			mutex_unlock(&prange->lock);
-			pr_debug("reuse old bo [0x%lx 0x%lx]\n",
-				 prange->start, prange->last);
+	p = container_of(prange->svms, struct kfd_process, svms);
+	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
+		 prange->start, prange->last);
 
-			prange->ttm_res = &prange->svm_bo->bo->tbo.mem;
-			return 0;
-		}
-
-		mutex_unlock(&prange->lock);
-
-		/* We need a new svm_bo. Spin-loop to wait for concurrent
-		 * svm_range_bo_release to finish removing this range from
-		 * its range list. After this, it is safe to reuse the
-		 * svm_bo pointer and svm_bo_list head.
-		 */
-		while (!list_empty_careful(&prange->svm_bo_list))
-			;
-
-	} else {
-		mutex_unlock(&prange->lock);
-	}
+	if (svm_range_validate_svm_bo(prange))
+		return 0;
 
 	svm_bo = svm_range_bo_new();
 	if (!svm_bo) {
 		pr_debug("failed to alloc svm bo\n");
 		return -ENOMEM;
 	}
 
+	mm = get_task_mm(p->lead_thread);
+	if (!mm) {
+		pr_debug("failed to get mm\n");
+		kfree(svm_bo);
+		return -ESRCH;
+	}
+	svm_bo->svms = prange->svms;
+	svm_bo->eviction_fence =
+		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
+					   mm,
+					   svm_bo);
+	mmput(mm);
+	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
+	svm_bo->evicting = 0;
 	memset(&bp, 0, sizeof(bp));
 	bp.size = prange->npages * PAGE_SIZE;
 	bp.byte_align = PAGE_SIZE;
 	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
 	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
 	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
+	bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO;
 	bp.type = ttm_bo_type_device;
 	bp.resv = NULL;
 
 	r = amdgpu_bo_create_user(adev, &bp, &ubo);
 	if (r) {
 		pr_debug("failed %d to create bo\n", r);
-		kfree(svm_bo);
-		return r;
+		goto create_bo_failed;
 	}
 	bo = &ubo->bo;
 
-	p = container_of(prange->svms, struct kfd_process, svms);
 	r = amdgpu_bo_reserve(bo, true);
 	if (r) {
 		pr_debug("failed %d to reserve bo\n", r);
@@ -430,8 +478,7 @@ svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
 		amdgpu_bo_unreserve(bo);
 		goto reserve_bo_failed;
 	}
-	process_info = p->kgd_process_info;
-	amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
+	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
 
 	amdgpu_bo_unreserve(bo);
 
@@ -447,8 +494,10 @@ svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
 	return 0;
 
 reserve_bo_failed:
-	kfree(svm_bo);
 	amdgpu_bo_unref(&bo);
+create_bo_failed:
+	dma_fence_put(&svm_bo->eviction_fence->base);
+	kfree(svm_bo);
 	prange->ttm_res = NULL;
 
 	return r;
@@ -2329,6 +2378,74 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
 	return r;
 }
 
+int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
+{
+	if (!fence)
+		return -EINVAL;
+
+	if (dma_fence_is_signaled(&fence->base))
+		return 0;
+
+	if (fence->svm_bo) {
+		WRITE_ONCE(fence->svm_bo->evicting, 1);
+		schedule_work(&fence->svm_bo->eviction_work);
+	}
+
+	return 0;
+}
+
+static void svm_range_evict_svm_bo_worker(struct work_struct *work)
+{
+	struct svm_range_bo *svm_bo;
+	struct kfd_process *p;
+	struct mm_struct *mm;
+
+	svm_bo = container_of(work, struct svm_range_bo, eviction_work);
+	if (!svm_bo_ref_unless_zero(svm_bo))
+		return; /* svm_bo was freed while eviction was pending */
+
+	/* svm_range_bo_release destroys this worker thread. So during
+	 * the lifetime of this thread, kfd_process and mm will be valid.
+	 */
+	p = container_of(svm_bo->svms, struct kfd_process, svms);
+	mm = p->mm;
+	if (!mm)
+		return;
+
+	mmap_read_lock(mm);
+	spin_lock(&svm_bo->list_lock);
+	while (!list_empty(&svm_bo->range_list)) {
+		struct svm_range *prange =
+				list_first_entry(&svm_bo->range_list,
+						struct svm_range, svm_bo_list);
+		list_del_init(&prange->svm_bo_list);
+		spin_unlock(&svm_bo->list_lock);
+
+		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
+			 prange->start, prange->last);
+
+		mutex_lock(&prange->migrate_mutex);
+		svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm);
+
+		mutex_lock(&prange->lock);
+		prange->svm_bo = NULL;
+		mutex_unlock(&prange->lock);
+
+		mutex_unlock(&prange->migrate_mutex);
+
+		spin_lock(&svm_bo->list_lock);
+	}
+	spin_unlock(&svm_bo->list_lock);
+	mmap_read_unlock(mm);
+
+	dma_fence_signal(&svm_bo->eviction_fence->base);
+	/* This is the last reference to svm_bo, after svm_range_vram_node_free
+	 * has been called in svm_migrate_vram_to_ram
+	 */
+	WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
+	svm_range_bo_unref(svm_bo);
+}
+
 static int
 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
 		   uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
drivers/gpu/drm/amd/amdkfd/kfd_svm.h +9 −4
@@ -38,6 +38,10 @@ struct svm_range_bo {
 	struct kref			kref;
 	struct list_head		range_list; /* all svm ranges shared this bo */
 	spinlock_t			list_lock;
+	struct amdgpu_amdkfd_fence	*eviction_fence;
+	struct work_struct		eviction_work;
+	struct svm_range_list		*svms;
+	uint32_t			evicting;
 };
 
 enum svm_work_list_ops {
@@ -157,6 +161,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
 			       struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev,
 			    unsigned int pasid, uint64_t addr);
+int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
 			     struct svm_range *prange, struct mm_struct *mm,
 			     enum svm_work_list_ops op);
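
Note: svm_range_validate_svm_bo() and svm_range_evict_svm_bo_worker()
rely on the pre-existing helper svm_bo_ref_unless_zero() in kfd_svm.c,
which only takes a reference while the refcount is still non-zero,
i.e. while svm_range_bo_release() has not started tearing the BO down.
Roughly:

	static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
	{
		/* kref_get_unless_zero() fails once the last reference is
		 * gone, so an svm_bo already in svm_range_bo_release()
		 * cannot be resurrected.
		 */
		if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
			return false;

		return true;
	}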