Commit 8445dde1 authored by Ofir Bitton, committed by Oded Gabbay
Browse files

habanalabs: move relevant datapath work outside cs lock



In order to shorten the time the CS lock is held, move any work that
does not need the lock outside of it.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 2f6274e4
Loading
Loading
Loading
Loading
+52 −34
Original line number Diff line number Diff line
@@ -84,31 +84,12 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
	return 0;
}

static void hl_fence_release(struct kref *kref)
static void sob_reset_work(struct work_struct *work)
{
	struct hl_fence *fence =
		container_of(kref, struct hl_fence, refcount);
	struct hl_cs_compl *hl_cs_cmpl =
		container_of(fence, struct hl_cs_compl, base_fence);
		container_of(work, struct hl_cs_compl, sob_reset_work);
	struct hl_device *hdev = hl_cs_cmpl->hdev;

	/* EBUSY means the CS was never submitted and hence we don't have
	 * an attached hw_sob object that we should handle here
	 */
	if (fence->error == -EBUSY)
		goto free;

	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
		(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
		(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {

		dev_dbg(hdev->dev,
			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
			hl_cs_cmpl->cs_seq,
			hl_cs_cmpl->type,
			hl_cs_cmpl->hw_sob->sob_id,
			hl_cs_cmpl->sob_val);

	/*
	 * A signal CS can get completion while the corresponding wait
	 * for signal CS is on its way to the PQ. The wait for signal CS
@@ -131,6 +112,38 @@ static void hl_fence_release(struct kref *kref)
	if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
		hdev->asic_funcs->reset_sob_group(hdev,
				hl_cs_cmpl->sob_group);

	kfree(hl_cs_cmpl);
}

static void hl_fence_release(struct kref *kref)
{
	struct hl_fence *fence =
		container_of(kref, struct hl_fence, refcount);
	struct hl_cs_compl *hl_cs_cmpl =
		container_of(fence, struct hl_cs_compl, base_fence);
	struct hl_device *hdev = hl_cs_cmpl->hdev;

	/* EBUSY means the CS was never submitted and hence we don't have
	 * an attached hw_sob object that we should handle here
	 */
	if (fence->error == -EBUSY)
		goto free;

	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
		(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
		(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {

		dev_dbg(hdev->dev,
			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
			hl_cs_cmpl->cs_seq,
			hl_cs_cmpl->type,
			hl_cs_cmpl->hw_sob->sob_id,
			hl_cs_cmpl->sob_val);

		queue_work(hdev->sob_reset_wq, &hl_cs_cmpl->sob_reset_work);

		return;
	}

free:
@@ -670,9 +683,23 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
		goto free_cs;
	}

	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
	if (!cs->jobs_in_queue_cnt)
		cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
				sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);

	if (!cs->jobs_in_queue_cnt) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_cs_cmpl;
	}

	cs_cmpl->hdev = hdev;
	cs_cmpl->type = cs->type;
	spin_lock_init(&cs_cmpl->lock);
	INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work);
	cs->fence = &cs_cmpl->base_fence;

	spin_lock(&ctx->cs_lock);
@@ -702,19 +729,6 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
		goto free_fence;
	}

	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
	if (!cs->jobs_in_queue_cnt)
		cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
				sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);

	if (!cs->jobs_in_queue_cnt) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_fence;
	}

	/* init hl_fence */
	hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);

@@ -737,6 +751,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,

free_fence:
	spin_unlock(&ctx->cs_lock);
	kfree(cs->jobs_in_queue_cnt);
free_cs_cmpl:
	kfree(cs_cmpl);
free_cs:
	kfree(cs);
@@ -759,6 +775,8 @@ void hl_cs_rollback_all(struct hl_device *hdev)
	int i;
	struct hl_cs *cs, *tmp;

	flush_workqueue(hdev->sob_reset_wq);

	/* flush all completions before iterating over the CS mirror list in
	 * order to avoid a race with the release functions
	 */
+12 −1
Original line number Diff line number Diff line
@@ -368,11 +368,19 @@ static int device_early_init(struct hl_device *hdev)
		goto free_cq_wq;
	}

	hdev->sob_reset_wq = alloc_workqueue("hl-sob-reset", WQ_UNBOUND, 0);
	if (!hdev->sob_reset_wq) {
		dev_err(hdev->dev,
			"Failed to allocate SOB reset workqueue\n");
		rc = -ENOMEM;
		goto free_eq_wq;
	}

	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
					GFP_KERNEL);
	if (!hdev->hl_chip_info) {
		rc = -ENOMEM;
		goto free_eq_wq;
		goto free_sob_reset_wq;
	}

	hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
@@ -418,6 +426,8 @@ static int device_early_init(struct hl_device *hdev)
	kfree(hdev->idle_busy_ts_arr);
free_chip_info:
	kfree(hdev->hl_chip_info);
free_sob_reset_wq:
	destroy_workqueue(hdev->sob_reset_wq);
free_eq_wq:
	destroy_workqueue(hdev->eq_wq);
free_cq_wq:
@@ -454,6 +464,7 @@ static void device_early_fini(struct hl_device *hdev)
	kfree(hdev->idle_busy_ts_arr);
	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->sob_reset_wq);
	destroy_workqueue(hdev->eq_wq);
	destroy_workqueue(hdev->device_reset_work.wq);

+4 −0
Original line number Diff line number Diff line
@@ -528,6 +528,7 @@ struct hl_fence {

/**
 * struct hl_cs_compl - command submission completion object.
 * @sob_reset_work: workqueue object to run SOB reset flow.
 * @base_fence: hl fence object.
 * @lock: spinlock to protect fence.
 * @hdev: habanalabs device structure.
@@ -538,6 +539,7 @@ struct hl_fence {
 * @sob_group: the SOB group that is used in this collective wait CS.
 */
struct hl_cs_compl {
	struct work_struct	sob_reset_work;
	struct hl_fence		base_fence;
	spinlock_t		lock;
	struct hl_device	*hdev;
@@ -1905,6 +1907,7 @@ struct hl_mmu_funcs {
 * @cq_wq: work queues of completion queues for executing work in process
 *         context.
 * @eq_wq: work queue of event queue for executing work in process context.
 * @sob_reset_wq: work queue for sob reset executions.
 * @kernel_ctx: Kernel driver context structure.
 * @kernel_queues: array of hl_hw_queue.
 * @cs_mirror_list: CS mirror list for TDR.
@@ -2022,6 +2025,7 @@ struct hl_device {
	struct hl_user_interrupt	common_user_interrupt;
	struct workqueue_struct		**cq_wq;
	struct workqueue_struct		*eq_wq;
	struct workqueue_struct		*sob_reset_wq;
	struct hl_ctx			*kernel_ctx;
	struct hl_hw_queue		*kernel_queues;
	struct list_head		cs_mirror_list;