Commit 860e73b4 authored by Greg Kroah-Hartman's avatar Greg Kroah-Hartman
Browse files

Merge tag 'misc-habanalabs-next-2020-07-24' of...

Merge tag 'misc-habanalabs-next-2020-07-24' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.9-rc1:

- Remove rate limiters from GAUDI configuration (no longer needed).
- Set maximum amount of in-flight CS per ASIC type and increase
  the maximum amount for GAUDI.
- Refactor signal/wait command submissions code
- Calculate trace frequency from PLLs to show accurate profiling data
- Rephrase error messages to make them more clear to the common user
- Add statistics of dropped CS (counter per possible reason for drop)
- Get ECC information from firmware
- Remove support for partial SoC reset in Gaudi
- Halt device CPU only when reset is certain to happen. Sometimes we abort
  the reset procedure and in that case we can't leave device CPU in halt
  mode.
- set each CQ to its own work queue to prevent a race between completions
  on different CQs.
- Use queue pi/ci in order to determine queue occupancy. This is done to
  make the code reusable between current and future ASICs.
- Add more validations for user inputs.
- Refactor PCIe controller configuration to make the code reusable between
  current and future ASICs.
- Update firmware interface headers to latest version
- Move all common code to a dedicated common sub-folder

* tag 'misc-habanalabs-next-2020-07-24' of git://people.freedesktop.org/~gabbayo/linux: (28 commits)
  habanalabs: Fix memory leak in error flow of context initialization
  habanalabs: use no flags on MMU cache invalidation
  habanalabs: enable device before hw_init()
  habanalabs: create internal CB pool
  habanalabs: update hl_boot_if.h from firmware
  habanalabs: create common folder
  habanalabs: check for DMA errors when clearing memory
  habanalabs: verify queue can contain all cs jobs
  habanalabs: Assign each CQ with its own work queue
  habanalabs: halt device CPU only upon certain reset
  habanalabs: remove unused hash
  habanalabs: use queue pi/ci in order to determine queue occupancy
  habanalabs: configure maximum queues per asic
  habanalabs: remove soft-reset support from GAUDI
  habanalabs: PCIe iATU refactoring
  habanalabs: Extract ECC information from FW
  habanalabs: Add dropped cs statistics info struct
  habanalabs: extract cpu boot status lookup
  habanalabs: rephrase error messages
  habanalabs: Increase queues depth
  ...
parents 54918b8e 94f8be9e
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -3,16 +3,15 @@
# Makefile for HabanaLabs AI accelerators driver
#

obj-m	:= habanalabs.o
obj-$(CONFIG_HABANA_AI) := habanalabs.o

habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
		command_submission.o mmu.o firmware_if.o pci.o

habanalabs-$(CONFIG_DEBUG_FS) += debugfs.o
include $(src)/common/Makefile
habanalabs-y += $(HL_COMMON_FILES)

include $(src)/goya/Makefile
habanalabs-y += $(HL_GOYA_FILES)

include $(src)/gaudi/Makefile
habanalabs-y += $(HL_GAUDI_FILES)

habanalabs-$(CONFIG_DEBUG_FS) += common/debugfs.o
+9 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
subdir-ccflags-y += -I$(src)/common

HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
		common/asid.o common/habanalabs_ioctl.o \
		common/command_buffer.o common/hw_queue.o common/irq.o \
		common/sysfs.o common/hwmon.o common/memory.o \
		common/command_submission.o common/mmu.o common/firmware_if.o \
		common/pci.o
+52 −30
Original line number Diff line number Diff line
@@ -10,12 +10,18 @@

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/genalloc.h>

static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
	if (cb->is_internal)
		gen_pool_free(hdev->internal_cb_pool,
				cb->kernel_address, cb->size);
	else
		hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
				(void *) (uintptr_t) cb->kernel_address,
				cb->bus_address);

	kfree(cb);
}

@@ -44,9 +50,10 @@ static void cb_release(struct kref *ref)
}

static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
					int ctx_id)
					int ctx_id, bool internal_cb)
{
	struct hl_cb *cb;
	u32 cb_offset;
	void *p;

	/*
@@ -65,13 +72,25 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
	if (!cb)
		return NULL;

	if (ctx_id == HL_KERNEL_ASID_ID)
	if (internal_cb) {
		p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size);
		if (!p) {
			kfree(cb);
			return NULL;
		}

		cb_offset = p - hdev->internal_cb_pool_virt_addr;
		cb->is_internal = true;
		cb->bus_address =  hdev->internal_cb_va_base + cb_offset;
	} else if (ctx_id == HL_KERNEL_ASID_ID) {
		p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
						&cb->bus_address, GFP_ATOMIC);
	else
	} else {
		p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
						&cb->bus_address,
						GFP_USER | __GFP_ZERO);
	}

	if (!p) {
		dev_err(hdev->dev,
			"failed to allocate %d of dma memory for CB\n",
@@ -87,7 +106,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
}

int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
			u32 cb_size, u64 *handle, int ctx_id)
			u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
{
	struct hl_cb *cb;
	bool alloc_new_cb = true;
@@ -112,6 +131,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
		goto out_err;
	}

	if (!internal_cb) {
		/* Minimum allocation must be PAGE SIZE */
		if (cb_size < PAGE_SIZE)
			cb_size = PAGE_SIZE;
@@ -121,8 +141,8 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,

			spin_lock(&hdev->cb_pool_lock);
			if (!list_empty(&hdev->cb_pool)) {
			cb = list_first_entry(&hdev->cb_pool, typeof(*cb),
					pool_list);
				cb = list_first_entry(&hdev->cb_pool,
						typeof(*cb), pool_list);
				list_del(&cb->pool_list);
				spin_unlock(&hdev->cb_pool_lock);
				alloc_new_cb = false;
@@ -131,9 +151,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
				dev_dbg(hdev->dev, "CB pool is empty\n");
			}
		}
	}

	if (alloc_new_cb) {
		cb = hl_cb_alloc(hdev, cb_size, ctx_id);
		cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb);
		if (!cb) {
			rc = -ENOMEM;
			goto out_err;
@@ -230,7 +251,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
		} else {
			rc = hl_cb_create(hdev, &hpriv->cb_mgr,
					args->in.cb_size, &handle,
						hpriv->ctx->asid);
					hpriv->ctx->asid, false);
		}

		memset(args, 0, sizeof(*args));
@@ -398,14 +419,15 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
	idr_destroy(&mgr->cb_handles);
}

struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size)
struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
					bool internal_cb)
{
	u64 cb_handle;
	struct hl_cb *cb;
	int rc;

	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
			HL_KERNEL_ASID_ID);
			HL_KERNEL_ASID_ID, internal_cb);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to allocate CB for the kernel driver %d\n", rc);
@@ -437,7 +459,7 @@ int hl_cb_pool_init(struct hl_device *hdev)

	for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) {
		cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size,
				HL_KERNEL_ASID_ID);
				HL_KERNEL_ASID_ID, false);
		if (cb) {
			cb->is_pool = true;
			list_add(&cb->pool_list, &hdev->cb_pool);
+74 −23
Original line number Diff line number Diff line
@@ -246,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
	kfree(job);
}

static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
{
	hdev->aggregated_cs_counters.device_in_reset_drop_cnt +=
			ctx->cs_counters.device_in_reset_drop_cnt;
	hdev->aggregated_cs_counters.out_of_mem_drop_cnt +=
			ctx->cs_counters.out_of_mem_drop_cnt;
	hdev->aggregated_cs_counters.parsing_drop_cnt +=
			ctx->cs_counters.parsing_drop_cnt;
	hdev->aggregated_cs_counters.queue_full_drop_cnt +=
			ctx->cs_counters.queue_full_drop_cnt;
}

static void cs_do_release(struct kref *ref)
{
	struct hl_cs *cs = container_of(ref, struct hl_cs,
@@ -349,6 +361,9 @@ static void cs_do_release(struct kref *ref)
	dma_fence_signal(cs->fence);
	dma_fence_put(cs->fence);

	cs_counters_aggregate(hdev, cs->ctx);

	kfree(cs->jobs_in_queue_cnt);
	kfree(cs);
}

@@ -373,9 +388,9 @@ static void cs_timedout(struct work_struct *work)
	hdev = cs->ctx->hdev;
	ctx_asid = cs->ctx->asid;

	/* TODO: add information about last signaled seq and last emitted seq */
	dev_err(hdev->dev, "User %d command submission %llu got stuck!\n",
		ctx_asid, cs->sequence);
	dev_err(hdev->dev,
		"Command submission %llu has not finished in time!\n",
		cs->sequence);

	cs_put(cs);

@@ -418,21 +433,29 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
	spin_lock(&ctx->cs_lock);

	cs_cmpl->cs_seq = ctx->cs_sequence;
	other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
	other = ctx->cs_pending[cs_cmpl->cs_seq &
				(hdev->asic_prop.max_pending_cs - 1)];
	if ((other) && (!dma_fence_is_signaled(other))) {
		spin_unlock(&ctx->cs_lock);
		dev_dbg(hdev->dev,
			"Rejecting CS because of too many in-flights CS\n");
		rc = -EAGAIN;
		goto free_fence;
	}

	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
	if (!cs->jobs_in_queue_cnt) {
		rc = -ENOMEM;
		goto free_fence;
	}

	dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
			ctx->asid, ctx->cs_sequence);

	cs->sequence = cs_cmpl->cs_seq;

	ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
	ctx->cs_pending[cs_cmpl->cs_seq &
			(hdev->asic_prop.max_pending_cs - 1)] =
							&cs_cmpl->base_fence;
	ctx->cs_sequence++;

@@ -447,6 +470,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
	return 0;

free_fence:
	spin_unlock(&ctx->cs_lock);
	kfree(cs_cmpl);
free_cs:
	kfree(cs);
@@ -463,10 +487,12 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)

void hl_cs_rollback_all(struct hl_device *hdev)
{
	int i;
	struct hl_cs *cs, *tmp;

	/* flush all completions */
	flush_workqueue(hdev->cq_wq);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		flush_workqueue(hdev->cq_wq[i]);

	/* Make sure we don't have leftovers in the H/W queues mirror list */
	list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
@@ -499,10 +525,18 @@ static int validate_queue_index(struct hl_device *hdev,
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hw_queue_properties *hw_queue_prop;

	/* This must be checked here to prevent out-of-bounds access to
	 * hw_queues_props array
	 */
	if (chunk->queue_index >= asic->max_queues) {
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			chunk->queue_index);
		return -EINVAL;
	}

	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];

	if ((chunk->queue_index >= HL_MAX_QUEUES) ||
			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
	if (hw_queue_prop->type == QUEUE_TYPE_NA) {
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			chunk->queue_index);
		return -EINVAL;
@@ -630,12 +664,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,

		rc = validate_queue_index(hdev, chunk, &queue_type,
						&is_kernel_allocated_cb);
		if (rc)
		if (rc) {
			hpriv->ctx->cs_counters.parsing_drop_cnt++;
			goto free_cs_object;
		}

		if (is_kernel_allocated_cb) {
			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
			if (!cb) {
				hpriv->ctx->cs_counters.parsing_drop_cnt++;
				rc = -EINVAL;
				goto free_cs_object;
			}
@@ -649,6 +686,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
		job = hl_cs_allocate_job(hdev, queue_type,
						is_kernel_allocated_cb);
		if (!job) {
			hpriv->ctx->cs_counters.out_of_mem_drop_cnt++;
			dev_err(hdev->dev, "Failed to allocate a new job\n");
			rc = -ENOMEM;
			if (is_kernel_allocated_cb)
@@ -681,6 +719,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,

		rc = cs_parser(hpriv, job);
		if (rc) {
			hpriv->ctx->cs_counters.parsing_drop_cnt++;
			dev_err(hdev->dev,
				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
				cs->ctx->asid, cs->sequence, job->id, rc);
@@ -689,6 +728,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
	}

	if (int_queues_only) {
		hpriv->ctx->cs_counters.parsing_drop_cnt++;
		dev_err(hdev->dev,
			"Reject CS %d.%llu because only internal queues jobs are present\n",
			cs->ctx->asid, cs->sequence);
@@ -738,6 +778,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
	struct hl_cs_job *job;
	struct hl_cs *cs;
	struct hl_cb *cb;
	enum hl_queue_type q_type;
	u64 *signal_seq_arr = NULL, signal_seq;
	u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
	int rc;
@@ -770,9 +811,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
	chunk = &cs_chunk_array[0];
	q_idx = chunk->queue_index;
	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
	q_type = hw_queue_prop->type;

	if ((q_idx >= HL_MAX_QUEUES) ||
			(hw_queue_prop->type != QUEUE_TYPE_EXT)) {
	if ((q_idx >= hdev->asic_prop.max_queues) ||
			(!hw_queue_prop->supports_sync_stream)) {
		dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
		rc = -EINVAL;
		goto free_cs_chunk_array;
@@ -869,25 +911,28 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,

	*cs_seq = cs->sequence;

	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
	job = hl_cs_allocate_job(hdev, q_type, true);
	if (!job) {
		ctx->cs_counters.out_of_mem_drop_cnt++;
		dev_err(hdev->dev, "Failed to allocate a new job\n");
		rc = -ENOMEM;
		goto put_cs;
	}

	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
	if (cs->type == CS_TYPE_WAIT)
		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
	else
		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);

	cb = hl_cb_kernel_create(hdev, cb_size,
				q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
	if (!cb) {
		ctx->cs_counters.out_of_mem_drop_cnt++;
		kfree(job);
		rc = -EFAULT;
		goto put_cs;
	}

	if (cs->type == CS_TYPE_WAIT)
		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
	else
		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);

	job->id = 0;
	job->cs = cs;
	job->user_cb = cb;
@@ -1126,7 +1171,7 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
		rc = PTR_ERR(fence);
		if (rc == -EINVAL)
			dev_notice_ratelimited(hdev->dev,
				"Can't wait on seq %llu because current CS is at seq %llu\n",
				"Can't wait on CS %llu because current CS is at seq %llu\n",
				seq, ctx->cs_sequence);
	} else if (fence) {
		rc = dma_fence_wait_timeout(fence, true, timeout);
@@ -1159,15 +1204,21 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
	memset(args, 0, sizeof(*args));

	if (rc < 0) {
		dev_err_ratelimited(hdev->dev,
				"Error %ld on waiting for CS handle %llu\n",
				rc, seq);
		if (rc == -ERESTARTSYS) {
			dev_err_ratelimited(hdev->dev,
				"user process got signal while waiting for CS handle %llu\n",
				seq);
			args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
			rc = -EINTR;
		} else if (rc == -ETIMEDOUT) {
			dev_err_ratelimited(hdev->dev,
				"CS %llu has timed-out while user process is waiting for it\n",
				seq);
			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
		} else if (rc == -EIO) {
			dev_err_ratelimited(hdev->dev,
				"CS %llu has been aborted while user process is waiting for it\n",
				seq);
			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
		}
		return rc;
Loading