Commit 4dee7a71 authored by Greg Kroah-Hartman's avatar Greg Kroah-Hartman
Browse files

Merge tag 'misc-habanalabs-next-2022-02-28' of...

Merge tag 'misc-habanalabs-next-2022-02-28' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains habanalabs driver changes for v5.18:

- Add new feature of recording time-stamp when a completion
  queue counter reaches a target value as determined by the
  userspace application. This is used by the graph compiler
  to accurately measure the time it takes for certain workloads
  to execute, which helps to fine-tune future compilations.

- Add two new attributes to sysfs that expose the VRM and
  f/w OS version

- Add a delay to the reset path that allows the driver to
  receive and handle additional events from the f/w before
  doing the reset. This can help when debugging why a reset
  event was received from the f/w.

- Re-factor some of the sysfs code in the driver. Mainly,
  move functions from hwmgr.c to more relevant files and
  totally remove hwmgr.c file.

- Fix multiple bugs such as races, use-after-free, ignoring
  error codes, etc.

- As usual, multiple minor changes and small fixes.

* tag 'misc-habanalabs-next-2022-02-28' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (50 commits)
  habanalabs: remove deprecated firmware states
  habanalabs: add an option to delay a device reset
  habanalabs: Add check for pci_enable_device
  habanalabs: Fix reset upon device release bug
  habanalabs: make sure device mem alloc is page aligned
  habanalabs/gaudi: add missing handling of NIC related events
  habanalabs/gaudi: handle axi errors from NIC engines
  habanalabs: allow user to set allocation page size
  habanalabs: use kernel-doc for memory ioctl documentation
  habanalabs: avoid using an uninitialized variable
  habanalabs: set max power on device init per ASIC
  habanalabs: use proper max_power variable for device utilization
  habanalabs: enable stop-on-error debugfs setting per ASIC
  habanalabs: change function to static
  habanalabs: add missing include of vmalloc.h
  habanalabs: fix use-after-free bug
  habanalabs: rephrase error messages in PCI initialization
  habanalabs: fix spelling mistake
  habanalabs: Timestamps buffers registration
  habanalabs: fix race when waiting on encaps signal
  ...
parents 0245107a 655221c5
Loading
Loading
Loading
Loading
+2 −18
Original line number Diff line number Diff line
@@ -12,24 +12,7 @@ What: /sys/kernel/debug/habanalabs/hl<n>/clk_gate
Date:           May 2020
KernelVersion:  5.8
Contact:        ogabbay@kernel.org
Description:    Allow the root user to disable/enable in runtime the clock
                gating mechanism in Gaudi. Due to how Gaudi is built, the
                clock gating needs to be disabled in order to access the
                registers of the TPC and MME engines. This is sometimes needed
                during debug by the user and hence the user needs this option.
                The user can supply a bitmask value, each bit represents
                a different engine to disable/enable its clock gating feature.
                The bitmask is composed of 20 bits:

                =======   ============
                0  -  7   DMA channels
                8  - 11   MME engines
                12 - 19   TPC engines
                =======   ============

                The bit's location of a specific engine can be determined
                using (1 << GAUDI_ENGINE_ID_*). GAUDI_ENGINE_ID_* values
                are defined in uapi habanalabs.h file in enum gaudi_engine_id
Description:    This setting is now deprecated as clock gating is handled solely by the f/w

What:           /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date:           Jan 2019
@@ -239,6 +222,7 @@ KernelVersion: 5.6
Contact:        ogabbay@kernel.org
Description:    Sets the stop-on-error option for the device engines. Value of
                "0" is for disable, otherwise enable.
                Relevant only for GOYA and GAUDI.

What:           /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
Date:           Sep 2021
+14 −2
Original line number Diff line number Diff line
@@ -69,6 +69,12 @@ KernelVersion: 5.1
Contact:        ogabbay@kernel.org
Description:    Displays the device's version from the eFuse

What:           /sys/class/habanalabs/hl<n>/fw_os_ver
Date:           Dec 2021
KernelVersion:  5.18
Contact:        ogabbay@kernel.org
Description:    Version of the firmware OS running on the device's CPU

What:           /sys/class/habanalabs/hl<n>/hard_reset
Date:           Jan 2019
KernelVersion:  5.1
@@ -115,7 +121,7 @@ What: /sys/class/habanalabs/hl<n>/infineon_ver
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Version of the Device's power supply F/W code
Description:    Version of the Device's power supply F/W code. Relevant only to GOYA and GAUDI

What:           /sys/class/habanalabs/hl<n>/max_power
Date:           Jan 2019
@@ -221,3 +227,9 @@ Date: Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Version of the u-boot running on the device's CPU

What:           /sys/class/habanalabs/hl<n>/vrm_ver
Date:           Jan 2022
KernelVersion:  5.18
Contact:        ogabbay@kernel.org
Description:    Version of the Device's Voltage Regulator Monitor F/W code. N/A to GOYA and GAUDI
+1 −1
Original line number Diff line number Diff line
@@ -11,4 +11,4 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
		common/command_buffer.o common/hw_queue.o common/irq.o \
		common/sysfs.o common/hwmon.o common/memory.o \
		common/command_submission.o common/firmware_if.o \
		common/state_dump.o common/hwmgr.o
		common/state_dump.o
+3 −1
Original line number Diff line number Diff line
@@ -424,8 +424,8 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
{
	union hl_cb_args *args = data;
	struct hl_device *hdev = hpriv->hdev;
	u64 handle = 0, device_va = 0;
	enum hl_device_status status;
	u64 handle = 0, device_va;
	u32 usage_cnt = 0;
	int rc;

@@ -464,6 +464,8 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
				args->in.flags,
				&usage_cnt,
				&device_va);
		if (rc)
			break;

		memset(&args->out, 0, sizeof(args->out));

+205 −60
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@
#define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
				HL_CS_FLAGS_COLLECTIVE_WAIT)

#define MAX_TS_ITER_NUM 10

/**
 * enum hl_cs_wait_status - cs wait status
 * @CS_WAIT_STATUS_BUSY: cs was not completed yet
@@ -919,12 +921,13 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
		complete_job(hdev, job);
}

void hl_cs_rollback_all(struct hl_device *hdev)
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
	int i;
	struct hl_cs *cs, *tmp;

	flush_workqueue(hdev->sob_reset_wq);
	if (!skip_wq_flush) {
		flush_workqueue(hdev->ts_free_obj_wq);

		/* flush all completions before iterating over the CS mirror list in
		 * order to avoid a race with the release functions
@@ -932,6 +935,8 @@ void hl_cs_rollback_all(struct hl_device *hdev)
		for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
			flush_workqueue(hdev->cq_wq[i]);

	}

	/* Make sure we don't have leftovers in the CS mirror list */
	list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
		cs_get(cs);
@@ -948,14 +953,20 @@ void hl_cs_rollback_all(struct hl_device *hdev)
static void
wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
{
	struct hl_user_pending_interrupt *pend;
	struct hl_user_pending_interrupt *pend, *temp;
	unsigned long flags;

	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_for_each_entry(pend, &interrupt->wait_list_head, wait_list_node) {
	list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
		if (pend->ts_reg_info.ts_buff) {
			list_del(&pend->wait_list_node);
			hl_ts_put(pend->ts_reg_info.ts_buff);
			hl_cb_put(pend->ts_reg_info.cq_cb);
		} else {
			pend->fence.error = -EIO;
			complete_all(&pend->fence.completion);
		}
	}
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
}

@@ -2063,13 +2074,16 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
			idp = &ctx->sig_mgr.handles;
			idr_for_each_entry(idp, encaps_sig_hdl, id) {
				if (encaps_sig_hdl->cs_seq == signal_seq) {
					handle_found = true;
					/* get refcount to protect removing
					 * this handle from idr, needed when
					 * multiple wait cs are used with offset
					/* get refcount to protect removing this handle from idr,
					 * needed when multiple wait cs are used with offset
					 * to wait on reserved encaps signals.
					 * Since kref_put of this handle is executed outside the
					 * current lock, it is possible that the handle refcount
					 * is 0 but it is yet to be removed from the list. In this
					 * case the handle needs to be considered as not valid.
					 */
					kref_get(&encaps_sig_hdl->refcount);
					if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
						handle_found = true;
					break;
				}
			}
@@ -2739,7 +2753,7 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
		mcs_data.update_ts = false;
		rc = hl_cs_poll_fences(&mcs_data, mcs_compl);

		if (mcs_data.completion_bitmap)
		if (rc || mcs_data.completion_bitmap)
			break;

		/*
@@ -2854,64 +2868,174 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
	return 0;
}

/**
 * ts_buff_get_kernel_ts_record - claim a timestamp registration record
 * @ts_buff: timestamp buffer object; its kernel buffer is treated as an array
 *           of struct hl_user_pending_interrupt records and its user buffer
 *           as an array of u64 timestamps
 * @cq_cb: command buffer whose kernel address holds the CQ counters
 * @ts_offset: index of the requested record inside the kernel buffer (the same
 *             index selects the u64 timestamp slot inside the user buffer)
 * @cq_offset: index, in u64 units, of the CQ counter inside @cq_cb
 * @target_value: value stored as the record's CQ target value
 * @wait_list_lock: lock taken while inspecting the record and, if needed,
 *                  removing it from the wait list it is linked on
 * @pend: on success, set to point at the claimed record
 *
 * If the requested record is already marked in use and its counter has not
 * yet reached its previous target, the record is unlinked from the wait list,
 * the references it held on its timestamp buffer and CQ buffer are dropped,
 * and the record is re-used. If the counter already reached the target, the
 * function sleeps briefly and retries, up to MAX_TS_ITER_NUM times.
 *
 * Return: 0 on success, -EINVAL if @ts_offset is out of range or if the
 * record stayed busy for MAX_TS_ITER_NUM retries.
 */
static int ts_buff_get_kernel_ts_record(struct hl_ts_buff *ts_buff,
					struct hl_cb *cq_cb,
					u64 ts_offset, u64 cq_offset, u64 target_value,
					spinlock_t *wait_list_lock,
					struct hl_user_pending_interrupt **pend)
{
	struct hl_user_pending_interrupt *records =
			(struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address;
	u64 max_records =
			ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt);
	/* One past the last valid record; used only for the error message */
	struct hl_user_pending_interrupt *cb_last = records + max_records;
	struct hl_user_pending_interrupt *requested_offset_record;
	unsigned long flags, iter_counter = 0;
	u64 current_cq_counter;

	/* Validate the index itself, before forming a pointer from it: valid
	 * records are [0, max_records), so an offset equal to max_records is
	 * already out of bounds, and comparing indices cannot be defeated by
	 * pointer wrap-around on a huge user-supplied offset.
	 */
	if (ts_offset >= max_records) {
		dev_err(ts_buff->hdev->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
								(u64)(uintptr_t)cb_last);
		return -EINVAL;
	}

	requested_offset_record = records + ts_offset;

start_over:
	spin_lock_irqsave(wait_list_lock, flags);

	/* Unregister only if we didn't reach the target value
	 * since in this case there will be no handling in irq context
	 * and then it's safe to delete the node out of the interrupt list
	 * then re-use it on other interrupt
	 */
	if (requested_offset_record->ts_reg_info.in_use) {
		current_cq_counter = *requested_offset_record->cq_kernel_addr;
		if (current_cq_counter < requested_offset_record->cq_target_value) {
			list_del(&requested_offset_record->wait_list_node);
			spin_unlock_irqrestore(wait_list_lock, flags);

			/* Drop the references held by the previous registration */
			hl_ts_put(requested_offset_record->ts_reg_info.ts_buff);
			hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);

			dev_dbg(ts_buff->hdev->dev, "ts node removed from interrupt list now can re-use\n");
		} else {
			dev_dbg(ts_buff->hdev->dev, "ts node in middle of irq handling\n");

			/* irq handling in the middle give it time to finish */
			spin_unlock_irqrestore(wait_list_lock, flags);
			usleep_range(1, 10);
			if (++iter_counter == MAX_TS_ITER_NUM) {
				dev_err(ts_buff->hdev->dev, "handling registration interrupt took too long!!\n");
				return -EINVAL;
			}

			goto start_over;
		}
	} else {
		spin_unlock_irqrestore(wait_list_lock, flags);
	}

	/* Fill up the new registration node info */
	requested_offset_record->ts_reg_info.in_use = 1;
	requested_offset_record->ts_reg_info.ts_buff = ts_buff;
	requested_offset_record->ts_reg_info.cq_cb = cq_cb;
	requested_offset_record->ts_reg_info.timestamp_kernel_addr =
			(u64 *) ts_buff->user_buff_address + ts_offset;
	requested_offset_record->cq_kernel_addr =
			(u64 *) cq_cb->kernel_address + cq_offset;
	requested_offset_record->cq_target_value = target_value;

	*pend = requested_offset_record;

	dev_dbg(ts_buff->hdev->dev, "Found available node in TS kernel CB(0x%llx)\n",
						(u64)(uintptr_t)requested_offset_record);
	return 0;
}

static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
				struct hl_cb_mgr *cb_mgr, u64 timeout_us,
				u64 cq_counters_handle,	u64 cq_counters_offset,
				struct hl_cb_mgr *cb_mgr, struct hl_ts_mgr *ts_mgr,
				u64 timeout_us, u64 cq_counters_handle,	u64 cq_counters_offset,
				u64 target_value, struct hl_user_interrupt *interrupt,
				u32 *status,
				u64 *timestamp)
				bool register_ts_record, u64 ts_handle, u64 ts_offset,
				u32 *status, u64 *timestamp)
{
	u32 cq_patched_handle, ts_patched_handle;
	struct hl_user_pending_interrupt *pend;
	struct hl_ts_buff *ts_buff;
	struct hl_cb *cq_cb;
	unsigned long timeout, flags;
	long completion_rc;
	struct hl_cb *cb;
	int rc = 0;
	u32 handle;

	timeout = hl_usecs64_to_jiffies(timeout_us);

	hl_ctx_get(hdev, ctx);

	cq_counters_handle >>= PAGE_SHIFT;
	handle = (u32) cq_counters_handle;
	cq_patched_handle = lower_32_bits(cq_counters_handle >> PAGE_SHIFT);
	cq_cb = hl_cb_get(hdev, cb_mgr, cq_patched_handle);
	if (!cq_cb) {
		rc = -EINVAL;
		goto put_ctx;
	}

	cb = hl_cb_get(hdev, cb_mgr, handle);
	if (!cb) {
		hl_ctx_put(ctx);
		return -EINVAL;
	if (register_ts_record) {
		dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
					interrupt->interrupt_id, ts_offset, cq_counters_offset);

		ts_patched_handle = lower_32_bits(ts_handle >> PAGE_SHIFT);
		ts_buff = hl_ts_get(hdev, ts_mgr, ts_patched_handle);
		if (!ts_buff) {
			rc = -EINVAL;
			goto put_cq_cb;
		}

		/* Find first available record */
		rc = ts_buff_get_kernel_ts_record(ts_buff, cq_cb, ts_offset,
						cq_counters_offset, target_value,
						&interrupt->wait_list_lock, &pend);
		if (rc)
			goto put_ts_buff;
	} else {
		pend = kzalloc(sizeof(*pend), GFP_KERNEL);
		if (!pend) {
		hl_cb_put(cb);
		hl_ctx_put(ctx);
		return -ENOMEM;
			rc = -ENOMEM;
			goto put_cq_cb;
		}

		hl_fence_init(&pend->fence, ULONG_MAX);

	pend->cq_kernel_addr = (u64 *) cb->kernel_address + cq_counters_offset;
		pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset;
		pend->cq_target_value = target_value;
	}

	spin_lock_irqsave(&interrupt->wait_list_lock, flags);

	/* We check for completion value as interrupt could have been received
	 * before we added the node to the wait list
	 */
	if (*pend->cq_kernel_addr >= target_value) {
		if (register_ts_record)
			pend->ts_reg_info.in_use = 0;
		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

		*status = HL_WAIT_CS_STATUS_COMPLETED;
		/* There was no interrupt, we assume the completion is now. */

		if (register_ts_record) {
			*pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
			goto put_ts_buff;
		} else {
			pend->fence.timestamp = ktime_get();
			goto set_timestamp;
		}

	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
	} else if (!timeout_us) {
		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
		*status = HL_WAIT_CS_STATUS_BUSY;
		pend->fence.timestamp = ktime_get();
		goto set_timestamp;
	}

	/* Add pending user interrupt to relevant list for the interrupt
	 * handler to monitor
	 * handler to monitor.
	 * Note that we cannot have sorted list by target value,
	 * in order to shorten the list pass loop, since
	 * same list could have nodes for different cq counter handle.
	 */
	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

	if (register_ts_record) {
		rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
		goto ts_registration_exit;
	}

	/* Wait for interrupt handler to signal completion */
	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
								timeout);
@@ -2932,23 +3056,41 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
				rc = -EIO;
				*status = HL_WAIT_CS_STATUS_ABORTED;
			} else {
				dev_err_ratelimited(hdev->dev, "Waiting for interrupt ID %d timedout\n",
						interrupt->interrupt_id);
				rc = -ETIMEDOUT;
			}
				/* The wait has timed-out. We don't know anything beyond that
				 * because the workload wasn't submitted through the driver.
				 * Therefore, from driver's perspective, the workload is still
				 * executing.
				 */
				rc = 0;
				*status = HL_WAIT_CS_STATUS_BUSY;
			}
		}
	}

	/*
	 * We keep removing the node from list here, and not at the irq handler
	 * for completion timeout case. and if it's a registration
	 * for ts record, the node will be deleted in the irq handler after
	 * we reach the target value.
	 */
	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_del(&pend->wait_list_node);
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

set_timestamp:
	*timestamp = ktime_to_ns(pend->fence.timestamp);

	kfree(pend);
	hl_cb_put(cb);
	hl_cb_put(cq_cb);
ts_registration_exit:
	hl_ctx_put(ctx);

	return rc;

put_ts_buff:
	hl_ts_put(ts_buff);
put_cq_cb:
	hl_cb_put(cq_cb);
put_ctx:
	hl_ctx_put(ctx);

	return rc;
@@ -3049,6 +3191,12 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
			interrupt->interrupt_id);
		rc = -EINTR;
	} else {
		/* The wait has timed-out. We don't know anything beyond that
		 * because the workload wasn't submitted through the driver.
		 * Therefore, from driver's perspective, the workload is still
		 * executing.
		 */
		rc = 0;
		*status = HL_WAIT_CS_STATUS_BUSY;
	}

@@ -3101,23 +3249,20 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
		interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];

	if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr,
		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr, &hpriv->ts_mem_mgr,
				args->in.interrupt_timeout_us, args->in.cq_counters_handle,
				args->in.cq_counters_offset,
				args->in.target, interrupt, &status,
				&timestamp);
				args->in.target, interrupt,
				!!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT),
				args->in.timestamp_handle, args->in.timestamp_offset,
				&status, &timestamp);
	else
		rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
				args->in.interrupt_timeout_us, args->in.addr,
				args->in.target, interrupt, &status,
				&timestamp);
	if (rc) {
		if (rc != -EINTR)
			dev_err_ratelimited(hdev->dev,
				"interrupt_wait_ioctl failed (%d)\n", rc);

	if (rc)
		return rc;
	}

	memset(args, 0, sizeof(*args));
	args->out.status = status;
Loading