Commit ae27e886 authored by Greg Kroah-Hartman's avatar Greg Kroah-Hartman
Browse files

Merge tag 'misc-habanalabs-next-2022-11-23' of...

Merge tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains habanalabs driver changes for v6.2:

- New feature of graceful hard-reset. Instead of immediately killing the
  user-process when a command submission times out, we wait a bit and give
  the user-process notification and let it try to close things gracefully,
  with the ability to retrieve debug information.

- Enhance the EventFD mechanism. Add new events such as access to illegal
  address (RAZWI), page fault, device unavailable. In addition, change the
  event workqueue to be handled in a single-threaded workqueue.

- Allow the control device to work during reset of the ASIC, to enable
  monitoring applications to continue getting the data.

- Add handling for Gaudi2 with PCI revision 2.

- Reduce severity of prints due to power/thermal events.

- Change how we use the h/w to perform memory scrubbing in Gaudi2.

- Multiple bug fixes, refactors and renames.

* tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (63 commits)
  habanalabs: fix VA range calculation
  habanalabs: fail driver load if EEPROM errors detected
  habanalabs: make print of engines idle mask more readable
  habanalabs: clear non-released encapsulated signals
  habanalabs: don't put context in hl_encaps_handle_do_release_sob()
  habanalabs: print context refcount value if hard reset fails
  habanalabs: add RMWREG32_SHIFTED to set a val within a mask
  habanalabs: fix rc when new CPUCP opcodes are not supported
  habanalabs/gaudi2: added memset for the cq_size register
  habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd()
  habanalabs: increase the size of busy engines mask
  habanalabs/gaudi2: change memory scrub mechanism
  habanalabs: extend process wait timeout in device fine
  habanalabs: check schedule_hard_reset correctly
  habanalabs: reset device if still in use when released
  habanalabs/gaudi2: return to reset upon SM SEI BRESP error
  habanalabs/gaudi2: don't enable entries in the MSIX_GW table
  habanalabs/gaudi2: remove redundant firmware version check
  habanalabs/gaudi: fix print for firmware-alive event
  habanalabs: fix print for out-of-sync and pkt-failure events
  ...
parents 449ef8fb 19a17a9f
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -91,6 +91,13 @@ Description: Enables the root user to set the device to specific state.
                Valid values are "disable", "enable", "suspend", "resume".
                User can read this property to see the valid values

What:           /sys/kernel/debug/habanalabs/hl<n>/device_release_watchdog_timeout
Date:           Oct 2022
KernelVersion:  6.2
Contact:        ttayar@habana.ai
Description:    The watchdog timeout value in seconds for a device release upon
                certain error cases, after which the device is reset.

What:           /sys/kernel/debug/habanalabs/hl<n>/dma_size
Date:           Apr 2021
KernelVersion:  5.13
+45 −17
Original line number Diff line number Diff line
@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
		 */
		if (hl_cs_cmpl->encaps_signals)
			kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
						hl_encaps_handle_do_release);
					hl_encaps_release_handle_and_put_ctx);
	}

	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
			&& cs->encaps_signals)
		kref_put(&cs->encaps_sig_hdl->refcount,
					hl_encaps_handle_do_release);
	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
		kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);

out:
	/* Must be called before hl_ctx_put because inside we use ctx to get
@@ -798,7 +796,7 @@ static void cs_do_release(struct kref *ref)
static void cs_timedout(struct work_struct *work)
{
	struct hl_device *hdev;
	u64 event_mask;
	u64 event_mask = 0x0;
	int rc;
	struct hl_cs *cs = container_of(work, struct hl_cs,
						 work_tdr.work);
@@ -830,11 +828,7 @@ static void cs_timedout(struct work_struct *work)
	if (rc) {
		hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
		hdev->captured_err_info.cs_timeout.seq = cs->sequence;

		event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
				HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;

		hl_notifier_event_send_all(hdev, event_mask);
		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
	}

	switch (cs->type) {
@@ -869,8 +863,12 @@ static void cs_timedout(struct work_struct *work)

	cs_put(cs);

	if (device_reset)
		hl_device_reset(hdev, HL_DRV_RESET_TDR);
	if (device_reset) {
		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
		hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
	} else if (event_mask) {
		hl_notifier_event_send_all(hdev, event_mask);
	}
}

static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
		hl_complete_job(hdev, job);
}

/*
 * release_reserved_encaps_signals() - release reserved encapsulated signals.
 * @hdev: pointer to habanalabs device structure
 *
 * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
 * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
 * For these signals need also to put the refcount of the H/W SOB which was taken at the
 * reservation.
 */
static void release_reserved_encaps_signals(struct hl_device *hdev)
{
	struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
	struct hl_cs_encaps_sig_handle *handle;
	struct hl_encaps_signals_mgr *mgr;
	u32 id;

	if (!ctx)
		return;

	mgr = &ctx->sig_mgr;

	idr_for_each_entry(&mgr->handles, handle, id)
		if (handle->cs_seq == ULLONG_MAX)
			kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);

	hl_ctx_put(ctx);
}

void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
	int i;
@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
	}

	force_complete_multi_cs(hdev);

	release_reserved_encaps_signals(hdev);
}

static void
@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
	 */
	handle->pre_sob_val = prop->next_sob_val - handle->count;

	handle->cs_seq = ULLONG_MAX;

	*signals_count = prop->next_sob_val;
	hdev->asic_funcs->hw_queues_unlock(hdev);

@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
	/* We finished with the CS in this function, so put the ref */
	cs_put(cs);
free_cs_chunk_array:
	if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
							is_wait_cs)
		kref_put(&encaps_sig_hdl->refcount,
				hl_encaps_handle_do_release);
	if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
		kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
	kfree(cs_chunk_array);
out:
	return rc;
+32 −22
Original line number Diff line number Diff line
@@ -9,38 +9,46 @@

#include <linux/slab.h>

void hl_encaps_handle_do_release(struct kref *ref)
static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
					bool put_ctx)
{
	struct hl_cs_encaps_sig_handle *handle =
		container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
	struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;

	if (put_hw_sob)
		hw_sob_put(handle->hw_sob);

	spin_lock(&mgr->lock);
	idr_remove(&mgr->handles, handle->id);
	spin_unlock(&mgr->lock);

	if (put_ctx)
		hl_ctx_put(handle->ctx);

	kfree(handle);
}

static void hl_encaps_handle_do_release_sob(struct kref *ref)
void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
{
	struct hl_cs_encaps_sig_handle *handle =
			container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
	struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;

	/* if we're here, then there was a signals reservation but cs with
	 * encaps signals wasn't submitted, so need to put refcount
	 * to hw_sob taken at the reservation.
	 */
	hw_sob_put(handle->hw_sob);
	encaps_handle_do_release(handle, false, true);
}

	spin_lock(&mgr->lock);
	idr_remove(&mgr->handles, handle->id);
	spin_unlock(&mgr->lock);
static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
{
	struct hl_cs_encaps_sig_handle *handle =
			container_of(ref, struct hl_cs_encaps_sig_handle, refcount);

	hl_ctx_put(handle->ctx);
	kfree(handle);
	encaps_handle_do_release(handle, true, false);
}

/*
 * hl_encaps_release_handle_and_put_sob_ctx() - kref release callback for an
 * encapsulated-signals handle.
 * @ref: the refcount member embedded in the handle being released
 *
 * Releases the handle and asks the common release helper to put both the H/W SOB
 * and the context the handle refers to.
 */
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
{
	struct hl_cs_encaps_sig_handle *sig_handle;

	/* Recover the handle that embeds this refcount */
	sig_handle = container_of(ref, struct hl_cs_encaps_sig_handle, refcount);

	/* put_hw_sob = true, put_ctx = true */
	encaps_handle_do_release(sig_handle, true, true);
}

static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
@@ -49,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
	idr_init(&mgr->handles);
}

static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
			struct hl_encaps_signals_mgr *mgr)
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
{
	struct hl_cs_encaps_sig_handle *handle;
	struct idr *idp;
@@ -58,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,

	idp = &mgr->handles;

	/* The IDR is expected to be empty at this stage, because any left signal should have been
	 * released as part of CS roll-back.
	 */
	if (!idr_is_empty(idp)) {
		dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
		dev_warn(hdev->dev,
			"device released while some encaps signals handles are still allocated\n");
		idr_for_each_entry(idp, handle, id)
			kref_put(&handle->refcount,
					hl_encaps_handle_do_release_sob);
			kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
	}

	idr_destroy(&mgr->handles);
+5 −0
Original line number Diff line number Diff line
@@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
				dev_entry,
				&hl_timeout_locked_fops);

	debugfs_create_u32("device_release_watchdog_timeout",
				0644,
				dev_entry->root,
				&hdev->device_release_watchdog_timeout_sec);

	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
		debugfs_create_file(hl_debugfs_list[i].name,
					0444,
+377 −75

File changed.

Preview size limit exceeded, changes collapsed.

Loading