Commit eb135291 authored by Ofir Bitton, committed by Oded Gabbay
Browse files

habanalabs: refactor reset information variables



Unify variables related to device reset, which will help us to
add some new reset functionality in future patches.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 60bf3bfb
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -250,7 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
	 * Can't use generic function to check this because of special case
	 * where we create a CB as part of the reset process
	 */
	if ((hdev->disabled) || ((atomic_read(&hdev->in_reset)) &&
	if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) &&
					(ctx_id != HL_KERNEL_ASID_ID))) {
		dev_warn_ratelimited(hdev->dev,
			"Device is disabled or in reset. Can't create new CBs\n");
+2 −2
Original line number Diff line number Diff line
@@ -777,7 +777,7 @@ static void cs_timedout(struct work_struct *work)
		if (hdev->reset_on_lockup)
			hl_device_reset(hdev, HL_DRV_RESET_TDR);
		else
			hdev->needs_reset = true;
			hdev->reset_info.needs_reset = true;
	}
}

@@ -814,7 +814,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
	cs->timeout_jiffies = timeout;
	cs->skip_reset_on_timeout =
		hdev->skip_reset_on_timeout ||
		hdev->reset_info.skip_reset_on_timeout ||
		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
	cs->submission_time_jiffies = jiffies;
	INIT_LIST_HEAD(&cs->job_list);
+9 −9
Original line number Diff line number Diff line
@@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data)
	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
	struct hl_device *hdev = dev_entry->hdev;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev,
				"Can't check device idle during reset\n");
		return 0;
@@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
	ssize_t rc;
	u32 val;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
		return 0;
	}
@@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
	u32 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
		return 0;
	}
@@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
	ssize_t rc;
	u64 val;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
		return 0;
	}
@@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
	u64 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
		return 0;
	}
@@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
	ssize_t rc;
	u32 size;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n");
		return 0;
	}
@@ -1077,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
	u64 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev,
				"Can't change clock gating during reset\n");
		return 0;
@@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
	u32 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
	if (atomic_read(&hdev->reset_info.in_reset)) {
		dev_warn_ratelimited(hdev->dev,
				"Can't change stop on error during reset\n");
		return 0;
@@ -1497,7 +1497,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
	debugfs_create_x8("skip_reset_on_timeout",
				0644,
				dev_entry->root,
				&hdev->skip_reset_on_timeout);
				&hdev->reset_info.skip_reset_on_timeout);

	debugfs_create_file("state_dump",
				0600,
+39 −37
Original line number Diff line number Diff line
@@ -17,9 +17,9 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
{
	enum hl_device_status status;

	if (atomic_read(&hdev->in_reset))
	if (atomic_read(&hdev->reset_info.in_reset))
		status = HL_DEVICE_STATUS_IN_RESET;
	else if (hdev->needs_reset)
	else if (hdev->reset_info.needs_reset)
		status = HL_DEVICE_STATUS_NEEDS_RESET;
	else if (hdev->disabled)
		status = HL_DEVICE_STATUS_MALFUNCTION;
@@ -452,7 +452,7 @@ static int device_early_init(struct hl_device *hdev)
	INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
	mutex_init(&hdev->fpriv_list_lock);
	mutex_init(&hdev->fpriv_ctrl_list_lock);
	atomic_set(&hdev->in_reset, 0);
	atomic_set(&hdev->reset_info.in_reset, 0);
	mutex_init(&hdev->clk_throttling.lock);

	return 0;
@@ -544,8 +544,8 @@ static void hl_device_heartbeat(struct work_struct *work)
	 * status for at least one heartbeat. From this point driver restarts
	 * tracking future consecutive fatal errors.
	 */
	if (!(atomic_read(&hdev->in_reset)))
		hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
	if (!(atomic_read(&hdev->reset_info.in_reset)))
		hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;

	schedule_delayed_work(&hdev->work_heartbeat,
			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
@@ -639,12 +639,12 @@ int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool en
			goto out;
		}

		if (!hdev->hard_reset_pending)
		if (!hdev->reset_info.hard_reset_pending)
			hdev->asic_funcs->halt_coresight(hdev, ctx);

		hdev->in_debug = 0;

		if (!hdev->hard_reset_pending)
		if (!hdev->reset_info.hard_reset_pending)
			hdev->asic_funcs->set_clock_gating(hdev);

		goto out;
@@ -722,7 +722,7 @@ int hl_device_suspend(struct hl_device *hdev)
	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
	if (rc) {
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
@@ -777,7 +777,7 @@ int hl_device_resume(struct hl_device *hdev)


	hdev->disabled = false;
	atomic_set(&hdev->in_reset, 0);
	atomic_set(&hdev->reset_info.in_reset, 0);

	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
	if (rc) {
@@ -906,16 +906,16 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
	 * 'reset_cause' will continue holding its 1st recorded reason!
	 */
	if (flags & HL_DRV_RESET_HEARTBEAT) {
		hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
		cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
	} else if (flags & HL_DRV_RESET_TDR) {
		hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
		cur_reset_trigger = HL_DRV_RESET_TDR;
	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
		cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
	} else {
		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
	}

	/*
@@ -923,11 +923,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
	 * is set and if this reset is due to a fatal FW error
	 * device is set to an unstable state.
	 */
	if (hdev->prev_reset_trigger != cur_reset_trigger) {
		hdev->prev_reset_trigger = cur_reset_trigger;
		hdev->reset_trigger_repeated = 0;
	if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
		hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
		hdev->reset_info.reset_trigger_repeated = 0;
	} else {
		hdev->reset_trigger_repeated = 1;
		hdev->reset_info.reset_trigger_repeated = 1;
	}

	/* If reset is due to heartbeat, device CPU is no responsive in
@@ -987,7 +987,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);

	if (!hard_reset && !hdev->supports_soft_reset) {
	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}
@@ -1004,7 +1004,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
		goto do_reset;
	}

	if (!hard_reset && !hdev->allow_inference_soft_reset) {
	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}
@@ -1024,13 +1024,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
		if (rc)
			return 0;

		handle_reset_trigger(hdev, flags);

		hdev->is_in_soft_reset = !hard_reset;
		/* This still allows the completion of some KDMA ops */
		hdev->reset_info.is_in_soft_reset = !hard_reset;

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;
@@ -1047,7 +1048,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		hdev->hard_reset_pending = true;
		hdev->reset_info.hard_reset_pending = true;

		hdev->process_kill_trial_cnt = 0;

@@ -1128,10 +1129,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->hard_reset_pending = false;
		hdev->reset_info.hard_reset_pending = false;

		if (hdev->reset_trigger_repeated &&
				(hdev->prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR)) {
		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger ==
						HL_DRV_RESET_FW_FATAL_ERR)) {
			/* if there 2 back to back resets from FW,
			 * ensure driver puts the driver in a unusable state
			 */
@@ -1182,7 +1184,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
	 * is required for the initialization itself
	 */
	hdev->disabled = false;
	hdev->is_in_soft_reset = false;
	hdev->reset_info.is_in_soft_reset = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
@@ -1232,13 +1234,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
		}
	}

	atomic_set(&hdev->in_reset, 0);
	hdev->needs_reset = false;
	atomic_set(&hdev->reset_info.in_reset, 0);
	hdev->reset_info.needs_reset = false;

	dev_notice(hdev->dev, "Successfully finished resetting the device\n");

	if (hard_reset) {
		hdev->hard_reset_cnt++;
		hdev->reset_info.hard_reset_cnt++;

		/* After reset is done, we are ready to receive events from
		 * the F/W. We can't do it before because we will ignore events
@@ -1247,30 +1249,30 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
		 */
		hdev->asic_funcs->enable_events_from_fw(hdev);
	} else if (!reset_upon_device_release) {
		hdev->soft_reset_cnt++;
		hdev->reset_info.soft_reset_cnt++;
	}

	return 0;

out_err:
	hdev->disabled = true;
	hdev->is_in_soft_reset = false;
	hdev->reset_info.is_in_soft_reset = false;

	if (hard_reset) {
		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
		hdev->hard_reset_cnt++;
		hdev->reset_info.hard_reset_cnt++;
	} else if (reset_upon_device_release) {
		dev_err(hdev->dev, "Failed to reset device after user release\n");
		hard_reset = true;
		goto again;
	} else {
		dev_err(hdev->dev, "Failed to do soft-reset\n");
		hdev->soft_reset_cnt++;
		hdev->reset_info.soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	atomic_set(&hdev->in_reset, 0);
	atomic_set(&hdev->reset_info.in_reset, 0);

	return rc;
}
@@ -1604,10 +1606,10 @@ void hl_device_fini(struct hl_device *hdev)
	 */

	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
	while (rc) {
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			dev_crit(hdev->dev,
				"Failed to remove device because reset function did not finish\n");
@@ -1629,7 +1631,7 @@ void hl_device_fini(struct hl_device *hdev)

	take_release_locks(hdev);

	hdev->hard_reset_pending = true;
	hdev->reset_info.hard_reset_pending = true;

	hl_hwmon_fini(hdev);

+3 −3
Original line number Diff line number Diff line
@@ -2371,14 +2371,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
	if (rc)
		goto protocol_err;

	if (hdev->curr_reset_cause) {
	if (hdev->reset_info.curr_reset_cause) {
		rc = hl_fw_dynamic_send_msg(hdev, fw_loader,
				HL_COMMS_RESET_CAUSE_TYPE, &hdev->curr_reset_cause);
				HL_COMMS_RESET_CAUSE_TYPE, &hdev->reset_info.curr_reset_cause);
		if (rc)
			goto protocol_err;

		/* Clear current reset cause */
		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
	}

	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
Loading