Commit a00f1f57 authored by Oded Gabbay
Browse files

habanalabs: define soft-reset as inference op



Soft-reset is the procedure where we reset only the compute/DMA engines
of the device, without requiring the current user-space process to
release the device.

This type of reset can happen if a TDR event occurs (a workload gets
stuck) or upon a root request through sysfs.

This is only relevant for inference ASICs, as there is no real-world
use-case to do that in training, because training runs on multiple
devices.

In addition, we also do (in certain ASICs) a reset upon device release.
That reset uses the same code as the soft-reset.

Therefore, to better differentiate between the two resets, rename the
soft-reset support to "inference soft-reset", which makes the code more
self-explanatory.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent dd08335f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -954,7 +954,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
		goto do_reset;
	}

	if (!hard_reset && !hdev->allow_external_soft_reset) {
	if (!hard_reset && !hdev->allow_inference_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}
+7 −3
Original line number Diff line number Diff line
@@ -2440,8 +2440,12 @@ struct multi_cs_data {
 * @collective_mon_idx: helper index for collective initialization
 * @supports_coresight: is CoreSight supported.
 * @supports_soft_reset: is soft reset supported.
 * @allow_external_soft_reset: true if soft reset initiated by user or TDR is
 *                             allowed.
 * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
 *                              initiated by user or TDR. This is only true
 *                              in inference ASICs, as there is no real-world
 *                              use-case of doing soft-reset in training (due
 *                              to the fact that training runs on multiple
 *                              devices)
 * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
 * @needs_reset: true if reset_on_lockup is false and device should be reset
 *               due to lockup.
@@ -2572,7 +2576,7 @@ struct hl_device {
	u8				collective_mon_idx;
	u8				supports_coresight;
	u8				supports_soft_reset;
	u8				allow_external_soft_reset;
	u8				allow_inference_soft_reset;
	u8				supports_cb_mapping;
	u8				needs_reset;
	u8				process_kill_trial_cnt;
+3 −3
Original line number Diff line number Diff line
@@ -206,12 +206,12 @@ static ssize_t soft_reset_store(struct device *dev,
		goto out;
	}

	if (!hdev->allow_external_soft_reset) {
		dev_err(hdev->dev, "Device does not support soft-reset\n");
	if (!hdev->allow_inference_soft_reset) {
		dev_err(hdev->dev, "Device does not support inference soft-reset\n");
		goto out;
	}

	dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
	dev_warn(hdev->dev, "Inference Soft-Reset requested through sysfs\n");

	hl_device_reset(hdev, 0);

+1 −1
Original line number Diff line number Diff line
@@ -959,7 +959,7 @@ static int goya_sw_init(struct hl_device *hdev)
	spin_lock_init(&goya->hw_queues_lock);
	hdev->supports_coresight = true;
	hdev->supports_soft_reset = true;
	hdev->allow_external_soft_reset = true;
	hdev->allow_inference_soft_reset = true;
	hdev->supports_wait_for_multi_cs = false;

	hdev->asic_funcs->set_pci_memory_regions(hdev);