Commit 838ac90d authored by Daniel Vetter

Merge tag 'drm-habanalabs-next-2023-04-10' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains additional habanalabs driver changes for v6.4:

- uAPI changes:
  - Add a definition of a new Gaudi2 server type. This is used by userspace
    to know the connectivity between the accelerators inside the server.

- New features and improvements:
  - Speed up the h/w queues test in Gaudi2 to reduce device initialization time.

- Firmware related fixes:
  - Fixes to the handshake protocol during f/w initialization.
  - Synchronize the f/w events interrupt during hard reset to avoid a warning message.
  - Improvements to extraction of the firmware version.

- Misc bug fixes and code cleanups. Notable fixes are:
  - Multiple fixes for interrupt handling in Gaudi2.
  - Unmap mapped memory in case TLB invalidation fails.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20230410124637.GA2441888@ogabbay-vm-u20.habana-labs.com
parents 4d877b1a 56499c46
+12 −3
@@ -45,20 +45,29 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 	}
 
 	mutex_lock(&hdev->mmu_lock);
+
 	rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr);
-		goto err_va_umap;
+		goto err_va_pool_free;
 	}
+
 	rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV);
+	if (rc)
+		goto err_mmu_unmap;
+
 	mutex_unlock(&hdev->mmu_lock);
 
 	cb->is_mmu_mapped = true;
-	return rc;
 
-err_va_umap:
+	return 0;
+
+err_mmu_unmap:
+	hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size);
+err_va_pool_free:
 	mutex_unlock(&hdev->mmu_lock);
 	gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size);
+
 	return rc;
 }


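Note: the hunk above extends the usual kernel goto-unwind pattern. The TLB-invalidation result is now checked, and on failure the freshly created mapping is torn down (new err_mmu_unmap label) before the VA range is returned to the pool; the mmu_lock is released on both the success and the error paths. A minimal standalone sketch of the same unwind ordering, using stand-in helpers rather than the driver's MMU API:

/* Illustrative sketch only: stand-in helpers, not the habanalabs API. */
#include <stdio.h>

static int map_range(void)        { return 0; }  /* pretend the mapping succeeds */
static int invalidate_cache(void) { return -1; } /* pretend the invalidation fails */
static void unmap_range(void)     { puts("unmap_range"); }
static void free_va_pool(void)    { puts("free_va_pool"); }

/*
 * Error labels sit in reverse order of what was set up, so every failure
 * point jumps to the label that undoes exactly the steps done so far.
 */
static int map_and_invalidate(void)
{
	int rc;

	rc = map_range();
	if (rc)
		goto err_va_pool_free;

	rc = invalidate_cache();
	if (rc)
		goto err_mmu_unmap;	/* new in the patch: undo the mapping too */

	return 0;

err_mmu_unmap:
	unmap_range();
err_va_pool_free:
	free_va_pool();
	return rc;
}

int main(void)
{
	printf("rc = %d\n", map_and_invalidate());
	return 0;
}
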
+19 −21
@@ -43,48 +43,46 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
 		intr_source[2], intr_source[3], intr_source[4], intr_source[5]);
 }
 
-static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
+static void dec_abnrm_intr_work(struct work_struct *work)
 {
+	struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work);
+	struct hl_device *hdev = dec->hdev;
+	u32 irq_status, event_mask = 0;
 	bool reset_required = false;
-	u32 irq_status, event_mask;
 
-	irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+	irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
 
-	dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id);
+	dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id);
 
 	dec_print_abnrm_intr_source(hdev, irq_status);
 
 	/* Clear the interrupt */
-	WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
+	WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
 
 	/* Flush the interrupt clear */
-	RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+	RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
 
 	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
 		reset_required = true;
-		event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
-	} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
-		event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
-	} else {
-		event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 	}
 
+	if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK)
+		event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+
+	if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK |
+				VCMD_IRQ_STATUS_BUSERR_MASK |
+				VCMD_IRQ_STATUS_ABORT_MASK))
+		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+
 	if (reset_required) {
 		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
 		hl_device_cond_reset(hdev, 0, event_mask);
-	} else {
+	} else if (event_mask) {
 		hl_notifier_event_send_all(hdev, event_mask);
 	}
 }
 
-static void dec_completion_abnrm(struct work_struct *work)
-{
-	struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work);
-	struct hl_device *hdev = dec->hdev;
-
-	dec_error_intr_work(hdev, dec->base_addr, dec->core_id);
-}
-
 void hl_dec_fini(struct hl_device *hdev)
 {
 	kfree(hdev->dec);
@@ -108,7 +106,7 @@ int hl_dec_init(struct hl_device *hdev)
 		dec = hdev->dec + j;
 
 		dec->hdev = hdev;
-		INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm);
+		INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work);
 		dec->core_id = j;
 		dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j);
 		if (!dec->base_addr) {
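
Note: the decoder rework drops the dec_completion_abnrm() wrapper in favor of a single work function that recovers its hl_dec instance via container_of() on the embedded work item, and accumulates event_mask bits instead of picking exactly one. A small userspace sketch of that container_of idiom (hypothetical struct and field names, no real kernel workqueue; the "work" is just invoked directly):

/* Illustrative sketch of the container_of idiom; plain userspace C, not kernel code. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
	void (*func)(struct work_struct *work);
};

/* Hypothetical per-decoder context, loosely modeled after struct hl_dec. */
struct dec_ctx {
	unsigned int core_id;
	unsigned int base_addr;
	struct work_struct abnrm_intr_work;
};

static void dec_abnrm_intr_work(struct work_struct *work)
{
	/* Recover the owning instance from the embedded work item. */
	struct dec_ctx *dec = container_of(work, struct dec_ctx, abnrm_intr_work);

	printf("abnormal interrupt work: core %u, base 0x%x\n",
	       dec->core_id, dec->base_addr);
}

int main(void)
{
	struct dec_ctx dec = { .core_id = 3, .base_addr = 0x1000 };

	dec.abnrm_intr_work.func = dec_abnrm_intr_work;	/* INIT_WORK analogue */
	dec.abnrm_intr_work.func(&dec.abnrm_intr_work);	/* queue_work analogue */
	return 0;
}
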
+29 −25
@@ -1271,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev)
 	return 0;
 
 disable_device:
-	pci_clear_master(hdev->pdev);
 	pci_disable_device(hdev->pdev);
 
 	return rc;
@@ -1381,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
 	mutex_unlock(fd_lock);
 }
 
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+	/* If reset is due to heartbeat, device CPU is no responsive in
+	 * which case no point sending PCI disable message to it.
+	 */
+	if ((flags & HL_DRV_RESET_HARD) &&
+			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+		/* Disable PCI access from device F/W so he won't send
+		 * us additional interrupts. We disable MSI/MSI-X at
+		 * the halt_engines function and we can't have the F/W
+		 * sending us interrupts after that. We need to disable
+		 * the access here because if the device is marked
+		 * disable, the message won't be send. Also, in case
+		 * of heartbeat, the device CPU is marked as disable
+		 * so this message won't be sent
+		 */
+		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+			dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+			return;
+		}
+
+		/* verify that last EQs are handled before disabled is set */
+		if (hdev->cpu_queues_enable)
+			synchronize_irq(pci_irq_vector(hdev->pdev,
+					hdev->asic_prop.eq_interrupt_id));
+	}
+}
+
 static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 {
 	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
@@ -1419,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	} else {
 		hdev->reset_info.reset_trigger_repeated = 1;
 	}
-
-	/* If reset is due to heartbeat, device CPU is no responsive in
-	 * which case no point sending PCI disable message to it.
-	 *
-	 * If F/W is performing the reset, no need to send it a message to disable
-	 * PCI access
-	 */
-	if ((flags & HL_DRV_RESET_HARD) &&
-			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
-		/* Disable PCI access from device F/W so he won't send
-		 * us additional interrupts. We disable MSI/MSI-X at
-		 * the halt_engines function and we can't have the F/W
-		 * sending us interrupts after that. We need to disable
-		 * the access here because if the device is marked
-		 * disable, the message won't be send. Also, in case
-		 * of heartbeat, the device CPU is marked as disable
-		 * so this message won't be sent
-		 */
-		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
-			dev_warn(hdev->dev,
-				"Failed to disable FW's PCI access\n");
-	}
 }
 
 /*
@@ -1561,6 +1566,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 escalate_reset_flow:
 		handle_reset_trigger(hdev, flags);
+		send_disable_pci_access(hdev, flags);
 
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
@@ -1823,9 +1829,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
 			flags = hdev->reset_info.hard_reset_schedule_flags;
 			hdev->reset_info.hard_reset_schedule_flags = 0;
-			hdev->disabled = true;
 			hard_reset = true;
-			handle_reset_trigger(hdev, flags);
 			goto escalate_reset_flow;
 		}
 	}
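
Note: factoring the PCI-disable logic out of handle_reset_trigger() into send_disable_pci_access() lets the hard-reset path order things as: tell the firmware to stop generating interrupts, drain any event-queue interrupt still in flight with synchronize_irq(), and only then mark the device disabled (so the last events are not dropped with a warning). A schematic userspace sketch of that ordering (stand-in functions, not the driver's API):

/*
 * Illustrative ordering sketch only; the names are stand-ins for
 * hl_fw_send_pci_access_msg(), synchronize_irq() on the EQ vector,
 * and the hdev->disabled flag set later in the reset flow.
 */
#include <stdbool.h>
#include <stdio.h>

static bool device_disabled;

/* Ask the producer (firmware) to stop sending interrupts. */
static int tell_fw_to_stop_interrupts(void)
{
	puts("FW: PCI access disabled, no new interrupts will be sent");
	return 0;
}

/* Wait until any interrupt handler already running has finished. */
static void drain_inflight_eq_interrupt(void)
{
	puts("waiting for the last event-queue interrupt to be handled");
}

static void prepare_hard_reset(void)
{
	if (tell_fw_to_stop_interrupts())
		return;			/* firmware unreachable, nothing to drain */

	drain_inflight_eq_interrupt();
	device_disabled = true;		/* safe now: no further events can arrive */
}

int main(void)
{
	prepare_hard_reset();
	printf("device_disabled = %d\n", device_disabled);
	return 0;
}
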
+7 −10
@@ -71,7 +71,7 @@ static char *extract_fw_ver_from_str(const char *fw_str)
 	return NULL;
 }
 
-static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
+static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
 {
 	char major[8], minor[8], *first_dot, *second_dot;
 	int rc;
@@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
 
 	if (rc) {
 		dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
-		goto out;
+		return rc;
 	}
 
 	/* skip the first dot */
@@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
 
 	if (rc)
 		dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
-
-out:
-	kfree(preboot_ver);
 	return rc;
 }
 
@@ -1263,7 +1260,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
 				COMMS_RST_DEV, 0, false,
 				hdev->fw_loader.cpu_timeout);
 		if (rc)
-			dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n");
+			dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n");
 	} else {
 		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV);
 	}
@@ -1281,10 +1278,10 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
 	/* Stop device CPU to make sure nothing bad happens */
 	if (hdev->asic_prop.dynamic_fw_load) {
 		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
-				COMMS_GOTO_WFE, 0, true,
+				COMMS_GOTO_WFE, 0, false,
 				hdev->fw_loader.cpu_timeout);
 		if (rc)
-			dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
+			dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
 	} else {
 		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
 		msleep(static_loader->cpu_reset_wait_msec);
@@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 
 			dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
 
-			/* This function takes care of freeing preboot_ver */
-			rc = extract_fw_sub_versions(hdev, preboot_ver);
+			rc = hl_get_preboot_major_minor(hdev, preboot_ver);
+			kfree(preboot_ver);
 			if (rc)
 				return rc;
 		}
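
Note: the renamed hl_get_preboot_major_minor() no longer frees preboot_ver; the caller that allocated the string now frees it next to the call site, which keeps ownership in one place. A rough standalone sketch of parsing a "major.minor" pair from a caller-owned string (illustrative names, plain libc instead of the kernel's kstrto*() helpers):

/* Illustrative sketch of the ownership split: the helper only reads the string,
 * the caller both allocates and frees it. Not the driver's parsing code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int get_major_minor(const char *ver, unsigned long *major, unsigned long *minor)
{
	char *dot;

	*major = strtoul(ver, &dot, 10);
	if (*dot != '.')
		return -1;		/* no minor component found */

	*minor = strtoul(dot + 1, NULL, 10);
	return 0;
}

int main(void)
{
	char *preboot_ver = strdup("preboot-42.7");	/* caller owns the buffer */
	unsigned long major, minor;
	int rc;

	/* skip the non-numeric prefix before parsing the version pair */
	rc = get_major_minor(strpbrk(preboot_ver, "0123456789"), &major, &minor);
	if (!rc)
		printf("major %lu minor %lu\n", major, minor);

	free(preboot_ver);		/* freed by the caller, not by the helper */
	return rc;
}
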
+7 −7
@@ -662,7 +662,7 @@ struct hl_hints_range {
  * @user_interrupt_count: number of user interrupts.
  * @user_dec_intr_count: number of decoder interrupts exposed to user.
  * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
- * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
+ * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset.
  * @cache_line_size: device cache line size.
  * @server_type: Server type that the ASIC is currently installed in.
  *               The value is according to enum hl_server_type in uapi file.
@@ -793,7 +793,7 @@ struct asic_fixed_properties {
 	u16				user_interrupt_count;
 	u16				user_dec_intr_count;
 	u16				tpc_interrupt_id;
-	u16				unexpected_user_error_interrupt_id;
+	u16				eq_interrupt_id;
 	u16				cache_line_size;
 	u16				server_type;
 	u8				completion_queues_count;
@@ -1211,13 +1211,13 @@ struct hl_eq {
 /**
  * struct hl_dec - describes a decoder sw instance.
  * @hdev: pointer to the device structure.
- * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt
+ * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt.
  * @core_id: ID of the decoder.
  * @base_addr: base address of the decoder.
  */
 struct hl_dec {
 	struct hl_device	*hdev;
-	struct work_struct		completion_abnrm_work;
+	struct work_struct	abnrm_intr_work;
 	u32			core_id;
 	u32			base_addr;
 };