Commit e3e3eaab authored by Greg Kroah-Hartman's avatar Greg Kroah-Hartman
Browse files

Merge tag 'misc-habanalabs-next-2021-02-08' of...

Merge tag 'misc-habanalabs-next-2021-02-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains the following changes for 5.12-rc1:

- Improve the communication protocol with the device's CPU-CP application.
  The change prevents random (rare) out-of-sync errors.

- Notify F/W to start sending events only after initialization of
  device is done. This fixes the issue where fatal events were received
  but ignored.

- Fix integer handling (static analysis warning).

- Always fetch HBM ECC errors from F/W (if available).

- Minor fix in GAUDI-specific initialization code.

* tag 'misc-habanalabs-next-2021-02-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux:
  habanalabs/gaudi: don't enable clock gating on DMA5
  habanalabs: return block size + block ID
  habanalabs: update security map after init CPU Qs
  habanalabs: enable F/W events after init done
  habanalabs/gaudi: use HBM_ECC_EN bit for ECC ERR
  habanalabs: support fetching first available user CQ
  habanalabs: improve communication protocol with cpucp
  habanalabs: fix integer handling issue
parents 47ddb856 da5dfbb9
Loading
Loading
Loading
Loading
+19 −4
Original line number Diff line number Diff line
@@ -1159,12 +1159,20 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
	atomic_set(&hdev->in_reset, 0);
	hdev->needs_reset = false;

	if (hard_reset)
	dev_notice(hdev->dev, "Successfully finished resetting the device\n");

	if (hard_reset) {
		hdev->hard_reset_cnt++;
	else
		hdev->soft_reset_cnt++;

	dev_warn(hdev->dev, "Successfully finished resetting the device\n");
		/* After reset is done, we are ready to receive events from
		 * the F/W. We can't enable them earlier because events that
		 * arrive before this point are ignored, and if such an event
		 * is fatal, we won't know about it and the device will be
		 * operational although it shouldn't be
		 */
		hdev->asic_funcs->enable_events_from_fw(hdev);
	} else {
		hdev->soft_reset_cnt++;
	}

	return 0;

@@ -1415,6 +1423,13 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)

	hdev->init_done = true;

	/* After initialization is done, we are ready to receive events from
	 * the F/W. We can't enable them earlier because events that arrive
	 * before this point are ignored, and if such an event is fatal, we
	 * won't know about it and the device will be operational although
	 * it shouldn't be
	 */
	hdev->asic_funcs->enable_events_from_fw(hdev);

	return 0;

release_ctx:
+12 −2
Original line number Diff line number Diff line
@@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
				u16 len, u32 timeout, u64 *result)
{
	struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
	struct cpucp_packet *pkt;
	dma_addr_t pkt_dma_addr;
	u32 tmp;
	u32 tmp, expected_ack_val;
	int rc = 0;

	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
@@ -115,14 +116,23 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
		goto out;
	}

	/* set fence to an invalid value */
	pkt->fence = UINT_MAX;

	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
	if (rc) {
		dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
		goto out;
	}

	if (hdev->asic_prop.fw_app_security_map &
			CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
		expected_ack_val = queue->pi;
	else
		expected_ack_val = CPUCP_PACKET_FENCE_VAL;

	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
				(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
				(tmp == expected_ack_val), 1000,
				timeout, true);

	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
+12 −3
Original line number Diff line number Diff line
@@ -411,6 +411,7 @@ struct hl_mmu_properties {
 * @first_available_user_mon: first monitor available for the user
 * @first_available_user_msix_interrupt: first available msix interrupt
 *                                       reserved for the user
 * @first_available_cq: first available CQ for the user.
 * @tpc_enabled_mask: which TPCs are enabled.
 * @completion_queues_count: number of completion queues.
 * @fw_security_disabled: true if security measures are disabled in firmware,
@@ -473,6 +474,7 @@ struct asic_fixed_properties {
	u16				first_available_user_sob[HL_MAX_DCORES];
	u16				first_available_user_mon[HL_MAX_DCORES];
	u16				first_available_user_msix_interrupt;
	u16				first_available_cq[HL_MAX_DCORES];
	u8				tpc_enabled_mask;
	u8				completion_queues_count;
	u8				fw_security_disabled;
@@ -860,7 +862,13 @@ enum div_select_defs {
 *                   showing it to users.
 * @ack_protection_bits_errors: ack and dump all security violations
 * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
 *                   also returns the size of the block if caller supplies
 *                   a valid pointer for it
 * @hw_block_mmap: mmap a HW block with a given id.
 * @enable_events_from_fw: send interrupt to firmware to notify it that the
 *                         driver is ready to receive asynchronous events. This
 *                         function should be called during the first init and
 *                         after every hard-reset of the device
 */
struct hl_asic_funcs {
	int (*early_init)(struct hl_device *hdev);
@@ -974,9 +982,10 @@ struct hl_asic_funcs {
	u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
	void (*ack_protection_bits_errors)(struct hl_device *hdev);
	int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr,
			u32 *block_id);
				u32 *block_size, u32 *block_id);
	int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
			u32 block_id, u32 block_size);
	void (*enable_events_from_fw)(struct hl_device *hdev);
};


+2 −1
Original line number Diff line number Diff line
@@ -397,7 +397,8 @@ static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
			prop->first_available_user_sob[args->dcore_id];
	sm_info.first_available_monitor =
			prop->first_available_user_mon[args->dcore_id];

	sm_info.first_available_cq =
			prop->first_available_cq[args->dcore_id];

	return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
			sizeof(sm_info))) ? -EFAULT : 0;
+11 −8
Original line number Diff line number Diff line
@@ -1289,12 +1289,13 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
	return rc;
}

static int map_block(struct hl_device *hdev, u64 address, u64 *handle)
static int map_block(struct hl_device *hdev, u64 address, u64 *handle,
			u32 *size)
{
	u32 block_id = 0;
	int rc;

	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, &block_id);
	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);

	*handle = block_id | HL_MMAP_TYPE_BLOCK;
	*handle <<= PAGE_SHIFT;
@@ -1371,7 +1372,7 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u64 block_handle, device_addr = 0;
	u32 handle = 0;
	u32 handle = 0, block_size;
	int rc;

	switch (args->in.op) {
@@ -1416,8 +1417,9 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)

	case HL_MEM_OP_MAP_BLOCK:
		rc = map_block(hdev, args->in.map_block.block_addr,
							&block_handle);
		args->out.handle = block_handle;
				&block_handle, &block_size);
		args->out.block_handle = block_handle;
		args->out.block_size = block_size;
		break;

	default:
@@ -1437,7 +1439,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u64 block_handle, device_addr = 0;
	u32 handle = 0;
	u32 handle = 0, block_size;
	int rc;

	if (!hl_device_operational(hdev, &status)) {
@@ -1524,8 +1526,9 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)

	case HL_MEM_OP_MAP_BLOCK:
		rc = map_block(hdev, args->in.map_block.block_addr,
							&block_handle);
		args->out.handle = block_handle;
				&block_handle, &block_size);
		args->out.block_handle = block_handle;
		args->out.block_size = block_size;
		break;

	default:
Loading