Commit ba2b062f authored by Greg Kroah-Hartman
Browse files

Merge tag 'misc-habanalabs-fixes-2021-05-08' of...

Merge tag 'misc-habanalabs-fixes-2021-05-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-linus

Oded writes:

This tag contains the following fixes for 5.13-rc2:

- Expose PLL information per ASIC. This also fixes some casting warnings.
- Skip reading further firmware errors in case PCI link is down.
- Security firmware error should be handled as error and not warning.
- Allow user to ignore firmware errors.
- Fix bug in timeout calculation when waiting for interrupt of CS.
- Fix bug of potential use-after-free.

* tag 'misc-habanalabs-fixes-2021-05-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux:
  habanalabs/gaudi: Fix a potential use after free in gaudi_memset_device_memory
  habanalabs: wait for interrupt wrong timeout calculation
  habanalabs: ignore f/w status error
  habanalabs: change error level of security not ready
  habanalabs: skip reading f/w errors on bad status
  habanalabs: expose ASIC specific PLL index
parents 6efb943b 115726c5
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2017,7 +2017,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
		if (completion_value >= target_value) {
			*status = CS_WAIT_STATUS_COMPLETED;
		} else {
			timeout -= jiffies_to_usecs(completion_rc);
			timeout = completion_rc;
			goto wait_again;
		}
	} else {
+31 −22
Original line number Diff line number Diff line
@@ -362,12 +362,9 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
	}

	if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
		dev_warn(hdev->dev,
		dev_err(hdev->dev,
			"Device boot warning - security not ready\n");
		/* This is a warning so we don't want it to disable the
		 * device
		 */
		err_val &= ~CPU_BOOT_ERR0_SECURITY_NOT_RDY;
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
@@ -403,7 +400,8 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
		err_exists = true;
	}

	if (err_exists)
	if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
				lower_32_bits(hdev->boot_error_status_mask)))
		return -EIO;

	return 0;
@@ -661,18 +659,13 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
	return rc;
}

int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
						enum pll_index *pll_index)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u8 pll_byte, pll_bit_off;
	bool dynamic_pll;

	if (input_pll_index >= PLL_MAX) {
		dev_err(hdev->dev, "PLL index %d is out of range\n",
							input_pll_index);
		return -EINVAL;
	}
	int fw_pll_idx;

	dynamic_pll = prop->fw_security_status_valid &&
		(prop->fw_app_security_map & CPU_BOOT_DEV_STS0_DYN_PLL_EN);
@@ -680,28 +673,39 @@ int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
	if (!dynamic_pll) {
		/*
		 * in case we are working with legacy FW (each asic has unique
		 * PLL numbering) extract the legacy numbering
		 * PLL numbering) use the driver based index as they are
		 * aligned with fw legacy numbering
		 */
		*pll_index = hdev->legacy_pll_map[input_pll_index];
		*pll_index = input_pll_index;
		return 0;
	}

	/* retrieve a FW compatible PLL index based on
	 * ASIC specific user request
	 */
	fw_pll_idx = hdev->asic_funcs->map_pll_idx_to_fw_idx(input_pll_index);
	if (fw_pll_idx < 0) {
		dev_err(hdev->dev, "Invalid PLL index (%u) error %d\n",
			input_pll_index, fw_pll_idx);
		return -EINVAL;
	}

	/* PLL map is a u8 array */
	pll_byte = prop->cpucp_info.pll_map[input_pll_index >> 3];
	pll_bit_off = input_pll_index & 0x7;
	pll_byte = prop->cpucp_info.pll_map[fw_pll_idx >> 3];
	pll_bit_off = fw_pll_idx & 0x7;

	if (!(pll_byte & BIT(pll_bit_off))) {
		dev_err(hdev->dev, "PLL index %d is not supported\n",
							input_pll_index);
			fw_pll_idx);
		return -EINVAL;
	}

	*pll_index = input_pll_index;
	*pll_index = fw_pll_idx;

	return 0;
}

int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, enum pll_index pll_index,
int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
		u16 *pll_freq_arr)
{
	struct cpucp_packet pkt;
@@ -844,6 +848,11 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
	if (rc) {
		dev_err(hdev->dev, "Failed to read preboot version\n");
		detect_cpu_boot_status(hdev, status);

		/* If we read all FF, then something is totally wrong, no point
		 * in reading specific errors
		 */
		if (status != -1)
			fw_read_errors(hdev, boot_err0_reg,
					cpu_security_boot_status_reg);
		return -EIO;
+15 −8
Original line number Diff line number Diff line
@@ -930,6 +930,9 @@ enum div_select_defs {
 *                         driver is ready to receive asynchronous events. This
 *                         function should be called during the first init and
 *                         after every hard-reset of the device
 * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
 * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
 *                         generic f/w compatible PLL Indexes
 */
struct hl_asic_funcs {
	int (*early_init)(struct hl_device *hdev);
@@ -1054,6 +1057,7 @@ struct hl_asic_funcs {
			u32 block_id, u32 block_size);
	void (*enable_events_from_fw)(struct hl_device *hdev);
	void (*get_msi_info)(u32 *table);
	int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
};


@@ -1950,8 +1954,6 @@ struct hl_mmu_funcs {
 * @aggregated_cs_counters: aggregated cs counters among all contexts
 * @mmu_priv: device-specific MMU data.
 * @mmu_func: device-related MMU functions.
 * @legacy_pll_map: map holding map between dynamic (common) PLL indexes and
 *                  static (asic specific) PLL indexes.
 * @dram_used_mem: current DRAM memory consumption.
 * @timeout_jiffies: device CS timeout value.
 * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -1960,6 +1962,12 @@ struct hl_mmu_funcs {
 * @clock_gating_mask: is clock gating enabled. bitmask that represents the
 *                     different engines. See debugfs-driver-habanalabs for
 *                     details.
 * @boot_error_status_mask: contains a mask of the device boot error status.
 *                          Each bit represents a different error, according to
 *                          the defines in hl_boot_if.h. If the bit is cleared,
 *                          the error will be ignored by the driver during
 *                          device initialization. Mainly used to debug and
 *                          work around firmware bugs
 * @in_reset: is device in reset flow.
 * @curr_pll_profile: current PLL profile.
 * @card_type: Various ASICs have several card types. This indicates the card
@@ -2071,12 +2079,11 @@ struct hl_device {
	struct hl_mmu_priv		mmu_priv;
	struct hl_mmu_funcs		mmu_func[MMU_NUM_PGT_LOCATIONS];

	enum pll_index			*legacy_pll_map;

	atomic64_t			dram_used_mem;
	u64				timeout_jiffies;
	u64				max_power;
	u64				clock_gating_mask;
	u64				boot_error_status_mask;
	atomic_t			in_reset;
	enum hl_pll_frequency		curr_pll_profile;
	enum cpucp_card_types		card_type;
@@ -2387,9 +2394,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
		struct hl_info_pci_counters *counters);
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
			u64 *total_energy);
int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
						enum pll_index *pll_index);
int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, enum pll_index pll_index,
int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
		u16 *pll_freq_arr);
int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
@@ -2411,9 +2418,9 @@ int hl_pci_set_outbound_region(struct hl_device *hdev,
int hl_pci_init(struct hl_device *hdev);
void hl_pci_fini(struct hl_device *hdev);

long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
								bool curr);
void hl_set_frequency(struct hl_device *hdev, enum pll_index pll_index,
void hl_set_frequency(struct hl_device *hdev, u32 pll_index,
								u64 freq);
int hl_get_temperature(struct hl_device *hdev,
		       int sensor_index, u32 attr, long *value);
+7 −0
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ static DEFINE_MUTEX(hl_devs_idr_lock);
static int timeout_locked = 30;
static int reset_on_lockup = 1;
static int memory_scrub = 1;
static ulong boot_error_status_mask = ULONG_MAX;

module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
@@ -43,6 +44,10 @@ module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
	"Scrub device memory in various states (0 = no, 1 = yes, default yes)");

module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");

#define PCI_VENDOR_ID_HABANALABS	0x1da3

#define PCI_IDS_GOYA			0x0001
@@ -319,6 +324,8 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
	hdev->major = hl_major;
	hdev->reset_on_lockup = reset_on_lockup;
	hdev->memory_scrub = memory_scrub;
	hdev->boot_error_status_mask = boot_error_status_mask;

	hdev->pldm = 0;

	set_driver_behavior_per_device(hdev);
+2 −2
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@

#include <linux/pci.h>

long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
								bool curr)
{
	struct cpucp_packet pkt;
@@ -44,7 +44,7 @@ long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
	return (long) result;
}

void hl_set_frequency(struct hl_device *hdev, enum pll_index pll_index,
void hl_set_frequency(struct hl_device *hdev, u32 pll_index,
								u64 freq)
{
	struct cpucp_packet pkt;
Loading