Commit bafed3f2 authored by Kalesh AP, committed by David S. Miller
Browse files

bnxt_en: implement hw health reporter



This reporter reports non-fatal NVM errors.
When we receive these NVM error events, we report them
through this new hw health reporter.

Reviewed-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f16a9169
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -2061,6 +2061,22 @@ static void bnxt_event_error_report(struct bnxt *bp, u32 data1, u32 data2)
	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD:
		netdev_warn(bp->dev, "One or more MMIO doorbells dropped by the device!\n");
		break;
	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM: {
		struct bnxt_hw_health *hw_health = &bp->hw_health;

		hw_health->nvm_err_address = EVENT_DATA2_NVM_ERR_ADDR(data2);
		if (EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)) {
			hw_health->synd = BNXT_HW_STATUS_NVM_WRITE_ERR;
			hw_health->nvm_write_errors++;
		} else if (EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)) {
			hw_health->synd = BNXT_HW_STATUS_NVM_ERASE_ERR;
			hw_health->nvm_erase_errors++;
		} else {
			hw_health->synd = BNXT_HW_STATUS_NVM_UNKNOWN_ERR;
		}
		set_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event);
		break;
	}
	default:
		netdev_err(bp->dev, "FW reported unknown error type %u\n",
			   err_type);
@@ -11887,6 +11903,9 @@ static void bnxt_sp_task(struct work_struct *work)
	if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event))
		bnxt_fw_echo_reply(bp);

	if (test_and_clear_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event))
		bnxt_devlink_health_hw_report(bp);

	/* These functions below will clear BNXT_STATE_IN_SP_TASK.  They
	 * must be the last functions to be called before exiting.
	 */
+33 −0
Original line number Diff line number Diff line
@@ -516,6 +516,21 @@ struct rx_tpa_end_cmp_ext {
	  ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK) >>\
	 ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT)

/* Extract the NVM error address field from the data2 word of an NVM
 * error-report async event: mask with ..._ERR_ADDR_MASK, then shift
 * right by ..._ERR_ADDR_SFT.
 */
#define EVENT_DATA2_NVM_ERR_ADDR(data2)					\
	(((data2) &							\
	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK) >>\
	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT)

/* Evaluates to non-zero when the NVM error type field of data1 (selected
 * with ..._NVM_ERR_TYPE_MASK) equals the WRITE error type.
 */
#define EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)				\
	(((data1) &							\
	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE)

/* Evaluates to non-zero when the NVM error type field of data1 (selected
 * with ..._NVM_ERR_TYPE_MASK) equals the ERASE error type.
 */
#define EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)				\
	(((data1) &							\
	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE)

struct nqe_cn {
	__le16	type;
	#define NQ_CN_TYPE_MASK           0x3fUL
@@ -1528,6 +1543,21 @@ struct bnxt_ctx_mem_info {
	struct bnxt_mem_init	mem_init[BNXT_CTX_MEM_INIT_MAX];
};

/* Syndrome codes kept in bnxt_hw_health.synd.  Values are sequential,
 * starting at 0 for the healthy state.
 */
enum bnxt_hw_err {
	BNXT_HW_STATUS_HEALTHY = 0,	/* no error recorded */
	BNXT_HW_STATUS_NVM_WRITE_ERR,	/* NVM write error */
	BNXT_HW_STATUS_NVM_ERASE_ERR,	/* NVM erase error */
	BNXT_HW_STATUS_NVM_UNKNOWN_ERR,	/* NVM error of any other type */
};

/* Hardware health state used by the "hw" devlink health reporter. */
struct bnxt_hw_health {
	/* Address field taken from the most recent NVM error event */
	u32 nvm_err_address;
	/* Number of NVM write error events seen */
	u32 nvm_write_errors;
	/* Number of NVM erase error events seen */
	u32 nvm_erase_errors;
	/* Current syndrome, one of enum bnxt_hw_err */
	u8 synd;
	/* devlink reporter handle; NULL while no reporter exists */
	struct devlink_health_reporter *hw_reporter;
};

enum bnxt_health_severity {
	SEVERITY_NORMAL = 0,
	SEVERITY_WARNING,
@@ -2045,6 +2075,7 @@ struct bnxt {
#define BNXT_FW_EXCEPTION_SP_EVENT	19
#define BNXT_LINK_CFG_CHANGE_SP_EVENT	21
#define BNXT_FW_ECHO_REQUEST_SP_EVENT	23
#define BNXT_FW_NVM_ERR_SP_EVENT	25

	struct delayed_work	fw_reset_task;
	int			fw_reset_state;
@@ -2145,6 +2176,8 @@ struct bnxt {
	struct dentry		*debugfs_pdev;
	struct device		*hwmon_dev;
	enum board_idx		board_idx;

	struct bnxt_hw_health	hw_health;
};

#define BNXT_NUM_RX_RING_STATS			8
+73 −0
Original line number Diff line number Diff line
@@ -241,6 +241,69 @@ static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = {
	.recover = bnxt_fw_recover,
};

/* Recover callback of the "hw" devlink health reporter.
 *
 * Resets the stored syndrome to BNXT_HW_STATUS_HEALTHY; the error counters
 * and the last error address are left untouched.  Always returns 0.
 */
static int bnxt_hw_recover(struct devlink_health_reporter *reporter,
			   void *priv_ctx,
			   struct netlink_ext_ack *extack)
{
	struct bnxt *bp = devlink_health_reporter_priv(reporter);

	bp->hw_health.synd = BNXT_HW_STATUS_HEALTHY;

	return 0;
}

/* Map a hardware syndrome code (enum bnxt_hw_err) to a human-readable
 * description.  Codes outside the enum yield "unknown hw error".
 */
static const char *hw_err_str(u8 synd)
{
	/* Fallback used for any code not matched below */
	const char *desc = "unknown hw error";

	switch (synd) {
	case BNXT_HW_STATUS_HEALTHY:
		desc = "healthy";
		break;
	case BNXT_HW_STATUS_NVM_WRITE_ERR:
		desc = "nvm write error";
		break;
	case BNXT_HW_STATUS_NVM_ERASE_ERR:
		desc = "nvm erase error";
		break;
	case BNXT_HW_STATUS_NVM_UNKNOWN_ERR:
		desc = "unrecognized nvm error";
		break;
	default:
		break;
	}

	return desc;
}

static int bnxt_hw_diagnose(struct devlink_health_reporter *reporter,
			    struct devlink_fmsg *fmsg,
			    struct netlink_ext_ack *extack)
{
	struct bnxt *bp = devlink_health_reporter_priv(reporter);
	struct bnxt_hw_health *h = &bp->hw_health;
	int rc;

	rc = devlink_fmsg_string_pair_put(fmsg, "Status", hw_err_str(h->synd));
	if (rc)
		return rc;
	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_write_errors", h->nvm_write_errors);
	if (rc)
		return rc;
	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_erase_errors", h->nvm_erase_errors);
	if (rc)
		return rc;
	return 0;
}

void bnxt_devlink_health_hw_report(struct bnxt *bp)
{
	struct bnxt_hw_health *hw_health = &bp->hw_health;

	netdev_warn(bp->dev, "%s reported at address 0x%x\n", hw_err_str(hw_health->synd),
		    hw_health->nvm_err_address);

	devlink_health_report(hw_health->hw_reporter, hw_err_str(hw_health->synd), NULL);
}

/* Callbacks of the devlink health reporter named "hw": diagnose dumps the
 * current status and NVM error counters, recover resets the status.
 */
static const struct devlink_health_reporter_ops bnxt_dl_hw_reporter_ops = {
	.name = "hw",
	.diagnose = bnxt_hw_diagnose,
	.recover = bnxt_hw_recover,
};

static struct devlink_health_reporter *
__bnxt_dl_reporter_create(struct bnxt *bp,
			  const struct devlink_health_reporter_ops *ops)
@@ -260,6 +323,10 @@ __bnxt_dl_reporter_create(struct bnxt *bp,
void bnxt_dl_fw_reporters_create(struct bnxt *bp)
{
	struct bnxt_fw_health *fw_health = bp->fw_health;
	struct bnxt_hw_health *hw_health = &bp->hw_health;

	if (!hw_health->hw_reporter)
		hw_health->hw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_hw_reporter_ops);

	if (fw_health && !fw_health->fw_reporter)
		fw_health->fw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_fw_reporter_ops);
@@ -268,6 +335,12 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp)
void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
{
	struct bnxt_fw_health *fw_health = bp->fw_health;
	struct bnxt_hw_health *hw_health = &bp->hw_health;

	if (hw_health->hw_reporter) {
		devlink_health_reporter_destroy(hw_health->hw_reporter);
		hw_health->hw_reporter = NULL;
	}

	if (fw_health && fw_health->fw_reporter) {
		devlink_health_reporter_destroy(fw_health->fw_reporter);
+1 −0
Original line number Diff line number Diff line
@@ -74,6 +74,7 @@ enum bnxt_dl_version_type {
void bnxt_devlink_health_fw_report(struct bnxt *bp);
void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy);
void bnxt_dl_health_fw_recovery_done(struct bnxt *bp);
void bnxt_devlink_health_hw_report(struct bnxt *bp);
void bnxt_dl_fw_reporters_create(struct bnxt *bp);
void bnxt_dl_fw_reporters_destroy(struct bnxt *bp);
int bnxt_dl_register(struct bnxt *bp);