Commit 25b450c0 authored by Shannon Nelson, committed by David S. Miller
Browse files

pds_core: add devlink health facilities



Add devlink health reporting on top of our fw watchdog.

Example:
  # devlink health show pci/0000:2b:00.0 reporter fw
  pci/0000:2b:00.0:
    reporter fw
      state healthy error 0 recover 0
  # devlink health diagnose pci/0000:2b:00.0 reporter fw
   Status: healthy State: 1 Generation: 0 Recoveries: 0

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent c2dbb090
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -26,6 +26,18 @@ messages such as these::
  pds_core 0000:b6:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
  pds_core 0000:b6:00.0: FW: 1.60.0-73

Health Reporters
================

The driver supports a devlink health reporter for FW status::

  # devlink health show pci/0000:2b:00.0 reporter fw
  pci/0000:2b:00.0:
    reporter fw
      state healthy error 0 recover 0
  # devlink health diagnose pci/0000:2b:00.0 reporter fw
   Status: healthy State: 1 Generation: 0 Recoveries: 0

Support
=======

+1 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@
obj-$(CONFIG_PDS_CORE) := pds_core.o

pds_core-y := main.o \
	      devlink.o \
	      dev.o \
	      core.o

+7 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@

int pdsc_setup(struct pdsc *pdsc, bool init)
{
	int err = 0;
	int err;

	if (init)
		err = pdsc_dev_init(pdsc);
@@ -42,6 +42,8 @@ static void pdsc_fw_down(struct pdsc *pdsc)
		return;
	}

	devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc);

	pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY);
}

@@ -58,6 +60,10 @@ static void pdsc_fw_up(struct pdsc *pdsc)
	if (err)
		goto err_out;

	pdsc->fw_recoveries++;
	devlink_health_reporter_state_update(pdsc->fw_reporter,
					     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);

	return;

err_out:
+6 −0
Original line number Diff line number Diff line
@@ -68,6 +68,8 @@ struct pdsc {
	struct timer_list wdtimer;
	unsigned int wdtimer_period;
	struct work_struct health_work;
	struct devlink_health_reporter *fw_reporter;
	u32 fw_recoveries;

	struct pdsc_devinfo dev_info;
	struct pds_core_dev_identity dev_ident;
@@ -88,6 +90,10 @@ struct pdsc {
	u64 __iomem *kern_dbpage;
};

/* Diagnose callback for the "fw" devlink health reporter.
 *
 * Fills @fmsg with four key/value pairs: "Status", "State", "Generation"
 * and "Recoveries".  @reporter's private data is expected to be the
 * struct pdsc for the device; @extack is not used by the implementation.
 *
 * Returns the first non-zero result from a devlink_fmsg_*_pair_put() call,
 * or the result of the last put if none of the earlier ones failed.
 */
int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
			      struct devlink_fmsg *fmsg,
			      struct netlink_ext_ack *extack);

void pdsc_debugfs_create(void);
void pdsc_debugfs_destroy(void);
void pdsc_debugfs_add_dev(struct pdsc *pdsc);
+40 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc */

#include "core.h"

/* Diagnose callback for the "fw" devlink health reporter.
 *
 * Writes four key/value pairs into @fmsg, in this order:
 *   "Status"     - "dead" when PDSC_S_FW_DEAD is set in pdsc->state,
 *                  "unhealthy" when pdsc_is_fw_good() returns false,
 *                  "healthy" otherwise
 *   "State"      - pdsc->fw_status with the PDS_CORE_FW_STS_F_GENERATION
 *                  bits masked off
 *   "Generation" - pdsc->fw_generation shifted right by 4
 *   "Recoveries" - pdsc->fw_recoveries
 *
 * @extack is unused.
 *
 * Returns the first non-zero result from a devlink_fmsg_*_pair_put() call,
 * or the result of the final put if none of the earlier ones failed.
 */
int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
			      struct devlink_fmsg *fmsg,
			      struct netlink_ext_ack *extack)
{
	struct pdsc *pdsc = devlink_health_reporter_priv(reporter);
	const char *status;
	int err;

	/* config_lock covers both the state checks and the "Status" write */
	mutex_lock(&pdsc->config_lock);

	if (test_bit(PDSC_S_FW_DEAD, &pdsc->state))
		status = "dead";
	else if (!pdsc_is_fw_good(pdsc))
		status = "unhealthy";
	else
		status = "healthy";

	err = devlink_fmsg_string_pair_put(fmsg, "Status", status);

	mutex_unlock(&pdsc->config_lock);

	/* The remaining pairs are emitted only while no put has failed */
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "State",
						pdsc->fw_status &
							~PDS_CORE_FW_STS_F_GENERATION);
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "Generation",
						pdsc->fw_generation >> 4);
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "Recoveries",
						pdsc->fw_recoveries);

	return err;
}
Loading