Commit a22d73b6 authored by James Smart's avatar James Smart Committed by Martin K. Petersen
Browse files

scsi: lpfc: Implement health checking when aborting I/O

Several errors have occurred where the adapter stops or fails but does not
raise the register values for the driver to detect failure. Thus driver is
unaware of the failure. The failure typically results in I/O timeouts, the
I/O timeout handler failing (after several seconds), and the error handler
escalating recovery policy and resulting in more errors. Eventually, the
driver is in a position where things have spiraled and it can't do recovery
because other recovery ops are still outstanding and it becomes unusable.

Resolve the situation by having the I/O timeout handler (actually a els,
SCSI I/O, NVMe ls, or NVMe I/O timeout), in addition to aborting the I/O,
perform a mailbox command and look for a response from the hardware.  If
the mailbox command fails, it will mark the adapter offline and then invoke
the adapter reset handler to clean up.

The new I/O timeout test will be limited to a test every 5s. If there are
multiple I/O timeouts concurrently, only the 1st I/O timeout will generate
the mailbox command. Further testing will only occur once a timeout occurs
after a 5s delay from the last mailbox command has expired.

Link: https://lore.kernel.org/r/20210104180240.46824-14-jsmart2021@gmail.com


Co-developed-by: default avatarDick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: default avatarDick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: default avatarJames Smart <jsmart2021@gmail.com>
Signed-off-by: default avatarMartin K. Petersen <martin.petersen@oracle.com>
parent 243156c0
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -780,6 +780,8 @@ struct lpfc_hba {
#define HBA_FLOGI_ISSUED	0x100000 /* FLOGI was issued */
#define HBA_DEFER_FLOGI		0x800000 /* Defer FLOGI till read_sparm cmpl */
#define HBA_NEEDS_CFG_PORT	0x2000000 /* SLI3 - needs a CONFIG_PORT mbox */
#define HBA_HBEAT_INP		0x4000000 /* mbox HBEAT is in progress */
#define HBA_HBEAT_TMO		0x8000000 /* HBEAT initiated after timeout */

	uint32_t fcp_ring_in_use; /* When polling test if intr-hndlr active*/
	struct lpfc_dmabuf slim2p;
@@ -1136,7 +1138,6 @@ struct lpfc_hba {
	unsigned long last_completion_time;
	unsigned long skipped_hb;
	struct timer_list hb_tmofunc;
	uint8_t hb_outstanding;
	struct timer_list rrq_tmr;
	enum hba_temp_state over_temp_state;
	/*
+2 −0
Original line number Diff line number Diff line
@@ -1788,6 +1788,8 @@ lpfc_board_mode_store(struct device *dev, struct device_attribute *attr,
	else if (strncmp(buf, "pci_bus_reset", sizeof("pci_bus_reset") - 1)
		 == 0)
		status = lpfc_reset_pci_bus(phba);
	else if (strncmp(buf, "heartbeat", sizeof("heartbeat") - 1) == 0)
		lpfc_issue_hb_tmo(phba);
	else if (strncmp(buf, "trunk", sizeof("trunk") - 1) == 0)
		status = lpfc_set_trunking(phba, (char *)buf + sizeof("trunk"));
	else
+2 −0
Original line number Diff line number Diff line
@@ -359,6 +359,8 @@ lpfc_sli_abort_taskmgmt(struct lpfc_vport *, struct lpfc_sli_ring *,

void lpfc_mbox_timeout(struct timer_list *t);
void lpfc_mbox_timeout_handler(struct lpfc_hba *);
int lpfc_issue_hb_mbox(struct lpfc_hba *phba);
void lpfc_issue_hb_tmo(struct lpfc_hba *phba);

struct lpfc_nodelist *lpfc_findnode_did(struct lpfc_vport *, uint32_t);
struct lpfc_nodelist *lpfc_findnode_wwpn(struct lpfc_vport *,
+9 −0
Original line number Diff line number Diff line
@@ -1428,6 +1428,9 @@ lpfc_els_abort_flogi(struct lpfc_hba *phba)
							   NULL);
		}
	}
	/* Make sure HBA is alive */
	lpfc_issue_hb_tmo(phba);

	spin_unlock_irq(&phba->hbalock);

	return 0;
@@ -8127,6 +8130,9 @@ lpfc_els_timeout_handler(struct lpfc_vport *vport)
		spin_unlock_irq(&phba->hbalock);
	}

	/* Make sure HBA is alive */
	lpfc_issue_hb_tmo(phba);

	if (!list_empty(&pring->txcmplq))
		if (!(phba->pport->load_flag & FC_UNLOADING))
			mod_timer(&vport->els_tmofunc,
@@ -8226,6 +8232,9 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport)
		lpfc_sli_issue_abort_iotag(phba, pring, piocb, NULL);
		spin_unlock_irqrestore(&phba->hbalock, iflags);
	}
	/* Make sure HBA is alive */
	lpfc_issue_hb_tmo(phba);

	if (!list_empty(&abort_list))
		lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
				 "3387 abort list for txq not empty\n");
+3 −0
Original line number Diff line number Diff line
@@ -5619,6 +5619,9 @@ lpfc_free_tx(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp)
	}
	spin_unlock_irq(&phba->hbalock);

	/* Make sure HBA is alive */
	lpfc_issue_hb_tmo(phba);

	/* Cancel all the IOCBs from the completions list */
	lpfc_sli_cancel_iocbs(phba, &completions, IOSTAT_LOCAL_REJECT,
			      IOERR_SLI_ABORTED);
Loading