Commit 089ea22e authored by Justin Tee, committed by Martin K. Petersen
Browse files

scsi: lpfc: Abort outstanding ELS cmds when mailbox timeout error is detected



A mailbox timeout error usually indicates that something has gone wrong, and a
follow-up reset of the HBA is the typical recovery mechanism.  Introduce an
MBX_TMO_ERR flag to detect such cases and have lpfc_els_flush_cmd abort ELS
commands when the MBX_TMO_ERR flag is set.  This ensures that none of
the registered SGL resources meant for ELS traffic are leaked after an
HBA reset.

Signed-off-by: Justin Tee <justin.tee@broadcom.com>
Link: https://lore.kernel.org/r/20230712180522.112722-9-justintee8345@gmail.com


Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent 9388da30
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -872,6 +872,7 @@ enum lpfc_irq_chann_mode {
/*
 * Flag identifiers kept in phba->bit_flags.
 *
 * MBX_TMO_ERR is used as a bit number with set_bit()/test_bit()/clear_bit()
 * on &phba->bit_flags; the other members are presumably used the same way
 * (their users are not visible here -- confirm before relying on it).
 */
enum lpfc_hba_bit_flags {
	FABRIC_COMANDS_BLOCKED,
	HBA_PCI_ERR,
	/*
	 * Mailbox timeout detected.  Set by lpfc_mbox_timeout_handler(),
	 * tested by lpfc_els_flush_cmd() and lpfc_reset_hba(), and cleared
	 * by lpfc_reset_hba() after lpfc_offline() has run.
	 */
	MBX_TMO_ERR,
};

struct lpfc_hba {
+18 −7
Original line number Diff line number Diff line
@@ -9603,11 +9603,13 @@ void
lpfc_els_flush_cmd(struct lpfc_vport *vport)
{
	LIST_HEAD(abort_list);
	LIST_HEAD(cancel_list);
	struct lpfc_hba  *phba = vport->phba;
	struct lpfc_sli_ring *pring;
	struct lpfc_iocbq *tmp_iocb, *piocb;
	u32 ulp_command;
	unsigned long iflags = 0;
	bool mbx_tmo_err;

	lpfc_fabric_abort_vport(vport);

@@ -9629,15 +9631,16 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport)
	if (phba->sli_rev == LPFC_SLI_REV4)
		spin_lock(&pring->ring_lock);

	mbx_tmo_err = test_bit(MBX_TMO_ERR, &phba->bit_flags);
	/* First we need to issue aborts to outstanding cmds on txcmpl */
	list_for_each_entry_safe(piocb, tmp_iocb, &pring->txcmplq, list) {
		if (piocb->cmd_flag & LPFC_IO_LIBDFC)
		if (piocb->cmd_flag & LPFC_IO_LIBDFC && !mbx_tmo_err)
			continue;

		if (piocb->vport != vport)
			continue;

		if (piocb->cmd_flag & LPFC_DRIVER_ABORTED)
		if (piocb->cmd_flag & LPFC_DRIVER_ABORTED && !mbx_tmo_err)
			continue;

		/* On the ELS ring we can have ELS_REQUESTs or
@@ -9656,8 +9659,8 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport)
			 */
			if (phba->link_state == LPFC_LINK_DOWN)
				piocb->cmd_cmpl = lpfc_cmpl_els_link_down;
		}
		if (ulp_command == CMD_GEN_REQUEST64_CR)
		} else if (ulp_command == CMD_GEN_REQUEST64_CR ||
			   mbx_tmo_err)
			list_add_tail(&piocb->dlist, &abort_list);
	}

@@ -9669,9 +9672,17 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport)
	list_for_each_entry_safe(piocb, tmp_iocb, &abort_list, dlist) {
		spin_lock_irqsave(&phba->hbalock, iflags);
		list_del_init(&piocb->dlist);
		if (mbx_tmo_err)
			list_move_tail(&piocb->list, &cancel_list);
		else
			lpfc_sli_issue_abort_iotag(phba, pring, piocb, NULL);

		spin_unlock_irqrestore(&phba->hbalock, iflags);
	}
	if (!list_empty(&cancel_list))
		lpfc_sli_cancel_iocbs(phba, &cancel_list, IOSTAT_LOCAL_REJECT,
				      IOERR_SLI_ABORTED);
	else
		/* Make sure HBA is alive */
		lpfc_issue_hb_tmo(phba);

+17 −3
Original line number Diff line number Diff line
@@ -7550,6 +7550,8 @@ lpfc_disable_pci_dev(struct lpfc_hba *phba)
void
lpfc_reset_hba(struct lpfc_hba *phba)
{
	int rc = 0;

	/* If resets are disabled then set error state and return. */
	if (!phba->cfg_enable_hba_reset) {
		phba->link_state = LPFC_HBA_ERROR;
@@ -7560,14 +7562,26 @@ lpfc_reset_hba(struct lpfc_hba *phba)
	if (phba->sli.sli_flag & LPFC_SLI_ACTIVE) {
		lpfc_offline_prep(phba, LPFC_MBX_WAIT);
	} else {
		if (test_bit(MBX_TMO_ERR, &phba->bit_flags)) {
			/* Perform a PCI function reset to start from clean */
			rc = lpfc_pci_function_reset(phba);
			lpfc_els_flush_all_cmd(phba);
		}
		lpfc_offline_prep(phba, LPFC_MBX_NO_WAIT);
		lpfc_sli_flush_io_rings(phba);
	}
	lpfc_offline(phba);
	clear_bit(MBX_TMO_ERR, &phba->bit_flags);
	if (unlikely(rc)) {
		lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
				"8888 PCI function reset failed rc %x\n",
				rc);
	} else {
		lpfc_sli_brdrestart(phba);
		lpfc_online(phba);
		lpfc_unblock_mgmt_io(phba);
	}
}

/**
 * lpfc_sli_sriov_nr_virtfn_get - Get the number of sr-iov virtual functions
+7 −1
Original line number Diff line number Diff line
@@ -3935,6 +3935,8 @@ void lpfc_poll_eratt(struct timer_list *t)
	uint64_t sli_intr, cnt;
	phba = from_timer(phba, t, eratt_poll);
	if (!(phba->hba_flag & HBA_SETUP))
		return;
	/* Here we will also keep track of interrupts per sec of the hba */
	sli_intr = phba->sli.slistat.sli_intr;
@@ -7693,7 +7695,9 @@ lpfc_sli4_repost_sgl_list(struct lpfc_hba *phba,
		spin_unlock_irq(&phba->hbalock);
	} else {
		lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
				"3161 Failure to post sgl to port.\n");
				"3161 Failure to post sgl to port,status %x "
				"blkcnt %d totalcnt %d postcnt %d\n",
				status, block_cnt, total_cnt, post_cnt);
		return -EIO;
	}
@@ -8478,6 +8482,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba)
			spin_unlock_irq(&phba->hbalock);
		}
	}
	phba->hba_flag &= ~HBA_SETUP;
	lpfc_sli4_dip(phba);
@@ -9282,6 +9287,7 @@ lpfc_mbox_timeout_handler(struct lpfc_hba *phba)
	 * would get IOCB_ERROR from lpfc_sli_issue_iocb, allowing
	 * it to fail all outstanding SCSI IO.
	 */
	set_bit(MBX_TMO_ERR, &phba->bit_flags);
	spin_lock_irq(&phba->pport->work_port_lock);
	phba->pport->work_port_events &= ~WORKER_MBOX_TMO;
	spin_unlock_irq(&phba->pport->work_port_lock);