Commit 9e855b1f authored by Paolo Abeni
Browse files

Merge branch 'fix-rtnl_mutex-deadlock-with-dpaa2-and-sfp-modules'

Vladimir Oltean says:

====================
Fix rtnl_mutex deadlock with DPAA2 and SFP modules

This patch set deliberately targets net-next and lacks Fixes: tags due
to caution on my part.

While testing some SFP modules on the Solidrun Honeycomb LX2K platform,
I noticed that rebooting causes a deadlock:

============================================
WARNING: possible recursive locking detected
6.1.0-rc5-07010-ga9b9500ffaac-dirty #656 Not tainted
--------------------------------------------
systemd-shutdow/1 is trying to acquire lock:
ffffa62db6cf42f0 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_lock+0x1c/0x30

but task is already holding lock:
ffffa62db6cf42f0 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_lock+0x1c/0x30

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(rtnl_mutex);
  lock(rtnl_mutex);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

6 locks held by systemd-shutdow/1:
 #0: ffffa62db6863c70 (system_transition_mutex){+.+.}-{4:4}, at: __do_sys_reboot+0xd4/0x260
 #1: ffff2f2b0176f100 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xf4/0x260
 #2: ffff2f2b017be900 (&dev->mutex){....}-{4:4}, at: device_shutdown+0x104/0x260
 #3: ffff2f2b017680f0 (&dev->mutex){....}-{4:4}, at: device_release_driver_internal+0x40/0x260
 #4: ffff2f2b0e1608f0 (&dev->mutex){....}-{4:4}, at: device_release_driver_internal+0x40/0x260
 #5: ffffa62db6cf42f0 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_lock+0x1c/0x30

stack backtrace:
CPU: 6 PID: 1 Comm: systemd-shutdow Not tainted 6.1.0-rc5-07010-ga9b9500ffaac-dirty #656
Hardware name: SolidRun LX2160A Honeycomb (DT)
Call trace:
 lock_acquire+0x68/0x84
 __mutex_lock+0x98/0x460
 mutex_lock_nested+0x2c/0x40
 rtnl_lock+0x1c/0x30
 sfp_bus_del_upstream+0x1c/0xac
 phylink_destroy+0x1c/0x50
 dpaa2_mac_disconnect+0x28/0x70
 dpaa2_eth_remove+0x1dc/0x1f0
 fsl_mc_driver_remove+0x24/0x60
 device_remove+0x70/0x80
 device_release_driver_internal+0x1f0/0x260
 device_links_unbind_consumers+0xe0/0x110
 device_release_driver_internal+0x138/0x260
 device_release_driver+0x18/0x24
 bus_remove_device+0x12c/0x13c
 device_del+0x16c/0x424
 fsl_mc_device_remove+0x28/0x40
 __fsl_mc_device_remove+0x10/0x20
 device_for_each_child+0x5c/0xac
 dprc_remove+0x94/0xb4
 fsl_mc_driver_remove+0x24/0x60
 device_remove+0x70/0x80
 device_release_driver_internal+0x1f0/0x260
 device_release_driver+0x18/0x24
 bus_remove_device+0x12c/0x13c
 device_del+0x16c/0x424
 fsl_mc_bus_remove+0x8c/0x10c
 fsl_mc_bus_shutdown+0x10/0x20
 platform_shutdown+0x24/0x3c
 device_shutdown+0x15c/0x260
 kernel_restart+0x40/0xa4
 __do_sys_reboot+0x1e4/0x260
 __arm64_sys_reboot+0x24/0x30

But fixing this appears to be not so simple. The patch set represents my
attempt to address it.

In short, the problem is that dpaa2_mac_connect() and dpaa2_mac_disconnect()
call 2 phylink functions in a row: one which takes rtnl_lock() itself -
phylink_create() - and one which requires rtnl_lock() to be held by the
caller - phylink_fwnode_phy_connect(). The existing approach in the
drivers is too simple. We take rtnl_lock() when calling dpaa2_mac_connect(),
which is what results in the deadlock.

Fixing just that creates another problem. The drivers make use of
rtnl_lock() for serializing with other code paths too. I think I've
found all those code paths, and established other mechanisms for
serializing with them.
====================

Link: https://lore.kernel.org/r/20221129141221.872653-1-vladimir.oltean@nxp.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 682f560b 87db82cb
Loading
Loading
Loading
Loading
+6 −3
Original line number Diff line number Diff line
@@ -181,10 +181,13 @@ when necessary using the below listed API::
 - int dpaa2_mac_connect(struct dpaa2_mac *mac);
 - void dpaa2_mac_disconnect(struct dpaa2_mac *mac);

A phylink integration is necessary only when the partner DPMAC is not of TYPE_FIXED.
One can check for this condition using the below API::
A phylink integration is necessary only when the partner DPMAC is not of
``TYPE_FIXED``. This means it is either of ``TYPE_PHY``, or of
``TYPE_BACKPLANE`` (the difference between the two being that in the
``TYPE_BACKPLANE`` mode, the MC firmware does not access the PCS registers).
One can check for this condition using the following helper::

 - bool dpaa2_mac_is_type_fixed(struct fsl_mc_device *dpmac_dev,struct fsl_mc_io *mc_io);
 - static inline bool dpaa2_mac_is_type_phy(struct dpaa2_mac *mac);

Before connection to a MAC, the caller must allocate and populate the
dpaa2_mac structure with the associated net_device, a pointer to the MC portal
+58 −29
Original line number Diff line number Diff line
@@ -2147,8 +2147,11 @@ static int dpaa2_eth_link_state_update(struct dpaa2_eth_priv *priv)

	/* When we manage the MAC/PHY using phylink there is no need
	 * to manually update the netif_carrier.
	 * We can avoid locking because we are called from the "link changed"
	 * IRQ handler, which is the same as the "endpoint changed" IRQ handler
	 * (the writer to priv->mac), so we cannot race with it.
	 */
	if (dpaa2_eth_is_type_phy(priv))
	if (dpaa2_mac_is_type_phy(priv->mac))
		goto out;

	/* Check link state; speed / duplex changes are not treated yet */
@@ -2179,6 +2182,8 @@ static int dpaa2_eth_open(struct net_device *net_dev)

	dpaa2_eth_seed_pools(priv);

	mutex_lock(&priv->mac_lock);

	if (!dpaa2_eth_is_type_phy(priv)) {
		/* We'll only start the txqs when the link is actually ready;
		 * make sure we don't race against the link up notification,
@@ -2197,14 +2202,15 @@ static int dpaa2_eth_open(struct net_device *net_dev)

	err = dpni_enable(priv->mc_io, 0, priv->mc_token);
	if (err < 0) {
		mutex_unlock(&priv->mac_lock);
		netdev_err(net_dev, "dpni_enable() failed\n");
		goto enable_err;
	}

	if (dpaa2_eth_is_type_phy(priv)) {
	if (dpaa2_eth_is_type_phy(priv))
		dpaa2_mac_start(priv->mac);
		phylink_start(priv->mac->phylink);
	}

	mutex_unlock(&priv->mac_lock);

	return 0;

@@ -2277,14 +2283,17 @@ static int dpaa2_eth_stop(struct net_device *net_dev)
	int dpni_enabled = 0;
	int retries = 10;

	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv)) {
		phylink_stop(priv->mac->phylink);
		dpaa2_mac_stop(priv->mac);
	} else {
		netif_tx_stop_all_queues(net_dev);
		netif_carrier_off(net_dev);
	}

	mutex_unlock(&priv->mac_lock);

	/* On dpni_disable(), the MC firmware will:
	 * - stop MAC Rx and wait for all Rx frames to be enqueued to software
	 * - cut off WRIOP dequeues from egress FQs and wait until transmission
@@ -2610,12 +2619,20 @@ static int dpaa2_eth_ts_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
static int dpaa2_eth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	struct dpaa2_eth_priv *priv = netdev_priv(dev);
	int err;

	if (cmd == SIOCSHWTSTAMP)
		return dpaa2_eth_ts_ioctl(dev, rq, cmd);

	if (dpaa2_eth_is_type_phy(priv))
		return phylink_mii_ioctl(priv->mac->phylink, rq, cmd);
	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv)) {
		err = phylink_mii_ioctl(priv->mac->phylink, rq, cmd);
		mutex_unlock(&priv->mac_lock);
		return err;
	}

	mutex_unlock(&priv->mac_lock);

	return -EOPNOTSUPP;
}
@@ -3791,7 +3808,7 @@ static int dpaa2_eth_setup_dpni(struct fsl_mc_device *ls_dev)
		dev_err(dev, "DPNI version %u.%u not supported, need >= %u.%u\n",
			priv->dpni_ver_major, priv->dpni_ver_minor,
			DPNI_VER_MAJOR, DPNI_VER_MINOR);
		err = -ENOTSUPP;
		err = -EOPNOTSUPP;
		goto close;
	}

@@ -4627,9 +4644,8 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv *priv)
	err = dpaa2_mac_open(mac);
	if (err)
		goto err_free_mac;
	priv->mac = mac;

	if (dpaa2_eth_is_type_phy(priv)) {
	if (dpaa2_mac_is_type_phy(mac)) {
		err = dpaa2_mac_connect(mac);
		if (err) {
			if (err == -EPROBE_DEFER)
@@ -4643,11 +4659,14 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv *priv)
		}
	}

	mutex_lock(&priv->mac_lock);
	priv->mac = mac;
	mutex_unlock(&priv->mac_lock);

	return 0;

err_close_mac:
	dpaa2_mac_close(mac);
	priv->mac = NULL;
err_free_mac:
	kfree(mac);
	return err;
@@ -4655,15 +4674,21 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv *priv)

static void dpaa2_eth_disconnect_mac(struct dpaa2_eth_priv *priv)
{
	if (dpaa2_eth_is_type_phy(priv))
		dpaa2_mac_disconnect(priv->mac);
	struct dpaa2_mac *mac;

	if (!dpaa2_eth_has_mac(priv))
	mutex_lock(&priv->mac_lock);
	mac = priv->mac;
	priv->mac = NULL;
	mutex_unlock(&priv->mac_lock);

	if (!mac)
		return;

	dpaa2_mac_close(priv->mac);
	kfree(priv->mac);
	priv->mac = NULL;
	if (dpaa2_mac_is_type_phy(mac))
		dpaa2_mac_disconnect(mac);

	dpaa2_mac_close(mac);
	kfree(mac);
}

static irqreturn_t dpni_irq0_handler_thread(int irq_num, void *arg)
@@ -4673,6 +4698,7 @@ static irqreturn_t dpni_irq0_handler_thread(int irq_num, void *arg)
	struct fsl_mc_device *dpni_dev = to_fsl_mc_device(dev);
	struct net_device *net_dev = dev_get_drvdata(dev);
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
	bool had_mac;
	int err;

	err = dpni_get_irq_status(dpni_dev->mc_io, 0, dpni_dev->mc_handle,
@@ -4689,12 +4715,15 @@ static irqreturn_t dpni_irq0_handler_thread(int irq_num, void *arg)
		dpaa2_eth_set_mac_addr(netdev_priv(net_dev));
		dpaa2_eth_update_tx_fqids(priv);

		rtnl_lock();
		if (dpaa2_eth_has_mac(priv))
		/* We can avoid locking because the "endpoint changed" IRQ
		 * handler is the only one who changes priv->mac at runtime,
		 * so we are not racing with anyone.
		 */
		had_mac = !!priv->mac;
		if (had_mac)
			dpaa2_eth_disconnect_mac(priv);
		else
			dpaa2_eth_connect_mac(priv);
		rtnl_unlock();
	}

	return IRQ_HANDLED;
@@ -4792,6 +4821,8 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
	priv->net_dev = net_dev;
	SET_NETDEV_DEVLINK_PORT(net_dev, &priv->devlink_port);

	mutex_init(&priv->mac_lock);

	priv->iommu_domain = iommu_get_domain_for_dev(dev);

	priv->tx_tstamp_type = HWTSTAMP_TX_OFF;
@@ -4899,6 +4930,10 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
	}
#endif

	err = dpaa2_eth_connect_mac(priv);
	if (err)
		goto err_connect_mac;

	err = dpaa2_eth_setup_irqs(dpni_dev);
	if (err) {
		netdev_warn(net_dev, "Failed to set link interrupt, fall back to polling\n");
@@ -4911,10 +4946,6 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
		priv->do_link_poll = true;
	}

	err = dpaa2_eth_connect_mac(priv);
	if (err)
		goto err_connect_mac;

	err = dpaa2_eth_dl_alloc(priv);
	if (err)
		goto err_dl_register;
@@ -4948,13 +4979,13 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
err_dl_trap_register:
	dpaa2_eth_dl_free(priv);
err_dl_register:
	dpaa2_eth_disconnect_mac(priv);
err_connect_mac:
	if (priv->do_link_poll)
		kthread_stop(priv->poll_thread);
	else
		fsl_mc_free_irqs(dpni_dev);
err_poll_thread:
	dpaa2_eth_disconnect_mac(priv);
err_connect_mac:
	dpaa2_eth_free_rings(priv);
err_alloc_rings:
err_csum:
@@ -5002,9 +5033,6 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev)
#endif

	unregister_netdev(net_dev);
	rtnl_lock();
	dpaa2_eth_disconnect_mac(priv);
	rtnl_unlock();

	dpaa2_eth_dl_port_del(priv);
	dpaa2_eth_dl_traps_unregister(priv);
@@ -5015,6 +5043,7 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev)
	else
		fsl_mc_free_irqs(ls_dev);

	dpaa2_eth_disconnect_mac(priv);
	dpaa2_eth_free_rings(priv);
	free_percpu(priv->fd);
	free_percpu(priv->sgt_cache);
+6 −5
Original line number Diff line number Diff line
@@ -615,6 +615,8 @@ struct dpaa2_eth_priv {
#endif

	struct dpaa2_mac *mac;
	/* Serializes changes to priv->mac */
	struct mutex		mac_lock;
	struct workqueue_struct	*dpaa2_ptp_wq;
	struct work_struct	tx_onestep_tstamp;
	struct sk_buff_head	tx_skbs;
@@ -768,16 +770,15 @@ static inline unsigned int dpaa2_eth_rx_head_room(struct dpaa2_eth_priv *priv)

static inline bool dpaa2_eth_is_type_phy(struct dpaa2_eth_priv *priv)
{
	if (priv->mac &&
	    (priv->mac->attr.link_type == DPMAC_LINK_TYPE_PHY ||
	     priv->mac->attr.link_type == DPMAC_LINK_TYPE_BACKPLANE))
		return true;
	lockdep_assert_held(&priv->mac_lock);

	return false;
	return dpaa2_mac_is_type_phy(priv->mac);
}

/* Tell whether a MAC object is currently attached (priv->mac is non-NULL).
 * The caller must hold priv->mac_lock, which lockdep_assert_held() checks.
 */
static inline bool dpaa2_eth_has_mac(struct dpaa2_eth_priv *priv)
{
	lockdep_assert_held(&priv->mac_lock);

	return priv->mac ? true : false;
}

+50 −20
Original line number Diff line number Diff line
@@ -85,11 +85,16 @@ static void dpaa2_eth_get_drvinfo(struct net_device *net_dev,
static int dpaa2_eth_nway_reset(struct net_device *net_dev)
{
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
	int err = -EOPNOTSUPP;

	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv))
		return phylink_ethtool_nway_reset(priv->mac->phylink);
		err = phylink_ethtool_nway_reset(priv->mac->phylink);

	return -EOPNOTSUPP;
	mutex_unlock(&priv->mac_lock);

	return err;
}

static int
@@ -97,10 +102,18 @@ dpaa2_eth_get_link_ksettings(struct net_device *net_dev,
			     struct ethtool_link_ksettings *link_settings)
{
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
	int err;

	if (dpaa2_eth_is_type_phy(priv))
		return phylink_ethtool_ksettings_get(priv->mac->phylink,
	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv)) {
		err = phylink_ethtool_ksettings_get(priv->mac->phylink,
						    link_settings);
		mutex_unlock(&priv->mac_lock);
		return err;
	}

	mutex_unlock(&priv->mac_lock);

	link_settings->base.autoneg = AUTONEG_DISABLE;
	if (!(priv->link_state.options & DPNI_LINK_OPT_HALF_DUPLEX))
@@ -115,11 +128,17 @@ dpaa2_eth_set_link_ksettings(struct net_device *net_dev,
			     const struct ethtool_link_ksettings *link_settings)
{
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
	int err = -EOPNOTSUPP;

	mutex_lock(&priv->mac_lock);

	if (!dpaa2_eth_is_type_phy(priv))
		return -ENOTSUPP;
	if (dpaa2_eth_is_type_phy(priv))
		err = phylink_ethtool_ksettings_set(priv->mac->phylink,
						    link_settings);

	return phylink_ethtool_ksettings_set(priv->mac->phylink, link_settings);
	mutex_unlock(&priv->mac_lock);

	return err;
}

static void dpaa2_eth_get_pauseparam(struct net_device *net_dev,
@@ -128,11 +147,16 @@ static void dpaa2_eth_get_pauseparam(struct net_device *net_dev,
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
	u64 link_options = priv->link_state.options;

	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv)) {
		phylink_ethtool_get_pauseparam(priv->mac->phylink, pause);
		mutex_unlock(&priv->mac_lock);
		return;
	}

	mutex_unlock(&priv->mac_lock);

	pause->rx_pause = dpaa2_eth_rx_pause_enabled(link_options);
	pause->tx_pause = dpaa2_eth_tx_pause_enabled(link_options);
	pause->autoneg = AUTONEG_DISABLE;
@@ -151,9 +175,17 @@ static int dpaa2_eth_set_pauseparam(struct net_device *net_dev,
		return -EOPNOTSUPP;
	}

	if (dpaa2_eth_is_type_phy(priv))
		return phylink_ethtool_set_pauseparam(priv->mac->phylink,
	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_is_type_phy(priv)) {
		err = phylink_ethtool_set_pauseparam(priv->mac->phylink,
						     pause);
		mutex_unlock(&priv->mac_lock);
		return err;
	}

	mutex_unlock(&priv->mac_lock);

	if (pause->autoneg)
		return -EOPNOTSUPP;

@@ -185,7 +217,6 @@ static int dpaa2_eth_set_pauseparam(struct net_device *net_dev,
static void dpaa2_eth_get_strings(struct net_device *netdev, u32 stringset,
				  u8 *data)
{
	struct dpaa2_eth_priv *priv = netdev_priv(netdev);
	u8 *p = data;
	int i;

@@ -199,7 +230,6 @@ static void dpaa2_eth_get_strings(struct net_device *netdev, u32 stringset,
			strscpy(p, dpaa2_ethtool_extras[i], ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}
		if (dpaa2_eth_has_mac(priv))
		dpaa2_mac_get_strings(p);
		break;
	}
@@ -207,14 +237,10 @@ static void dpaa2_eth_get_strings(struct net_device *netdev, u32 stringset,

static int dpaa2_eth_get_sset_count(struct net_device *net_dev, int sset)
{
	int num_ss_stats = DPAA2_ETH_NUM_STATS + DPAA2_ETH_NUM_EXTRA_STATS;
	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);

	switch (sset) {
	case ETH_SS_STATS: /* ethtool_get_stats(), ethtool_get_drvinfo() */
		if (dpaa2_eth_has_mac(priv))
			num_ss_stats += dpaa2_mac_get_sset_count();
		return num_ss_stats;
		return DPAA2_ETH_NUM_STATS + DPAA2_ETH_NUM_EXTRA_STATS +
		       dpaa2_mac_get_sset_count();
	default:
		return -EOPNOTSUPP;
	}
@@ -315,8 +341,12 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev,
	}
	*(data + i++) = buf_cnt_total;

	mutex_lock(&priv->mac_lock);

	if (dpaa2_eth_has_mac(priv))
		dpaa2_mac_get_ethtool_stats(priv->mac, data + i);

	mutex_unlock(&priv->mac_lock);
}

static int dpaa2_eth_prep_eth_rule(struct ethhdr *eth_value, struct ethhdr *eth_mask,
+13 −3
Original line number Diff line number Diff line
@@ -338,12 +338,20 @@ static void dpaa2_mac_set_supported_interfaces(struct dpaa2_mac *mac)

/* Bring the MAC up: power on the SerDes PHY if this MAC has one, then start
 * phylink. Must be called with rtnl_lock() held, as checked by ASSERT_RTNL().
 */
void dpaa2_mac_start(struct dpaa2_mac *mac)
{
	ASSERT_RTNL();

	if (mac->serdes_phy)
		phy_power_on(mac->serdes_phy);

	phylink_start(mac->phylink);
}

/* Counterpart of dpaa2_mac_start(): stop phylink first, then power off the
 * SerDes PHY if this MAC has one. Must be called with rtnl_lock() held, as
 * checked by ASSERT_RTNL().
 */
void dpaa2_mac_stop(struct dpaa2_mac *mac)
{
	ASSERT_RTNL();

	phylink_stop(mac->phylink);

	if (mac->serdes_phy)
		phy_power_off(mac->serdes_phy);
}
@@ -422,7 +430,9 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac)
	}
	mac->phylink = phylink;

	rtnl_lock();
	err = phylink_fwnode_phy_connect(mac->phylink, dpmac_node, 0);
	rtnl_unlock();
	if (err) {
		netdev_err(net_dev, "phylink_fwnode_phy_connect() = %d\n", err);
		goto err_phylink_destroy;
@@ -440,10 +450,10 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac)

void dpaa2_mac_disconnect(struct dpaa2_mac *mac)
{
	if (!mac->phylink)
		return;

	rtnl_lock();
	phylink_disconnect_phy(mac->phylink);
	rtnl_unlock();

	phylink_destroy(mac->phylink);
	dpaa2_pcs_destroy(mac);
	of_phy_put(mac->serdes_phy);
Loading