Commit 6bab6f66 authored by Xiongfeng Wang's avatar Xiongfeng Wang
Browse files

pciehp: fix a race between pciehp and removing operations by sysfs

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8YPFU


CVE: NA

-------------------------------------------------

When I run a stress test about pcie hotplug and removing operations by
sysfs, I got a hange task, and the following call trace is printed.

[  242.683775] INFO: task irq/52-pciehp:128 blocked for more than 120 seconds.
[  242.684615]       Not tainted 6.6.0+ #168
[  242.685065] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  242.685932] task:irq/52-pciehp   state:D stack:0     pid:128   ppid:2      flags:0x00000008
[  242.686858] Call trace:
[  242.687178]  __switch_to+0xa8/0xc8
[  242.687566]  __schedule+0x1ec/0x4a0
[  242.688019]  schedule+0x2c/0x80
[  242.688382]  schedule_preempt_disabled+0x18/0x30
[  242.688905]  __mutex_lock.isra.0+0x140/0x2e0
[  242.689385]  __mutex_lock_slowpath+0x1c/0x30
[  242.689867]  mutex_lock+0x6c/0x88
[  242.690239]  pci_lock_rescan_remove+0x24/0x38
[  242.690346] EXT4-fs error (device vda): ext4_lookup:1857: inode #37226: comm systemd-journal: deleted inode referenced: 41314
[  242.690730]  pciehp_configure_device+0x34/0x168
[  242.692589]  board_added+0xf8/0x160
[  242.692979]  __pciehp_enable_slot+0x44/0xf8
[  242.693446]  pciehp_enable_slot+0x40/0xd8
[  242.693892]  pciehp_handle_presence_or_link_change+0xfc/0x208
[  242.694530]  pciehp_ist+0x1c4/0x1d0
[  242.694920]  irq_thread_fn+0x34/0xb8
[  242.695332]  irq_thread+0xd8/0x198
[  242.695725]  kthread+0xe4/0xf0
[  242.696072]  ret_from_fork+0x10/0x20
[  242.696483] INFO: task bash:432 blocked for more than 120 seconds.
[  242.697176]       Not tainted 6.6.0+ #168
[  242.697625] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  242.698495] task:bash            state:D stack:0     pid:432   ppid:400    flags:0x00000008
[  242.699438] Call trace:
[  242.699726]  __switch_to+0xa8/0xc8
[  242.700116]  __schedule+0x1ec/0x4a0
[  242.700511]  schedule+0x2c/0x80
[  242.700867]  __synchronize_irq+0x80/0xc0
[  242.701308]  __free_irq+0xf0/0x308
[  242.701691]  free_irq+0x3c/0x90
[  242.702046]  pcie_shutdown_notification+0x48/0x90
[  242.702574]  pciehp_remove+0x30/0x60
[  242.702977]  pcie_port_remove_service+0x40/0x70
[  242.703495]  device_remove+0x78/0x90
[  242.703934]  __device_release_driver+0x150/0x1c8
[  242.704453]  device_release_driver+0x34/0x58
[  242.704929]  bus_remove_device+0xd8/0x158
[  242.705374]  device_del+0x168/0x2f8
[  242.705766]  device_unregister+0x28/0x88
[  242.706205]  remove_iter+0x34/0x50
[  242.706585]  device_for_each_child+0x64/0xb8
[  242.707061]  pcie_portdrv_remove+0x40/0xb0
[  242.707525]  pci_device_remove+0x44/0xa8
[  242.708006]  device_remove+0x54/0x90
[  242.708407]  __device_release_driver+0x150/0x1c8
[  242.708921]  device_release_driver+0x34/0x58
[  242.709396]  pci_stop_dev+0x44/0xa8
[  242.709785]  pci_stop_bus_device+0x64/0x80
[  242.710240]  pci_stop_and_remove_bus_device_locked+0x60/0xd8
[  242.710866]  remove_store+0x98/0xb0
[  242.711264]  dev_attr_store+0x20/0x40
[  242.711672]  sysfs_kf_write+0x4c/0x68
[  242.712142]  kernfs_fop_write_iter+0x130/0x1c8
[  242.712641]  new_sync_write+0xa4/0x128
[  242.713060]  vfs_write.part.0+0x128/0x168
[  242.713503]  vfs_write+0x9c/0xf0
[  242.713864]  ksys_write+0x74/0x108
[  242.714245]  __arm64_sys_write+0x24/0x38
[  242.714682]  invoke_syscall.constprop.0+0x54/0xe8
[  242.715214]  el0_svc_common.constprop.0+0x44/0xc8
[  242.715752]  do_el0_svc+0x1c/0x40
[  242.716125]  el0_svc+0x4c/0xd8
[  242.716470]  el0t_64_sync_handler+0xc0/0xc8
[  242.716936]  el0t_64_sync+0x1a4/0x1a8

When we remove a slot by sysfs.
'pci_stop_and_remove_bus_device_locked()' will be called. This function
will get the global mutex lock 'pci_rescan_remove_lock', and remove the
slot. If the irq thread 'pciehp_ist' is still running, we will wait
until it exits.

If a pciehp interrupt happens immediately after we remove the slot by
sysfs, but before we free the pciehp irq in
'pci_stop_and_remove_bus_device_locked()'. 'pciehp_ist' will hung
because the global mutex lock 'pci_rescan_remove_lock' is held by the
sysfs operation. But the sysfs operation is waiting for the pciehp irq
thread 'pciehp_ist' ends. Then a hung task occurs.

So this two kinds of operation, removing through attention buttion and
removing through /sys/devices/pci***/remove, should not be excuted at
the same time. This patch add a global variable to mark that one of these
operations is under processing. When this variable is set,  if another
operation is requested, it will be rejected.

We use a global variable 'slot_being_removed_rescaned' to mark whether a
slot is being removed or rescaned. This will cause a slot hotplug
operation is delayed if another slot is being remove or rescaned. But
if these two slots are under different root ports, they should not
influence each other. This patch make the flag
'slot_being_removed_rescanned' per root port so that one slot hotplug
operation doesn't influence slots below another root port.

We record the root port in struct pci_dev when the pci device is
initialized and added into the system instead of using
'pcie_find_root_port()' to find the root port when we need it. Because
iterating the pci tree needs the protection of
'pci_lock_rescan_remove()'. This will make the problem more complexed
because the lock is very coarse-grained. We don't need to worry about
'use-after-free' because child pci devices are always removed before the
root port device is removed.

Signed-off-by: default avatarXiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: default avatarHanjun Guo <guohanjun@huawei.com>

 Conflicts:
	drivers/pci/hotplug/pciehp_ctrl.c
	drivers/pci/hotplug/pciehp_hpc.c
	include/linux/pci.h
Signed-off-by: default avatarXiongfeng Wang <wangxiongfeng2@huawei.com>
parent 34a4a473
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -199,6 +199,11 @@ static inline const char *slot_name(struct controller *ctrl)
	return hotplug_slot_name(&ctrl->hotplug_slot);
}

static inline struct pci_dev *ctrl_dev(struct controller *ctrl)
{
	return ctrl->pcie->port;
}

static inline struct controller *to_ctrl(struct hotplug_slot *hotplug_slot)
{
	return container_of(hotplug_slot, struct controller, hotplug_slot);
+40 −0
Original line number Diff line number Diff line
@@ -143,6 +143,8 @@ void pciehp_queue_pushbutton_work(struct work_struct *work)
{
	struct controller *ctrl = container_of(work, struct controller,
					       button_work.work);
	int events = ctrl->button_work.data;
	struct pci_dev *rpdev = ctrl_dev(ctrl)->rpdev;

	mutex_lock(&ctrl->state_lock);
	switch (ctrl->state) {
@@ -153,6 +155,15 @@ void pciehp_queue_pushbutton_work(struct work_struct *work)
		pciehp_request(ctrl, PCI_EXP_SLTSTA_PDC);
		break;
	default:
		if (events) {
			atomic_or(events, &ctrl->pending_events);
			if (!pciehp_poll_mode)
				irq_wake_thread(ctrl->pcie->irq, ctrl);
		} else {
			if (rpdev)
				clear_bit(0,
					  &rpdev->slot_being_removed_rescanned);
		}
		break;
	}
	mutex_unlock(&ctrl->state_lock);
@@ -160,6 +171,8 @@ void pciehp_queue_pushbutton_work(struct work_struct *work)

void pciehp_handle_button_press(struct controller *ctrl)
{
	struct pci_dev *rpdev = ctrl_dev(ctrl)->rpdev;

	mutex_lock(&ctrl->state_lock);
	switch (ctrl->state) {
	case OFF_STATE:
@@ -176,6 +189,7 @@ void pciehp_handle_button_press(struct controller *ctrl)
		/* blink power indicator and turn off attention */
		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
				      PCI_EXP_SLTCTL_ATTN_IND_OFF);
		ctrl->button_work.data = 0;
		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
		break;
	case BLINKINGOFF_STATE:
@@ -199,10 +213,14 @@ void pciehp_handle_button_press(struct controller *ctrl)
			ctrl_info(ctrl, "Slot(%s): Button press: canceling request to power on\n",
				  slot_name(ctrl));
		}
		if (rpdev)
			clear_bit(0, &rpdev->slot_being_removed_rescanned);
		break;
	default:
		ctrl_err(ctrl, "Slot(%s): Button press: ignoring invalid state %#x\n",
			 slot_name(ctrl), ctrl->state);
		if (rpdev)
			clear_bit(0, &rpdev->slot_being_removed_rescanned);
		break;
	}
	mutex_unlock(&ctrl->state_lock);
@@ -210,6 +228,8 @@ void pciehp_handle_button_press(struct controller *ctrl)

void pciehp_handle_disable_request(struct controller *ctrl)
{
	struct pci_dev *rpdev = ctrl_dev(ctrl)->rpdev;

	mutex_lock(&ctrl->state_lock);
	switch (ctrl->state) {
	case BLINKINGON_STATE:
@@ -221,11 +241,14 @@ void pciehp_handle_disable_request(struct controller *ctrl)
	mutex_unlock(&ctrl->state_lock);

	ctrl->request_result = pciehp_disable_slot(ctrl, SAFE_REMOVAL);
	if (rpdev)
		clear_bit(0, &rpdev->slot_being_removed_rescanned);
}

void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
{
	int present, link_active;
	struct pci_dev *rpdev = ctrl_dev(ctrl)->rpdev;

	/*
	 * If the slot is on and presence or link has changed, turn it off.
@@ -266,6 +289,8 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
				  slot_name(ctrl));
		}
		mutex_unlock(&ctrl->state_lock);
		if (rpdev)
			clear_bit(0, &rpdev->slot_being_removed_rescanned);
		return;
	}

@@ -288,6 +313,8 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
		mutex_unlock(&ctrl->state_lock);
		break;
	}
	if (rpdev)
		clear_bit(0, &rpdev->slot_being_removed_rescanned);
}

static int __pciehp_enable_slot(struct controller *ctrl)
@@ -408,6 +435,14 @@ int pciehp_sysfs_enable_slot(struct hotplug_slot *hotplug_slot)
int pciehp_sysfs_disable_slot(struct hotplug_slot *hotplug_slot)
{
	struct controller *ctrl = to_ctrl(hotplug_slot);
	struct pci_dev *rpdev = ctrl_dev(ctrl)->rpdev;

	if (rpdev && test_and_set_bit(0,
				&rpdev->slot_being_removed_rescanned)) {
		ctrl_info(ctrl, "Slot(%s): Slot is being removed or rescanned, please try later!\n",
			  slot_name(ctrl));
		return -EINVAL;
	}

	mutex_lock(&ctrl->state_lock);
	switch (ctrl->state) {
@@ -418,6 +453,8 @@ int pciehp_sysfs_disable_slot(struct hotplug_slot *hotplug_slot)
		wait_event(ctrl->requester,
			   !atomic_read(&ctrl->pending_events) &&
			   !ctrl->ist_running);
		if (rpdev)
			clear_bit(0, &rpdev->slot_being_removed_rescanned);
		return ctrl->request_result;
	case POWEROFF_STATE:
		ctrl_info(ctrl, "Slot(%s): Already in powering off state\n",
@@ -436,5 +473,8 @@ int pciehp_sysfs_disable_slot(struct hotplug_slot *hotplug_slot)
	}
	mutex_unlock(&ctrl->state_lock);

	if (rpdev)
		clear_bit(0, &rpdev->slot_being_removed_rescanned);

	return -ENODEV;
}
+68 −11
Original line number Diff line number Diff line
@@ -45,11 +45,6 @@ static const struct dmi_system_id inband_presence_disabled_dmi_table[] = {
	{}
};

static inline struct pci_dev *ctrl_dev(struct controller *ctrl)
{
	return ctrl->pcie->port;
}

static irqreturn_t pciehp_isr(int irq, void *dev_id);
static irqreturn_t pciehp_ist(int irq, void *dev_id);
static int pciehp_poll(void *data);
@@ -694,6 +689,7 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id)
{
	struct controller *ctrl = (struct controller *)dev_id;
	struct pci_dev *pdev = ctrl_dev(ctrl);
	struct pci_dev *rpdev = pdev->rpdev;
	irqreturn_t ret;
	u32 events;

@@ -716,8 +712,20 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id)
	}

	/* Check Attention Button Pressed */
	if (events & PCI_EXP_SLTSTA_ABP)
	if (events & PCI_EXP_SLTSTA_ABP) {
		if (!rpdev || (rpdev && !test_and_set_bit(0,
					&rpdev->slot_being_removed_rescanned)))
			pciehp_handle_button_press(ctrl);
		else {
			if (ctrl->state == BLINKINGOFF_STATE ||
					ctrl->state == BLINKINGON_STATE)
				pciehp_handle_button_press(ctrl);
			else
				ctrl_info(ctrl, "Slot(%s): Slot operation failed because a remove or"
					  " rescan operation is under processing, please try later!\n",
					  slot_name(ctrl));
		}
	}

	/* Check Power Fault Detected */
	if (events & PCI_EXP_SLTSTA_PFD) {
@@ -741,10 +749,59 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id)
	 * or Data Link Layer State Changed events.
	 */
	down_read_nested(&ctrl->reset_lock, ctrl->depth);
	if (events & DISABLE_SLOT)
	if (events & DISABLE_SLOT) {
		if (!rpdev || (rpdev && !test_and_set_bit(0,
					&rpdev->slot_being_removed_rescanned)))
			pciehp_handle_disable_request(ctrl);
		else {
			if (ctrl->state == BLINKINGOFF_STATE ||
					ctrl->state == BLINKINGON_STATE)
				pciehp_handle_disable_request(ctrl);
	else if (events & (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC))
			else {
				ctrl_info(ctrl, "Slot(%s): DISABLE_SLOT event in remove or rescan process!\n",
						slot_name(ctrl));
				/*
				 * we use the work_struct private data to store
				 * the event type
				 */
				ctrl->button_work.data = DISABLE_SLOT;
				/*
				 * If 'work.timer' is pending, schedule the work will
				 * cause BUG_ON().
				 */
				if (!timer_pending(&ctrl->button_work.timer))
					schedule_delayed_work(&ctrl->button_work, 3 * HZ);
				else
					ctrl_info(ctrl, "Slot(%s): Didn't schedule delayed_work because timer is pending!\n",
							slot_name(ctrl));
			}
		}
	} else if (events & (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC)) {
		if (!rpdev || (rpdev && !test_and_set_bit(0,
					&rpdev->slot_being_removed_rescanned)))
			pciehp_handle_presence_or_link_change(ctrl, events);
		else {
			if (ctrl->state == BLINKINGOFF_STATE ||
					ctrl->state == BLINKINGON_STATE)
				pciehp_handle_presence_or_link_change(ctrl,
						events);
			else {
				/*
				 * When we are removing or rescanning through
				 * sysfs, suprise link down/up happens. So we
				 * will handle this event 3 seconds later.
				 */
				ctrl_info(ctrl, "Slot(%s): Surprise link down/up in remove or rescan process!\n",
						slot_name(ctrl));
				ctrl->button_work.data = events & (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
				if (!timer_pending(&ctrl->button_work.timer))
					schedule_delayed_work(&ctrl->button_work, 3 * HZ);
				else
					ctrl_info(ctrl, "Slot(%s): Didn't schedule delayed_work because timer is pending!\n",
							slot_name(ctrl));
			}
		}
	}
	up_read(&ctrl->reset_lock);

	ret = IRQ_HANDLED;
+22 −0
Original line number Diff line number Diff line
@@ -484,16 +484,38 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
{
	unsigned long val;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *rpdev = pdev->rpdev;

	if (kstrtoul(buf, 0, &val) < 0)
		return -EINVAL;

	if (rpdev && test_and_set_bit(0,
				&rpdev->slot_being_removed_rescanned)) {
		pr_info("Slot is being removed or rescanned, please try later!\n");
		return -EINVAL;
	}

	/*
	 * if 'dev' is root port itself, 'pci_stop_and_remove_bus_device()' may
	 * free the 'rpdev', but we need to clear
	 * 'rpdev->slot_being_removed_rescanned' in the end. So get 'rpdev' to
	 * avoid possible 'use-after-free'.
	 */
	if (rpdev)
		pci_dev_get(rpdev);

	if (val) {
		pci_dev_get(pdev);
		if (device_remove_file_self(dev, attr))
			pci_stop_and_remove_bus_device_locked(pdev);
		pci_dev_put(pdev);
	}

	if (rpdev) {
		clear_bit(0, &rpdev->slot_being_removed_rescanned);
		pci_dev_put(rpdev);
	}

	return count;
}
static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL,
+5 −0
Original line number Diff line number Diff line
@@ -2584,6 +2584,11 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
	/* Set up MSI IRQ domain */
	pci_set_msi_domain(dev);

	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
		dev->rpdev = dev;
	else
		dev->rpdev = pcie_find_root_port(dev);

	/* Notifier could use PCI capabilities */
	dev->match_driver = false;
	ret = device_add(&dev->dev);
Loading