Commit 081f359e authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'hyperv-fixes-signed-20221125' of...

Merge tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv fixes from Wei Liu:

 - Fix IRTE allocation in Hyper-V PCI controller (Dexuan Cui)

 - Fix handling of SCSI srb_status and capacity change events (Michael
   Kelley)

 - Restore VP assist page after CPU offlining and onlining (Vitaly
   Kuznetsov)

 - Fix some memory leak issues in VMBus (Yang Yingliang)

* tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux:
  Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register()
  Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work()
  PCI: hv: Only reuse existing IRTE allocation for Multi-MSI
  scsi: storvsc: Fix handling of srb_status and capacity change events
  x86/hyperv: Restore VP assist page after cpu offlining/onlining
parents 0b1dcc2c 25c94b05
Loading
Loading
Loading
Loading
+26 −28
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
	union hv_vp_assist_msr_contents msr = { 0 };
	struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
	struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
	int ret;

	ret = hv_common_cpu_init(cpu);
@@ -87,15 +87,13 @@ static int hv_cpu_init(unsigned int cpu)
	if (!hv_vp_assist_page)
		return 0;

	if (!*hvp) {
	if (hv_root_partition) {
		/*
		 * For root partition we get the hypervisor provided VP assist
		 * page, instead of allocating a new page.
		 */
		rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
			*hvp = memremap(msr.pfn <<
					HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
		*hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
				PAGE_SIZE, MEMREMAP_WB);
	} else {
		/*
@@ -106,16 +104,16 @@ static int hv_cpu_init(unsigned int cpu)
		 * in hv_cpu_die(), otherwise a CPU may not be stopped in the
		 * case of CPU offlining and the VM will hang.
		 */
		if (!*hvp)
			*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
		if (*hvp)
			msr.pfn = vmalloc_to_pfn(*hvp);

	}
		WARN_ON(!(*hvp));
		if (*hvp) {
	if (!WARN_ON(!(*hvp))) {
		msr.enable = 1;
		wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
	}
	}

	return hyperv_init_ghcb();
}
+5 −1
Original line number Diff line number Diff line
@@ -533,13 +533,17 @@ static void vmbus_add_channel_work(struct work_struct *work)
	 * Add the new device to the bus. This will kick off device-driver
	 * binding which eventually invokes the device driver's AddDevice()
	 * method.
	 *
	 * If vmbus_device_register() fails, the 'device_obj' is freed in
	 * vmbus_device_release() as called by device_unregister() in the
	 * error path of vmbus_device_register(). In the outside error
	 * path, there's no need to free it.
	 */
	ret = vmbus_device_register(newchannel->device_obj);

	if (ret != 0) {
		pr_err("unable to add child device object (relid %d)\n",
			newchannel->offermsg.child_relid);
		kfree(newchannel->device_obj);
		goto err_deq_chan;
	}

+1 −0
Original line number Diff line number Diff line
@@ -2082,6 +2082,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
	ret = device_register(&child_device_obj->device);
	if (ret) {
		pr_err("Unable to register child device\n");
		put_device(&child_device_obj->device);
		return ret;
	}

+75 −15
Original line number Diff line number Diff line
@@ -1613,7 +1613,7 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp,
}

static u32 hv_compose_msi_req_v1(
	struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
	struct pci_create_interrupt *int_pkt,
	u32 slot, u8 vector, u16 vector_count)
{
	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
@@ -1631,6 +1631,35 @@ static u32 hv_compose_msi_req_v1(
	return sizeof(*int_pkt);
}

/*
 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
 * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
 * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
 * not irrelevant because Hyper-V chooses the physical CPU to handle the
 * interrupts based on the vCPU specified in message sent to the vPCI VSP in
 * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
 * but assigning too many vPCI device interrupts to the same pCPU can cause a
 * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
 * to spread out the pCPUs that it selects.
 *
 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
 * to always return the same dummy vCPU, because a second call to
 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
 * new pCPU for the interrupt. But for the multi-MSI case, the second call to
 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
 * the pCPUs are spread out. All interrupts for a multi-MSI device end up using
 * the same pCPU, even though the vCPUs will be spread out by later calls
 * to hv_irq_unmask(), but that is the best we can do now.
 *
 * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
 * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
 * enhancement is planned for a future version. With that enhancement, the
 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
 * device will be spread across multiple pCPUs.
 */

/*
 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
 * by subsequent retarget in hv_irq_unmask().
@@ -1640,18 +1669,39 @@ static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
	return cpumask_first_and(affinity, cpu_online_mask);
}

static u32 hv_compose_msi_req_v2(
	struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
	u32 slot, u8 vector, u16 vector_count)
/*
 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
 */
static int hv_compose_multi_msi_req_get_cpu(void)
{
	static DEFINE_SPINLOCK(multi_msi_cpu_lock);

	/* -1 means starting with CPU 0 */
	static int cpu_next = -1;

	unsigned long flags;
	int cpu;

	spin_lock_irqsave(&multi_msi_cpu_lock, flags);

	cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
				     false);
	cpu = cpu_next;

	spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);

	return cpu;
}

static u32 hv_compose_msi_req_v2(
	struct pci_create_interrupt2 *int_pkt, int cpu,
	u32 slot, u8 vector, u16 vector_count)
{
	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
	int_pkt->wslot.slot = slot;
	int_pkt->int_desc.vector = vector;
	int_pkt->int_desc.vector_count = vector_count;
	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
	cpu = hv_compose_msi_req_get_cpu(affinity);
	int_pkt->int_desc.processor_array[0] =
		hv_cpu_number_to_vp_number(cpu);
	int_pkt->int_desc.processor_count = 1;
@@ -1660,18 +1710,15 @@ static u32 hv_compose_msi_req_v2(
}

static u32 hv_compose_msi_req_v3(
	struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
	struct pci_create_interrupt3 *int_pkt, int cpu,
	u32 slot, u32 vector, u16 vector_count)
{
	int cpu;

	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
	int_pkt->wslot.slot = slot;
	int_pkt->int_desc.vector = vector;
	int_pkt->int_desc.reserved = 0;
	int_pkt->int_desc.vector_count = vector_count;
	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
	cpu = hv_compose_msi_req_get_cpu(affinity);
	int_pkt->int_desc.processor_array[0] =
		hv_cpu_number_to_vp_number(cpu);
	int_pkt->int_desc.processor_count = 1;
@@ -1715,12 +1762,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
			struct pci_create_interrupt3 v3;
		} int_pkts;
	} __packed ctxt;
	bool multi_msi;
	u64 trans_id;
	u32 size;
	int ret;
	int cpu;

	msi_desc  = irq_data_get_msi_desc(data);
	multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
		    msi_desc->nvec_used > 1;

	/* Reuse the previous allocation */
	if (data->chip_data) {
	if (data->chip_data && multi_msi) {
		int_desc = data->chip_data;
		msg->address_hi = int_desc->address >> 32;
		msg->address_lo = int_desc->address & 0xffffffff;
@@ -1728,7 +1781,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
		return;
	}

	msi_desc  = irq_data_get_msi_desc(data);
	pdev = msi_desc_to_pci_dev(msi_desc);
	dest = irq_data_get_effective_affinity_mask(data);
	pbus = pdev->bus;
@@ -1738,11 +1790,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
	if (!hpdev)
		goto return_null_message;

	/* Free any previous message that might have already been composed. */
	if (data->chip_data && !multi_msi) {
		int_desc = data->chip_data;
		data->chip_data = NULL;
		hv_int_desc_free(hpdev, int_desc);
	}

	int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
	if (!int_desc)
		goto drop_reference;

	if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
	if (multi_msi) {
		/*
		 * If this is not the first MSI of Multi MSI, we already have
		 * a mapping.  Can exit early.
@@ -1767,9 +1826,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
		 */
		vector = 32;
		vector_count = msi_desc->nvec_used;
		cpu = hv_compose_multi_msi_req_get_cpu();
	} else {
		vector = hv_msi_get_int_vector(data);
		vector_count = 1;
		cpu = hv_compose_msi_req_get_cpu(dest);
	}

	/*
@@ -1785,7 +1846,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
	switch (hbus->protocol_version) {
	case PCI_PROTOCOL_VERSION_1_1:
		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
					dest,
					hpdev->desc.win_slot.slot,
					(u8)vector,
					vector_count);
@@ -1794,7 +1854,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
	case PCI_PROTOCOL_VERSION_1_2:
	case PCI_PROTOCOL_VERSION_1_3:
		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
					dest,
					cpu,
					hpdev->desc.win_slot.slot,
					(u8)vector,
					vector_count);
@@ -1802,7 +1862,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)

	case PCI_PROTOCOL_VERSION_1_4:
		size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
					dest,
					cpu,
					hpdev->desc.win_slot.slot,
					vector,
					vector_count);
+34 −35
Original line number Diff line number Diff line
@@ -303,16 +303,21 @@ enum storvsc_request_type {
};

/*
 * SRB status codes and masks; a subset of the codes used here.
 * SRB status codes and masks. In the 8-bit field, the two high order bits
 * are flags, while the remaining 6 bits are an integer status code.  The
 * definitions here include only the subset of the integer status codes that
 * are tested for in this driver.
 */

#define SRB_STATUS_AUTOSENSE_VALID	0x80
#define SRB_STATUS_QUEUE_FROZEN		0x40
#define SRB_STATUS_INVALID_LUN	0x20

/* SRB status integer codes */
#define SRB_STATUS_SUCCESS		0x01
#define SRB_STATUS_ABORTED		0x02
#define SRB_STATUS_ERROR		0x04
#define SRB_STATUS_INVALID_REQUEST	0x06
#define SRB_STATUS_DATA_OVERRUN		0x12
#define SRB_STATUS_INVALID_LUN		0x20

#define SRB_STATUS(status) \
	(status & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
@@ -969,38 +974,25 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
	void (*process_err_fn)(struct work_struct *work);
	struct hv_host_device *host_dev = shost_priv(host);

	/*
	 * In some situations, Hyper-V sets multiple bits in the
	 * srb_status, such as ABORTED and ERROR. So process them
	 * individually, with the most specific bits first.
	 */

	if (vm_srb->srb_status & SRB_STATUS_INVALID_LUN) {
		set_host_byte(scmnd, DID_NO_CONNECT);
		process_err_fn = storvsc_remove_lun;
		goto do_work;
	}

	if (vm_srb->srb_status & SRB_STATUS_ABORTED) {
		if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID &&
		    /* Capacity data has changed */
		    (asc == 0x2a) && (ascq == 0x9)) {
	switch (SRB_STATUS(vm_srb->srb_status)) {
	case SRB_STATUS_ERROR:
	case SRB_STATUS_ABORTED:
	case SRB_STATUS_INVALID_REQUEST:
		if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) {
			/* Check for capacity change */
			if ((asc == 0x2a) && (ascq == 0x9)) {
				process_err_fn = storvsc_device_scan;
			/*
			 * Retry the I/O that triggered this.
			 */
				/* Retry the I/O that triggered this. */
				set_host_byte(scmnd, DID_REQUEUE);
				goto do_work;
			}
	}

	if (vm_srb->srb_status & SRB_STATUS_ERROR) {
			/*
		 * Let upper layer deal with error when
		 * sense message is present.
			 * Otherwise, let upper layer deal with the
			 * error when sense message is present
			 */
		if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)
			return;
		}

		/*
		 * If there is an error; offline the device since all
@@ -1023,6 +1015,13 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
		default:
			set_host_byte(scmnd, DID_ERROR);
		}
		return;

	case SRB_STATUS_INVALID_LUN:
		set_host_byte(scmnd, DID_NO_CONNECT);
		process_err_fn = storvsc_remove_lun;
		goto do_work;

	}
	return;