Commit 9c5b80b7 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'hyperv-next-signed-20210216' of...

Merge tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull Hyper-V updates from Wei Liu:

 - VMBus hardening patches from Andrea Parri and Andres Beltran.

 - Patches to make Linux boot as the root partition on Microsoft
   Hypervisor from Wei Liu.

 - One patch to add a new sysfs interface to support hibernation on
   Hyper-V from Dexuan Cui.

 - Two miscellaneous clean-up patches from Colin and Gustavo.

* tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (31 commits)
  Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer"
  iommu/hyperv: setup an IO-APIC IRQ remapping domain for root partition
  x86/hyperv: implement an MSI domain for root partition
  asm-generic/hyperv: import data structures for mapping device interrupts
  asm-generic/hyperv: introduce hv_device_id and auxiliary structures
  asm-generic/hyperv: update hv_interrupt_entry
  asm-generic/hyperv: update hv_msi_entry
  x86/hyperv: implement and use hv_smp_prepare_cpus
  x86/hyperv: provide a bunch of helper functions
  ACPI / NUMA: add a stub function for node_to_pxm()
  x86/hyperv: handling hypercall page setup for root
  x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary
  x86/hyperv: allocate output arg pages if required
  clocksource/hyperv: use MSR-based access if running as root
  Drivers: hv: vmbus: skip VMBus initialization if Linux is root
  x86/hyperv: detect if Linux is the root partition
  asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT
  hv: hyperv.h: Replace one-element array with flexible-array in struct icmsg_negotiate
  hv_netvsc: Restrict configurations on isolated guests
  Drivers: hv: vmbus: Enforce 'VMBus version >= 5.2' on isolated guests
  ...
parents 08179b47 30192702
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
What:		/sys/bus/vmbus/hibernation
Date:		Jan 2021
KernelVersion:	5.12
Contact:	Dexuan Cui <decui@microsoft.com>
Description:	Whether the host supports hibernation for the VM.
Users:		Daemon that sets up swap partition/file for hibernation.

What:		/sys/bus/vmbus/devices/<UUID>/id
Date:		Jul 2009
KernelVersion:	2.6.31
+2 −2
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
obj-y			:= hv_init.o mmu.o nested.o
obj-$(CONFIG_X86_64)	+= hv_apic.o
obj-y			:= hv_init.o mmu.o nested.o irqdomain.o
obj-$(CONFIG_X86_64)	+= hv_apic.o hv_proc.o

ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= hv_spinlock.o
+114 −8
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
@@ -26,8 +27,11 @@
#include <linux/cpuhotplug.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>

int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);

void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
void  __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);

void  __percpu **hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);

u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);

@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
	void **input_arg;
	struct page *pg;

	input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
	/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
	pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
	pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
	if (unlikely(!pg))
		return -ENOMEM;

	input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
	*input_arg = page_address(pg);
	if (hv_root_partition) {
		void **output_arg;

		output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
		*output_arg = page_address(pg + 1);
	}

	hv_get_vp_index(msr_vp_index);

@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
	unsigned int new_cpu;
	unsigned long flags;
	void **input_arg;
	void *input_pg = NULL;
	void *pg;

	local_irq_save(flags);
	input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
	input_pg = *input_arg;
	pg = *input_arg;
	*input_arg = NULL;

	if (hv_root_partition) {
		void **output_arg;

		output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
		*output_arg = NULL;
	}

	local_irq_restore(flags);
	free_page((unsigned long)input_pg);

	free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);

	if (hv_vp_assist_page && hv_vp_assist_page[cpu])
		wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -264,6 +287,9 @@ static int hv_suspend(void)
	union hv_x64_msr_hypercall_contents hypercall_msr;
	int ret;

	if (hv_root_partition)
		return -EPERM;

	/*
	 * Reset the hypercall page as it is going to be invalidated
	 * across hibernation. Setting hv_hypercall_pg to NULL ensures
@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
		old_setup_percpu_clockev();
}

/*
 * Query the hypervisor for the ID of the current partition and store it in
 * hv_current_partition_id. The hypercall is issued with interrupts disabled
 * using this CPU's output argument page. A failed hypercall is fatal.
 */
static void __init hv_get_partition_id(void)
{
	struct hv_get_partition_id *out;
	unsigned long irq_flags;
	u64 hv_status;
	bool ok;

	local_irq_save(irq_flags);

	out = *this_cpu_ptr(hyperv_pcpu_output_arg);
	hv_status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, out);
	ok = (hv_status & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS;
	if (!ok) {
		/* Nothing sensible can be done without a partition ID */
		pr_err("Failed to get partition ID: %lld\n", hv_status);
		BUG();
	}
	hv_current_partition_id = out->partition_id;

	local_irq_restore(irq_flags);
}

/*
 * This function is to be invoked early in the boot sequence after the
 * hypervisor has been detected.
@@ -368,6 +412,12 @@ void __init hyperv_init(void)

	BUG_ON(hyperv_pcpu_input_arg == NULL);

	/* Allocate the per-CPU state for output arg for root */
	if (hv_root_partition) {
		hyperv_pcpu_output_arg = alloc_percpu(void *);
		BUG_ON(hyperv_pcpu_output_arg == NULL);
	}

	/* Allocate percpu VP index */
	hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
				    GFP_KERNEL);
@@ -408,8 +458,35 @@ void __init hyperv_init(void)

	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
	hypercall_msr.enable = 1;

	if (hv_root_partition) {
		struct page *pg;
		void *src, *dst;

		/*
		 * For the root partition, the hypervisor will set up its
		 * hypercall page. The hypervisor guarantees it will not show
		 * up in the root's address space. The root can't change the
		 * location of the hypercall page.
		 *
		 * Order is important here. We must enable the hypercall page
		 * so it is populated with code, then copy the code to an
		 * executable page.
		 */
		wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);

		pg = vmalloc_to_page(hv_hypercall_pg);
		dst = kmap(pg);
		src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
				MEMREMAP_WB);
		BUG_ON(!(src && dst));
		memcpy(dst, src, HV_HYP_PAGE_SIZE);
		memunmap(src);
		kunmap(pg);
	} else {
		hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
		wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
	}

	/*
	 * hyperv_init() is called before LAPIC is initialized: see
@@ -428,6 +505,21 @@ void __init hyperv_init(void)
	register_syscore_ops(&hv_syscore_ops);

	hyperv_init_cpuhp = cpuhp;

	if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
		hv_get_partition_id();

	BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);

#ifdef CONFIG_PCI_MSI
	/*
	 * If we're running as root, we want to create our own PCI MSI domain.
	 * We can't set this in hv_pci_init because that would be too late.
	 */
	if (hv_root_partition)
		x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif

	return;

remove_cpuhp_state:
@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);

/*
 * Report whether hibernation may be used.
 *
 * Hibernation is never supported when Linux runs as the root partition;
 * otherwise it is supported only if ACPI reports the S4 sleep state.
 */
bool hv_is_hibernation_supported(void)
{
	return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);

/*
 * Return the isolation type encoded in ms_hyperv.isolation_config_b, or
 * HV_ISOLATION_TYPE_NONE when the HV_ISOLATION feature bit is not set.
 */
enum hv_isolation_type hv_get_isolation_type(void)
{
	if (ms_hyperv.features_b & HV_ISOLATION)
		return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);

	return HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_get_isolation_type);

bool hv_is_isolation_supported(void)
{
	return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
+219 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
#include <linux/acpi.h>
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
#include <linux/minmax.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/apic.h>

#include <asm/trace/hyperv.h>

/*
 * See struct hv_deposit_memory. The first u64 is partition ID, the rest
 * are GPAs.
 */
#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)

/*
 * Deposit exactly @num_pages pages, allocated on node @node, into partition
 * @partition_id. Must be called with interrupts enabled.
 *
 * Returns 0 on success (or when @num_pages is 0), -E2BIG when the request
 * exceeds HV_DEPOSIT_MAX, -ENOMEM when an allocation fails, or the hypercall
 * status (converted to int) when the deposit hypercall fails. On any failure
 * the pages allocated for the deposit are freed again.
 */
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
{
	struct hv_deposit_memory *input;
	struct page **chunks, *buf;
	unsigned long irq_flags;
	int *chunk_pages;
	int nr_chunks = 0;
	int order, c, p, total;
	u64 first_pfn, status;
	int ret;

	if (!num_pages)
		return 0;
	if (num_pages > HV_DEPOSIT_MAX)
		return -E2BIG;

	/* One page holds the array of chunk head pointers */
	buf = alloc_page(GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	chunks = page_address(buf);

	/* Number of pages contained in each chunk */
	chunk_pages = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
	if (!chunk_pages) {
		free_page((unsigned long)chunks);
		return -ENOMEM;
	}

	/* Do every allocation up front, before interrupts get disabled */
	while (num_pages) {
		/* Largest order that does not exceed what is still needed */
		order = 31 - __builtin_clz(num_pages);

		/* Step down to smaller orders until an allocation succeeds */
		for (;;) {
			chunks[nr_chunks] = alloc_pages_node(node, GFP_KERNEL, order);
			if (chunks[nr_chunks])
				break;
			if (order == 0) {
				ret = -ENOMEM;
				goto err_free_allocations;
			}
			order--;
		}

		split_page(chunks[nr_chunks], order);
		chunk_pages[nr_chunks] = 1 << order;
		num_pages -= chunk_pages[nr_chunks];
		nr_chunks++;
	}

	local_irq_save(irq_flags);

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	input->partition_id = partition_id;

	/* Flatten the chunks into the PFN list; it fits on the input page */
	total = 0;
	for (c = 0; c < nr_chunks; c++) {
		first_pfn = page_to_pfn(chunks[c]);
		for (p = 0; p < chunk_pages[c]; p++)
			input->gpa_page_list[total++] = first_pfn + p;
	}
	status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
				     total, 0, input, NULL);
	local_irq_restore(irq_flags);

	if ((status & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS) {
		ret = 0;
		goto free_buf;
	}

	pr_err("Failed to deposit pages: %lld\n", status);
	ret = status;

err_free_allocations:
	/* The chunks were split, so hand the pages back one at a time */
	for (c = 0; c < nr_chunks; c++) {
		first_pfn = page_to_pfn(chunks[c]);
		for (p = 0; p < chunk_pages[c]; p++)
			__free_page(pfn_to_page(first_pfn + p));
	}

free_buf:
	free_page((unsigned long)chunks);
	kfree(chunk_pages);
	return ret;
}

/*
 * Issue HVCALL_ADD_LOGICAL_PROCESSOR for logical processor @lp_index with
 * APIC ID @apic_id, passing the proximity domain of NUMA node @node.
 *
 * Returns 0 on success, the masked hypercall status when the hypercall
 * fails for a reason other than insufficient memory, or the error returned
 * by hv_call_deposit_pages() when depositing more memory fails.
 */
int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
{
	struct hv_add_logical_processor_in *in;
	struct hv_add_logical_processor_out *out;
	unsigned long irq_flags;
	int pxm = node_to_pxm(node);
	u64 status;
	int ret;

	/*
	 * The hypervisor may answer HV_STATUS_INSUFFICIENT_MEMORY. Each time
	 * it does, deposit one more page and issue the hypercall again.
	 */
	for (;;) {
		local_irq_save(irq_flags);

		in = *this_cpu_ptr(hyperv_pcpu_input_arg);
		/* The output page is passed along but its contents are unused */
		out = *this_cpu_ptr(hyperv_pcpu_output_arg);

		in->lp_index = lp_index;
		in->apic_id = apic_id;
		in->flags = 0;
		in->proximity_domain_info.domain_id = pxm;
		in->proximity_domain_info.flags.reserved = 0;
		in->proximity_domain_info.flags.proximity_info_valid = 1;
		in->proximity_domain_info.flags.proximity_preferred = 1;
		status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, in, out);

		local_irq_restore(irq_flags);

		status &= HV_HYPERCALL_RESULT_MASK;
		if (status == HV_STATUS_SUCCESS)
			return 0;

		if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
			pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
			       lp_index, apic_id, status);
			return status;
		}

		ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
		if (ret)
			return ret;
	}
}

/*
 * Issue HVCALL_CREATE_VP for virtual processor @vp_index in partition
 * @partition_id, passing @flags through to the hypervisor. When @node is
 * not NUMA_NO_NODE its proximity domain is supplied as the preferred one.
 *
 * Returns 0 on success, the masked hypercall status when the hypercall
 * fails for a reason other than insufficient memory, or the error returned
 * by hv_call_deposit_pages() when depositing memory fails.
 */
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
{
	struct hv_create_vp *in;
	unsigned long irq_flags;
	int pxm = node_to_pxm(node);
	u64 status;
	int ret;

	/* Root VPs don't seem to need pages deposited */
	if (partition_id != hv_current_partition_id) {
		/* The value 90 is empirically determined. It may change. */
		ret = hv_call_deposit_pages(node, partition_id, 90);
		if (ret)
			return ret;
	}

	/* Retry for as long as one extra deposited page might help */
	for (;;) {
		local_irq_save(irq_flags);

		in = *this_cpu_ptr(hyperv_pcpu_input_arg);

		in->partition_id = partition_id;
		in->vp_index = vp_index;
		in->flags = flags;
		in->subnode_type = HvSubnodeAny;
		if (node == NUMA_NO_NODE) {
			in->proximity_domain_info.as_uint64 = 0;
		} else {
			in->proximity_domain_info.domain_id = pxm;
			in->proximity_domain_info.flags.reserved = 0;
			in->proximity_domain_info.flags.proximity_info_valid = 1;
			in->proximity_domain_info.flags.proximity_preferred = 1;
		}
		status = hv_do_hypercall(HVCALL_CREATE_VP, in, NULL);

		local_irq_restore(irq_flags);

		status &= HV_HYPERCALL_RESULT_MASK;
		if (status == HV_STATUS_SUCCESS)
			return 0;

		if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
			pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
			       vp_index, flags, status);
			return status;
		}

		ret = hv_call_deposit_pages(node, partition_id, 1);
		if (ret)
			return ret;
	}
}
+385 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

/*
 * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
 *
 * Authors:
 *  Sunil Muthuswamy <sunilmut@microsoft.com>
 *  Wei Liu <wei.liu@kernel.org>
 */

#include <linux/pci.h>
#include <linux/irq.h>
#include <asm/mshyperv.h>

/*
 * Ask the hypervisor to map an interrupt of device @device_id so that it is
 * delivered as @vector to @cpu, and store the resulting interrupt entry in
 * @entry.
 *
 * @level selects level-triggered (true) or edge-triggered (false) mode.
 *
 * Returns HV_STATUS_SUCCESS (0) on success, otherwise the masked hypercall
 * status. @entry is written only on success; on failure it is left untouched
 * because the output page contents are not meaningful then.
 *
 * NOTE(review): the VP-set failure path returns positive EINVAL rather than
 * a negative errno. It is kept as is because callers only compare the result
 * against HV_STATUS_SUCCESS.
 */
static int hv_map_interrupt(union hv_device_id device_id, bool level,
		int cpu, int vector, struct hv_interrupt_entry *entry)
{
	struct hv_input_map_device_interrupt *input;
	struct hv_output_map_device_interrupt *output;
	struct hv_device_interrupt_descriptor *intr_desc;
	unsigned long flags;
	u64 status;
	int nr_bank, var_size;
	int result;

	local_irq_save(flags);

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	intr_desc = &input->interrupt_descriptor;
	memset(input, 0, sizeof(*input));
	input->partition_id = hv_current_partition_id;
	input->device_id = device_id.as_uint64;
	intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
	intr_desc->vector_count = 1;
	intr_desc->target.vector = vector;

	if (level)
		intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
	else
		intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;

	intr_desc->target.vp_set.valid_bank_mask = 0;
	intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
	nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
	if (nr_bank < 0) {
		local_irq_restore(flags);
		pr_err("%s: unable to generate VP set\n", __func__);
		return EINVAL;
	}
	intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;

	/*
	 * var-sized hypercall, var-size starts after vp_mask (thus
	 * vp_set.format does not count, but vp_set.valid_bank_mask
	 * does).
	 */
	var_size = nr_bank + 1;

	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
			input, output);
	result = status & HV_HYPERCALL_RESULT_MASK;

	/*
	 * Read the output page while interrupts are still disabled, and only
	 * when the hypervisor actually produced an entry.
	 */
	if (result == HV_STATUS_SUCCESS)
		*entry = output->interrupt_entry;

	local_irq_restore(flags);

	if (result != HV_STATUS_SUCCESS)
		pr_err("%s: hypercall failed, status %lld\n", __func__, status);

	return result;
}

/*
 * Ask the hypervisor to unmap the interrupt described by @old_entry for the
 * device identified by @id. Returns the masked hypercall status, which is
 * HV_STATUS_SUCCESS on success.
 */
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
{
	struct hv_input_unmap_device_interrupt *in;
	unsigned long irq_flags;
	u64 hv_status;

	local_irq_save(irq_flags);

	in = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(in, 0, sizeof(*in));
	in->partition_id = hv_current_partition_id;
	in->device_id = id;
	in->interrupt_entry = *old_entry;

	hv_status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, in, NULL);

	local_irq_restore(irq_flags);

	return hv_status & HV_HYPERCALL_RESULT_MASK;
}

#ifdef CONFIG_PCI_MSI
/*
 * State carried through pci_for_each_dma_alias() while working out which
 * requester ID to report for a PCI device.
 */
struct rid_data {
	/* Device recorded by get_rid_cb() when an alias leaves the current bus; NULL if none */
	struct pci_dev *bridge;
	/* Current requester ID; starts as PCI_DEVID() of the device itself */
	u32 rid;
};

/*
 * pci_for_each_dma_alias() callback. When either the visited device or the
 * alias sits on a different bus than the requester ID recorded so far,
 * remember that device and adopt the alias as the new requester ID.
 * Always returns 0.
 */
static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
{
	struct rid_data *state = data;
	u8 cur_bus = PCI_BUS_NUM(state->rid);

	/* Still on the same bus: nothing to record */
	if (pdev->bus->number == cur_bus && PCI_BUS_NUM(alias) == cur_bus)
		return 0;

	state->bridge = pdev;
	state->rid = alias;

	return 0;
}

static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
{
	union hv_device_id dev_id;
	struct rid_data data = {
		.bridge = NULL,
		.rid = PCI_DEVID(dev->bus->number, dev->devfn)
	};

	pci_for_each_dma_alias(dev, get_rid_cb, &data);

	dev_id.as_uint64 = 0;
	dev_id.device_type = HV_DEVICE_TYPE_PCI;
	dev_id.pci.segment = pci_domain_nr(dev->bus);

	dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
	dev_id.pci.bdf.device = PCI_SLOT(data.rid);
	dev_id.pci.bdf.function = PCI_FUNC(data.rid);
	dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;

	if (data.bridge) {
		int pos;

		/*
		 * Microsoft Hypervisor requires a bus range when the bridge is
		 * running in PCI-X mode.
		 *
		 * To distinguish conventional vs PCI-X bridge, we can check
		 * the bridge's PCI-X Secondary Status Register, Secondary Bus
		 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
		 * Specification Revision 1.0 5.2.2.1.3.
		 *
		 * Value zero means it is in conventional mode, otherwise it is
		 * in PCI-X mode.
		 */

		pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
		if (pos) {
			u16 status;

			pci_read_config_word(data.bridge, pos +
					PCI_X_BRIDGE_SSTATUS, &status);

			if (status & PCI_X_SSTATUS_FREQ) {
				/* Non-zero, PCI-X mode */
				u8 sec_bus, sub_bus;

				dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;

				pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
				dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
				pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
				dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
			}
		}
	}

	return dev_id;
}

/*
 * Map an edge-triggered interrupt for PCI device @dev targeting @vector on
 * @cpu. The result is whatever hv_map_interrupt() returns.
 */
static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
				struct hv_interrupt_entry *entry)
{
	return hv_map_interrupt(hv_build_pci_dev_id(dev), false, cpu, vector,
				entry);
}

/* Fill @msg from the MSI address/data pair held in @entry. */
static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
{
	msg->data = entry->msi_entry.data.as_uint32;
	msg->address_lo = entry->msi_entry.address.as_uint32;
	/* High address is always 0 */
	msg->address_hi = 0;
}

static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
/*
 * irq_chip callback that produces the MSI message for @data.
 *
 * The interrupt is (re)mapped through the hypervisor for the first online
 * CPU in the effective affinity mask, and the entry handed back is both
 * cached in data->chip_data and converted into @msg. An existing mapping is
 * unmapped first.
 *
 * On any failure the function returns without writing @msg, and
 * data->chip_data is left NULL if an old mapping had to be torn down.
 */
static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct msi_desc *msidesc;
	struct pci_dev *dev;
	struct hv_interrupt_entry out_entry, *stored_entry;
	struct irq_cfg *cfg = irqd_cfg(data);
	cpumask_t *affinity;
	int cpu;
	u64 status;

	msidesc = irq_data_get_msi_desc(data);
	dev = msi_desc_to_pci_dev(msidesc);

	if (!cfg) {
		pr_debug("%s: cfg is NULL\n", __func__);
		return;
	}

	affinity = irq_data_get_effective_affinity_mask(data);
	cpu = cpumask_first_and(affinity, cpu_online_mask);

	if (data->chip_data) {
		/*
		 * This interrupt is already mapped. Let's unmap first.
		 *
		 * We don't use retarget interrupt hypercalls here because
		 * Microsoft Hypervisor doesn't allow root to change the vector
		 * or specify VPs outside of the set that is initially used
		 * during mapping.
		 */
		stored_entry = data->chip_data;
		data->chip_data = NULL;

		status = hv_unmap_msi_interrupt(dev, stored_entry);

		kfree(stored_entry);

		if (status != HV_STATUS_SUCCESS) {
			pr_debug("%s: failed to unmap, status %lld\n", __func__, status);
			return;
		}
	}

	/* GFP_ATOMIC is used for the cached copy of the interrupt entry */
	stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
	if (!stored_entry) {
		pr_debug("%s: failed to allocate chip data\n", __func__);
		return;
	}

	status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
	if (status != HV_STATUS_SUCCESS) {
		kfree(stored_entry);
		return;
	}

	*stored_entry = out_entry;
	data->chip_data = stored_entry;
	entry_to_msi_msg(&out_entry, msg);
}

static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
{
	return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
}

/*
 * Release the hypervisor mapping cached for Linux interrupt @irq of PCI
 * device @dev.
 *
 * The cached entry in chip_data is copied, freed and cleared before the
 * unmap hypercall is issued, so chip_data is NULL afterwards whether or not
 * the hypercall succeeds. @msidesc is currently unused.
 */
static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
	u64 status;
	struct hv_interrupt_entry old_entry;
	struct irq_desc *desc;
	struct irq_data *data;

	desc = irq_to_desc(irq);
	if (!desc) {
		pr_debug("%s: no irq desc\n", __func__);
		return;
	}

	/* Address of an embedded member: cannot be NULL once desc is valid */
	data = &desc->irq_data;

	if (!data->chip_data) {
		pr_debug("%s: no chip data!\n", __func__);
		return;
	}

	old_entry = *(struct hv_interrupt_entry *)data->chip_data;

	kfree(data->chip_data);
	data->chip_data = NULL;

	status = hv_unmap_msi_interrupt(dev, &old_entry);

	if (status != HV_STATUS_SUCCESS)
		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
}

/*
 * msi_domain_ops.domain_free_irqs handler. For every MSI descriptor of the
 * PCI device behind @dev that has an irq assigned, tear down the hypervisor
 * mapping of each used vector and then free that irq. Warns once and does
 * nothing when @dev is not a PCI device.
 */
static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
	struct pci_dev *pdev;
	struct msi_desc *desc;
	int vec;

	if (WARN_ON_ONCE(!dev_is_pci(dev)))
		return;

	pdev = to_pci_dev(dev);

	for_each_pci_msi_entry(desc, pdev) {
		if (!desc->irq)
			continue;

		for (vec = 0; vec < desc->nvec_used; vec++) {
			hv_teardown_msi_irq_common(pdev, desc, desc->irq + vec);
			irq_domain_free_irqs(desc->irq + vec, 1);
		}
	}
}

/*
 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
 * which implement the MSI or MSI-X Capability Structure.
 *
 * Masking and unmasking use the generic PCI MSI helpers, ack and retrigger
 * are forwarded to the parent chip / hierarchy, and composing the MSI
 * message goes through hv_irq_compose_msi_msg().
 */
static struct irq_chip hv_pci_msi_controller = {
	.name			= "HV-PCI-MSI",
	.irq_unmask		= pci_msi_unmask_irq,
	.irq_mask		= pci_msi_mask_irq,
	.irq_ack		= irq_chip_ack_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
	.irq_compose_msi_msg	= hv_irq_compose_msi_msg,
	.irq_set_affinity	= msi_domain_set_affinity,
	.flags			= IRQCHIP_SKIP_SET_WAKE,
};

/*
 * MSI domain callbacks: freeing goes through hv_msi_domain_free_irqs() so
 * the hypervisor mappings are torn down; preparation uses pci_msi_prepare().
 */
static struct msi_domain_ops pci_msi_domain_ops = {
	.domain_free_irqs	= hv_msi_domain_free_irqs,
	.msi_prepare		= pci_msi_prepare,
};

/*
 * Description of the PCI MSI domain created by hv_create_pci_msi_domain():
 * default domain/chip ops fill in whatever the tables above leave unset,
 * MSI-X is allowed, and interrupts use the edge flow handler.
 */
static struct msi_domain_info hv_pci_msi_domain_info = {
	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
			  MSI_FLAG_PCI_MSIX,
	.ops		= &pci_msi_domain_ops,
	.chip		= &hv_pci_msi_controller,
	.handler	= handle_edge_irq,
	.handler_name	= "edge",
};

/*
 * Create the "HV-PCI-MSI" PCI MSI irq domain on top of x86_vector_domain.
 * Failing to allocate the fwnode or the domain is fatal (BUG).
 */
struct irq_domain * __init hv_create_pci_msi_domain(void)
{
	struct fwnode_handle *fwnode;
	struct irq_domain *domain;

	fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
	if (fwnode)
		domain = pci_msi_create_irq_domain(fwnode, &hv_pci_msi_domain_info,
						   x86_vector_domain);
	else
		domain = NULL;

	/* No point in going further if we can't get an irq domain */
	BUG_ON(!domain);

	return domain;
}

#endif /* CONFIG_PCI_MSI */

int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
{
	union hv_device_id device_id;

	device_id.as_uint64 = 0;
	device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
	device_id.ioapic.ioapic_id = (u8)ioapic_id;

	return hv_unmap_interrupt(device_id.as_uint64, entry);
}
EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);

/*
 * Map an interrupt of the IO-APIC whose ID is @ioapic_id (truncated to
 * 8 bits) to @vector on @cpu, level- or edge-triggered according to @level,
 * and store the resulting entry in @entry. Returns what hv_map_interrupt()
 * returns.
 */
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
		struct hv_interrupt_entry *entry)
{
	union hv_device_id id;

	id.as_uint64 = 0;
	id.device_type = HV_DEVICE_TYPE_IOAPIC;
	id.ioapic.ioapic_id = (u8)ioapic_id;

	return hv_map_interrupt(id, level, cpu, vector, entry);
}
EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
Loading