Unverified Commit 30db8170 authored by openeuler-ci-bot, committed by Gitee

!2762 Intel: backport SPR/EMR VT-d/PCIe upstream bug fixes for 5.10

Merge Pull Request from: @etzhao 
 
Hi,
This backport patch set includes critical Intel VT-d/PCIe bug fixes for 5.10.

  Kernel issue:
  https://gitee.com/openeuler/intel-kernel/issues/I8C8B4

Test:
a. Kdump function test passed:
   crashkernel=1G-4G:192M,4G-64G:256M,64G-:512M intel_iommu=on, sm

   # echo 1 > /proc/sys/kernel/sysrq
   # echo c > /proc/sysrq-trigger

b. DSA test passed.

Kernel config:
NA.

Please help to review and merge.
Thanks,
Ethan

Link: https://gitee.com/openeuler/kernel/pulls/2762

Reviewed-by: Jason Zeng <jason.zeng@intel.com>
Reviewed-by: Aichun Shi <aichun.shi@intel.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
parents 3ca7635e 2c8409f6
+22 −8
@@ -49,6 +49,18 @@ struct iommu_dma_cookie {
	struct iommu_domain		*fq_domain;
};

+static void iommu_dma_entry_dtor(unsigned long data)
+{
+	struct page *freelist = (struct page *)data;
+
+	while (freelist) {
+		unsigned long p = (unsigned long)page_address(freelist);
+
+		freelist = freelist->freelist;
+		free_page(p);
+	}
+}

static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
{
	if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
@@ -345,7 +357,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
	if (!cookie->fq_domain && !iommu_domain_get_attr(domain,
			DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) {
		if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
-					NULL))
+					  iommu_dma_entry_dtor))
			pr_warn("iova flush queue initialization failed\n");
		else
			cookie->fq_domain = domain;
@@ -442,7 +454,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
}

static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
-		dma_addr_t iova, size_t size)
+		dma_addr_t iova, size_t size, struct page *freelist)
{
	struct iova_domain *iovad = &cookie->iovad;

@@ -451,7 +463,8 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
		cookie->msi_iova -= size;
	else if (cookie->fq_domain)	/* non-strict mode */
		queue_iova(iovad, iova_pfn(iovad, iova),
-				size >> iova_shift(iovad), 0);
+				size >> iova_shift(iovad),
+				(unsigned long)freelist);
	else
		free_iova_fast(iovad, iova_pfn(iovad, iova),
				size >> iova_shift(iovad));
@@ -470,13 +483,14 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
	dma_addr -= iova_off;
	size = iova_align(iovad, size + iova_off);
	iommu_iotlb_gather_init(&iotlb_gather);
+	iotlb_gather.queued = cookie->fq_domain;

	unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather);
	WARN_ON(unmapped != size);

	if (!cookie->fq_domain)
		iommu_iotlb_sync(domain, &iotlb_gather);
-	iommu_dma_free_iova(cookie, dma_addr, size);
+	iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist);
}

static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
@@ -498,7 +512,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
		return DMA_MAPPING_ERROR;

	if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) {
-		iommu_dma_free_iova(cookie, iova, size);
+		iommu_dma_free_iova(cookie, iova, size, NULL);
		return DMA_MAPPING_ERROR;
	}
	return iova + iova_off;
@@ -651,7 +665,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
out_free_sg:
	sg_free_table(&sgt);
out_free_iova:
-	iommu_dma_free_iova(cookie, iova, size);
+	iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_pages:
	__iommu_dma_free_pages(pages, count);
	return NULL;
@@ -902,7 +916,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
	return __finalise_sg(dev, sg, nents, iova);

out_free_iova:
-	iommu_dma_free_iova(cookie, iova, iova_len);
+	iommu_dma_free_iova(cookie, iova, iova_len, NULL);
out_restore_sg:
	__invalidate_sg(sg, nents);
	return 0;
@@ -1230,7 +1244,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
	return msi_page;

out_free_iova:
-	iommu_dma_free_iova(cookie, iova, size);
+	iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_page:
	kfree(msi_page);
	return NULL;
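For context on the dma-iommu.c hunks above: the deferred-free path threads page-table pages through `struct page`'s `freelist` pointer and only frees them once the IOVA flush queue drains them through `iommu_dma_entry_dtor()`. Below is a minimal user-space sketch of that list walk, using a mock `struct page` and plain `malloc`/`free` in place of the kernel's page allocator; it is illustrative only, not the kernel implementation:

```c
#include <stdlib.h>

/* Mock of the kernel pattern: pages chained through their own
 * 'freelist' pointer, freed only after the (simulated) IOTLB flush. */
struct page {
	struct page *freelist;	/* next page to free */
	char payload[64];	/* stands in for the page contents */
};

static struct page *chain_page(struct page *head)
{
	struct page *p = malloc(sizeof(*p));

	p->freelist = head;	/* push onto the freelist */
	return p;
}

static void entry_dtor(struct page *freelist)
{
	while (freelist) {
		struct page *next = freelist->freelist;

		free(freelist);	/* the kernel uses free_page() here */
		freelist = next;
	}
}

int main(void)
{
	struct page *list = NULL;

	for (int i = 0; i < 3; i++)
		list = chain_page(list);
	/* ... the IOTLB flush would happen here ... */
	entry_dtor(list);
	return 0;
}
```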
+150 −76
@@ -208,38 +208,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re)
	return re->hi & VTD_PAGE_MASK;
}

-static inline void context_clear_pasid_enable(struct context_entry *context)
-{
-	context->lo &= ~(1ULL << 11);
-}
-
-static inline bool context_pasid_enabled(struct context_entry *context)
-{
-	return !!(context->lo & (1ULL << 11));
-}
-
-static inline void context_set_copied(struct context_entry *context)
-{
-	context->hi |= (1ull << 3);
-}
-
-static inline bool context_copied(struct context_entry *context)
-{
-	return !!(context->hi & (1ULL << 3));
-}
-
-static inline bool __context_present(struct context_entry *context)
-{
-	return (context->lo & 1);
-}
-
-bool context_present(struct context_entry *context)
-{
-	return context_pasid_enabled(context) ?
-	     __context_present(context) :
-	     __context_present(context) && !context_copied(context);
-}
-
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
@@ -287,6 +255,26 @@ static inline void context_clear_entry(struct context_entry *context)
	context->hi = 0;
}

+static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	if (!iommu->copied_tables)
+		return false;
+
+	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
@@ -793,6 +781,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
	struct context_entry *context;
	u64 *entry;

+	/*
+	 * Unless the caller requested to allocate a new entry,
+	 * returning a copied context entry makes no sense.
+	 */
+	if (!alloc && context_copied(iommu, bus, devfn))
+		return NULL;
+
	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
@@ -1291,17 +1286,17 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
-				 unsigned long last_pfn)
+				 unsigned long last_pfn,
+				 struct page *freelist)
{
-	struct page *freelist;
-
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
-				       domain->pgd, 0, start_pfn, last_pfn, NULL);
+				       domain->pgd, 0, start_pfn, last_pfn,
+				       freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
@@ -1540,6 +1535,24 @@ static void domain_update_iotlb(struct dmar_domain *domain)
	domain->has_iotlb_device = has_iotlb_device;
}

+/*
+ * The extra devTLB flush quirk impacts those QAT devices with PCI device
+ * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
+ * check because it applies only to the built-in QAT devices and it doesn't
+ * grant additional privileges.
+ */
+#define BUGGY_QAT_DEVID_MASK 0x4940
+static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
+{
+	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
+		return false;
+
+	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
+		return false;
+
+	return true;
+}
+
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;
@@ -1627,6 +1640,7 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
+	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
@@ -1915,6 +1929,10 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
	}

	g_iommus[iommu->seq_id] = NULL;
+	if (iommu->copied_tables) {
+		bitmap_free(iommu->copied_tables);
+		iommu->copied_tables = NULL;
+	}

	/* free context mapping */
	free_context_table(iommu);
@@ -2103,7 +2121,8 @@ static void domain_exit(struct dmar_domain *domain)
	if (domain->pgd) {
		struct page *freelist;

-		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+		freelist = domain_unmap(domain, 0,
+					DOMAIN_MAX_PFN(domain->gaw), NULL);
		dma_free_pagelist(freelist);
	}

@@ -2190,7 +2209,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
		goto out_unlock;

	ret = 0;
-	if (context_present(context))
+	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
@@ -2202,7 +2221,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
-	if (context_copied(context)) {
+	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
@@ -2213,6 +2232,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
+
+		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);
@@ -2761,9 +2782,11 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,

		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
-		    dmar_find_matched_atsr_unit(pdev))
+		    dmar_find_matched_atsr_unit(pdev)) {
			info->ats_supported = 1;

+			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
+		}
		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
@@ -3153,32 +3176,14 @@ static int copy_context_table(struct intel_iommu *iommu,
		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

-		if (!__context_present(&ce))
+		if (!context_present(&ce))
			continue;

		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

-		/*
-		 * We need a marker for copied context entries. This
-		 * marker needs to work for the old format as well as
-		 * for extended context entries.
-		 *
-		 * Bit 67 of the context entry is used. In the old
-		 * format this bit is available to software, in the
-		 * extended format it is the PGE bit, but PGE is ignored
-		 * by HW if PASIDs are disabled (and thus still
-		 * available).
-		 *
-		 * So disable PASIDs first and then mark the entry
-		 * copied. This means that we don't copy PASID
-		 * translations from the old kernel, but this is fine as
-		 * faults there are not fatal.
-		 */
-		context_clear_pasid_enable(&ce);
-		context_set_copied(&ce);

+		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

@@ -3205,8 +3210,8 @@ static int copy_translation_tables(struct intel_iommu *iommu)
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
-	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
-	new_ext    = !!ecap_ecs(iommu->ecap);
+	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
+	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
@@ -3217,6 +3222,10 @@ static int copy_translation_tables(struct intel_iommu *iommu)
	if (new_ext != ext)
		return -EINVAL;

+	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
+	if (!iommu->copied_tables)
+		return -ENOMEM;
+
	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;
@@ -3700,7 +3709,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
	if (dev_is_pci(dev))
		pdev = to_pci_dev(dev);

-	freelist = domain_unmap(domain, start_pfn, last_pfn);
+	freelist = domain_unmap(domain, start_pfn, last_pfn, NULL);
	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
			!has_iova_flush_queue(&domain->iovad)) {
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
@@ -4778,7 +4787,8 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb,
			struct page *freelist;

			freelist = domain_unmap(si_domain,
-						start_vpfn, last_vpfn);
+						start_vpfn, last_vpfn,
+						NULL);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
@@ -5858,10 +5868,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
-	unsigned int npages;
-	int iommu_id, level = 0;
+	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
@@ -5873,22 +5881,42 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

-	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
-
-	npages = last_pfn - start_pfn + 1;
-
-	for_each_domain_iommu(iommu_id, dmar_domain)
-		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
-				      start_pfn, npages, !freelist, 0);
-
-	dma_free_pagelist(freelist);
+	gather->freelist = domain_unmap(dmar_domain, start_pfn,
+					last_pfn, gather->freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

+	/*
+	 * We do not use page-selective IOTLB invalidation in flush queue,
+	 * so there is no need to track page and sync iotlb.
+	 */
+	if (!(gather && gather->queued))
+		iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}

+static void intel_iommu_tlb_sync(struct iommu_domain *domain,
+				 struct iommu_iotlb_gather *gather)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	unsigned long iova_pfn = IOVA_PFN(gather->start);
+	size_t size = gather->end - gather->start;
+	unsigned long start_pfn;
+	unsigned long nrpages;
+	int iommu_id;
+
+	nrpages = aligned_nrpages(gather->start, size);
+	start_pfn = mm_to_dma_pfn(iova_pfn);
+
+	for_each_domain_iommu(iommu_id, dmar_domain)
+		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+				      start_pfn, nrpages, !gather->freelist, 0);
+
+	dma_free_pagelist(gather->freelist);
+}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
@@ -6359,6 +6387,7 @@ const struct iommu_ops intel_iommu_ops = {
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
+	.iotlb_sync		= intel_iommu_tlb_sync,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
@@ -6573,3 +6602,48 @@ static void __init check_tylersburg_isoch(void)
	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}

+/*
+ * Here we deal with a device TLB defect where a device may inadvertently
+ * issue an ATS invalidation completion before posted writes initiated with
+ * a translated address that utilized translations matching the invalidation
+ * address range, violating the invalidation completion ordering.
+ * Therefore, any use cases that cannot guarantee DMA is stopped before
+ * unmap are vulnerable to this defect. In other words, any dTLB invalidation
+ * initiated not under the control of the trusted/privileged host device
+ * driver must use this quirk.
+ * Device TLBs are invalidated under the following six conditions:
+ * 1. Device driver does a DMA API unmap of an IOVA
+ * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
+ * 3. PASID is torn down, after the PASID cache is flushed. e.g. process
+ *    exit_mmap() due to crash
+ * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
+ *    the VM has to free pages that were unmapped
+ * 5. Userspace driver unmaps a DMA buffer
+ * 6. Cache invalidation in vSVA usage (upcoming)
+ *
+ * For #1 and #2, device drivers are responsible for stopping DMA traffic
+ * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
+ * invalidate the TLB the same way as a normal user unmap, which will use
+ * this quirk. The dTLB invalidation after the PASID cache flush does not
+ * need this quirk.
+ *
+ * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
+ */
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+			       unsigned long address, unsigned long mask,
+			       u32 pasid, u16 qdep)
+{
+	u16 sid;
+
+	if (likely(!info->dtlb_extra_inval))
+		return;
+
+	sid = PCI_DEVID(info->bus, info->devfn);
+	if (pasid == PASID_RID2PASID) {
+		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+				   qdep, address, mask);
+	} else {
+		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
+					 pasid, qdep, address, mask);
+	}
+}
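As an aside on `dev_needs_extra_dtlb_flush()` above: masking off the low two bits of the PCI device ID with `0xfffc` collapses the four buggy QAT IDs 0x4940-0x4943 onto `BUGGY_QAT_DEVID_MASK`. A small stand-alone sketch of that mask arithmetic (illustrative only, outside the kernel):

```c
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define BUGGY_QAT_DEVID_MASK 0x4940

/* Mirrors the device-ID check in dev_needs_extra_dtlb_flush():
 * clearing bits 1:0 matches exactly the IDs 0x4940..0x4943. */
static bool is_buggy_qat_devid(uint16_t device)
{
	return (device & 0xfffc) == BUGGY_QAT_DEVID_MASK;
}

int main(void)
{
	for (uint16_t id = 0x493e; id <= 0x4945; id++)
		printf("0x%04x -> %s\n", id,
		       is_buggy_qat_devid(id) ? "quirk" : "no quirk");
	return 0;
}
```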
+4 −1
@@ -149,10 +149,13 @@ static void __flush_svm_range_dev(struct intel_svm *svm,
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
-	if (info->ats_enabled)
+	if (info->ats_enabled) {
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
+		quirk_extra_dev_tlb_flush(info, address, order_base_2(pages),
+					  svm->pasid, sdev->qdep);
+	}
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
+7 −0
@@ -565,6 +565,13 @@ void queue_iova(struct iova_domain *iovad,
	unsigned long flags;
	unsigned idx;

+	/*
+	 * Order against the IOMMU driver's pagetable update from unmapping
+	 * @pte, to guarantee that iova_domain_flush() observes that if called
+	 * from a different CPU before we release the lock below.
+	 */
+	smp_wmb();

	spin_lock_irqsave(&fq->lock, flags);

	/*
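As a rough user-space analogue of the ordering the `smp_wmb()` above enforces, here is a hypothetical sketch using C11 fences instead of the kernel primitives (all names are illustrative): the page-table update must be visible before the entry is published to the flush queue, so a consumer that sees the entry also sees the update.

```c
#include <stdatomic.h>

/* Hypothetical analogue of the ordering queue_iova() needs: the
 * pagetable update must be visible before the fq entry is published. */
static int pagetable_cleared;
static atomic_int entry_published;

void producer(void)
{
	pagetable_cleared = 1;				/* the PTE update */
	atomic_thread_fence(memory_order_release);	/* plays the smp_wmb() role */
	atomic_store_explicit(&entry_published, 1, memory_order_relaxed);
}

int consumer(void)
{
	if (atomic_load_explicit(&entry_published, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);
		return pagetable_cleared;	/* guaranteed 1 if entry seen */
	}
	return -1;	/* nothing published yet */
}

int main(void)
{
	producer();
	return consumer() == 1 ? 0 : 1;
}
```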
+7 −0
@@ -96,6 +96,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
	[PCI_EXT_CAP_ID_SECPCI]	=	0,	/* not yet */
	[PCI_EXT_CAP_ID_PMUX]	=	0,	/* not yet */
	[PCI_EXT_CAP_ID_PASID]	=	0,	/* not yet */
+	[PCI_EXT_CAP_ID_DVSEC]	=	0xFF,
};

/*
@@ -1065,6 +1066,7 @@ int __init vfio_pci_init_perm_bits(void)
	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
+	ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write;

	if (ret)
		vfio_pci_uninit_perm_bits();
@@ -1404,6 +1406,11 @@ static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos)
			return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2;
		}
		return PCI_TPH_BASE_SIZEOF;
+	case PCI_EXT_CAP_ID_DVSEC:
+		ret = pci_read_config_dword(pdev, epos + PCI_DVSEC_HEADER1, &dword);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+		return PCI_DVSEC_HEADER1_LEN(dword);
	default:
		pci_warn(pdev, "%s: unknown length for PCI ecap %#x@%#x\n",
			 __func__, ecap, epos);
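For reference on the DVSEC length computation above: DVSEC Header 1 sits at offset 0x4 in the capability and, per the PCIe spec, carries the vendor ID in bits 15:0, the revision in bits 19:16, and the capability length in bits 31:20, which is what `PCI_DVSEC_HEADER1_LEN()` extracts. A minimal sketch of that field decode (the header value is illustrative):

```c
#include <stdio.h>
#include <stdint.h>

/* DVSEC Header 1 layout (PCIe spec): vendor ID [15:0],
 * revision [19:16], capability length in bytes [31:20]. */
#define DVSEC_HEADER1_LEN(x)	(((x) >> 20) & 0xfff)

int main(void)
{
	/* Illustrative header: length 0x38, revision 1, vendor 0x8086. */
	uint32_t hdr1 = (0x038u << 20) | (0x1u << 16) | 0x8086u;

	printf("vendor 0x%04x, rev %u, length %u bytes\n",
	       (unsigned)(hdr1 & 0xffff), (unsigned)((hdr1 >> 16) & 0xf),
	       (unsigned)DVSEC_HEADER1_LEN(hdr1));
	return 0;
}
```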