Unverified Commit 0d1041c8 authored by openeuler-ci-bot, committed by Gitee

!4159 Backport iommufd dirty tracking from v6.7

Merge Pull Request from: @x56Jason 
 
## Description
This is to backport iommufd dirty tracking from upstream v6.7.
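
For context, the series exposes dirty tracking to userspace through the iommufd uAPI in `uapi/linux/iommufd.h` (`IOMMU_HWPT_ALLOC` with `IOMMU_HWPT_ALLOC_DIRTY_TRACKING`, plus `IOMMU_HWPT_SET_DIRTY_TRACKING` and `IOMMU_HWPT_GET_DIRTY_BITMAP`). Below is a minimal sketch of the expected usage, assuming a device already bound to an iommufd context (`dev_id`) and an IOAS with mappings (`ioas_id`); error handling and the VFIO bind/attach steps are omitted, and the helper name is our own:

```c
/*
 * Sketch only: exercises the dirty-tracking uAPI added by this series.
 * Assumes iommufd is an open /dev/iommu fd, dev_id/ioas_id come from a
 * prior VFIO_DEVICE_BIND_IOMMUFD / IOMMU_IOAS_ALLOC setup, and bitmap
 * holds one bit per 4K page of the queried IOVA range.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int track_dirty_pages(int iommufd, uint32_t dev_id, uint32_t ioas_id,
			     uint64_t iova, uint64_t length, uint64_t *bitmap)
{
	/* Allocate a hw pagetable capable of dirty tracking. */
	struct iommu_hwpt_alloc alloc = {
		.size = sizeof(alloc),
		.flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
		.dev_id = dev_id,
		.pt_id = ioas_id,
	};
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc))
		return -1;

	/* Enable tracking: on AMD this sets DTE[HAD] and flushes the TLB. */
	struct iommu_hwpt_set_dirty_tracking set = {
		.size = sizeof(set),
		.flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
		.hwpt_id = alloc.out_hwpt_id,
	};
	if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
		return -1;

	/* Read and clear the IOPTE dirty bits into the user bitmap. */
	struct iommu_hwpt_get_dirty_bitmap get = {
		.size = sizeof(get),
		.hwpt_id = alloc.out_hwpt_id,
		.iova = iova,
		.length = length,
		.page_size = 4096,
		.data = (uintptr_t)bitmap,
	};
	return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get);
}
```

This is roughly what the iommufd selftests and a live-migration-capable VMM would do; per the AMD checks added in the diff below, allocation with dirty tracking fails with -EOPNOTSUPP unless the IOMMU reports FEATURE_HDSUP.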

## Issue
#I8Y6AM

## Test
- Kernel SelfTest
    - Run the IOMMU-related kernel selftests
        - all PASS
- Regression Test
    - Use QEMU to pass a NIC through to a guest VM, which exercises the traditional VFIO/IOMMU code path
        - The guest VM runs successfully and the NIC works well

## Known Issue
N/A

## Default config change
CONFIG_IOMMUFD_DRIVER will be selected automatically when Intel/AMD IOMMU support and IOMMUFD are enabled.
 
Link: https://gitee.com/openeuler/kernel/pulls/4159

 

Reviewed-by: Aichun Shi <aichun.shi@intel.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents 8bf376b0 46deabca
+4 −0
@@ -7,6 +7,10 @@ config IOMMU_IOVA
 config IOMMU_API
 	bool
 
+config IOMMUFD_DRIVER
+	bool
+	default n
+
 menuconfig IOMMU_SUPPORT
 	bool "IOMMU Hardware Support"
 	depends on MMU
+1 −0
@@ -10,6 +10,7 @@ config AMD_IOMMU
 	select IOMMU_API
 	select IOMMU_IOVA
 	select IOMMU_IO_PGTABLE
+	select IOMMUFD_DRIVER if IOMMUFD
 	depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
+12 −0
@@ -97,7 +97,9 @@
 #define FEATURE_GATS_MASK	(3ULL)
 #define FEATURE_GAM_VAPIC	BIT_ULL(21)
 #define FEATURE_GIOSUP		BIT_ULL(48)
+#define FEATURE_HASUP		BIT_ULL(49)
 #define FEATURE_EPHSUP		BIT_ULL(50)
+#define FEATURE_HDSUP		BIT_ULL(52)
 #define FEATURE_SNP		BIT_ULL(63)
 
 #define FEATURE_PASID_SHIFT	32
@@ -212,6 +214,7 @@
 /* macros and definitions for device table entries */
 #define DEV_ENTRY_VALID         0x00
 #define DEV_ENTRY_TRANSLATION   0x01
+#define DEV_ENTRY_HAD           0x07
 #define DEV_ENTRY_PPR           0x34
 #define DEV_ENTRY_IR            0x3d
 #define DEV_ENTRY_IW            0x3e
@@ -370,10 +373,16 @@
 #define PTE_LEVEL_PAGE_SIZE(level)			\
 	(1ULL << (12 + (9 * (level))))
 
+/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
 /*
  * Bit value definition for I/O PTE fields
  */
 #define IOMMU_PTE_PR	BIT_ULL(0)
+#define IOMMU_PTE_HD	BIT_ULL(IOMMU_PTE_HD_BIT)
 #define IOMMU_PTE_U	BIT_ULL(59)
 #define IOMMU_PTE_FC	BIT_ULL(60)
 #define IOMMU_PTE_IR	BIT_ULL(61)
@@ -384,6 +393,7 @@
  */
 #define DTE_FLAG_V	BIT_ULL(0)
 #define DTE_FLAG_TV	BIT_ULL(1)
+#define DTE_FLAG_HAD	(3ULL << 7)
 #define DTE_FLAG_GIOV	BIT_ULL(54)
 #define DTE_FLAG_GV	BIT_ULL(55)
 #define DTE_GLX_SHIFT	(56)
@@ -413,6 +423,7 @@

 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
 #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
@@ -563,6 +574,7 @@ struct protection_domain {
 	int nid;		/* Node ID */
 	u64 *gcr3_tbl;		/* Guest CR3 table */
 	unsigned long flags;	/* flags to find out type of domain */
+	bool dirty_tracking;	/* dirty tracking is enabled in the domain */
 	unsigned dev_cnt;	/* devices assigned to this domain */
 	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
 };
+68 −0
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
 	return (__pte & ~offset_mask) | (iova & offset_mask);
 }
 
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+				     unsigned long flags)
+{
+	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
+	bool dirty = false;
+	int i, count;
+
+	/*
+	 * 2.2.3.2 Host Dirty Support
+	 * When a non-default page size is used , software must OR the
+	 * Dirty bits in all of the replicated host PTEs used to map
+	 * the page. The IOMMU does not guarantee the Dirty bits are
+	 * set in all of the replicated PTEs. Any portion of the page
+	 * may have been written even if the Dirty bit is set in only
+	 * one of the replicated PTEs.
+	 */
+	count = PAGE_SIZE_PTE_COUNT(size);
+	for (i = 0; i < count && test_only; i++) {
+		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
+			dirty = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < count && !test_only; i++) {
+		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
+				       (unsigned long *)&ptep[i])) {
+			dirty = true;
+		}
+	}
+
+	return dirty;
+}
+
+static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
+					 unsigned long iova, size_t size,
+					 unsigned long flags,
+					 struct iommu_dirty_bitmap *dirty)
+{
+	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+	unsigned long end = iova + size - 1;
+
+	do {
+		unsigned long pgsize = 0;
+		u64 *ptep, pte;
+
+		ptep = fetch_pte(pgtable, iova, &pgsize);
+		if (ptep)
+			pte = READ_ONCE(*ptep);
+		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
+			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
+			iova += pgsize;
+			continue;
+		}
+
+		/*
+		 * Mark the whole IOVA range as dirty even if only one of
+		 * the replicated PTEs were marked dirty.
+		 */
+		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
+			iommu_dirty_bitmap_record(dirty, iova, pgsize);
+		iova += pgsize;
+	} while (iova < end);
+
+	return 0;
+}
+
 /*
  * ----------------------------------------------------
  */
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
 	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
 	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
 	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
+	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
 
 	return &pgtable->iop;
 }
+141 −3
@@ -37,6 +37,7 @@
 #include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/dma.h>
+#include <uapi/linux/iommufd.h>
 
 #include "amd_iommu.h"
 #include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
 LIST_HEAD(acpihid_map);
 
 const struct iommu_ops amd_iommu_ops;
+const struct iommu_dirty_ops amd_dirty_ops;
 
 static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
 int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
 			pte_root |= 1ULL << DEV_ENTRY_PPR;
 	}
 
+	if (domain->dirty_tracking)
+		pte_root |= DTE_FLAG_HAD;
+
 	if (domain->flags & PD_IOMMUV2_MASK) {
 		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
 		u64 glx  = domain->glx;
@@ -2155,28 +2160,76 @@ static inline u64 dma_max_address(void)
 	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
 }
 
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
 {
+	return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+						  struct device *dev, u32 flags)
+{
+	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
 	struct protection_domain *domain;
+	struct amd_iommu *iommu = NULL;
+
+	if (dev) {
+		iommu = rlookup_amd_iommu(dev);
+		if (!iommu)
+			return ERR_PTR(-ENODEV);
+	}
 
 	/*
 	 * Since DTE[Mode]=0 is prohibited on SNP-enabled system,
 	 * default to use IOMMU_DOMAIN_DMA[_FQ].
 	 */
 	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
-		return NULL;
+		return ERR_PTR(-EINVAL);
+
+	if (dirty_tracking && !amd_iommu_hd_support(iommu))
+		return ERR_PTR(-EOPNOTSUPP);
 
 	domain = protection_domain_alloc(type);
 	if (!domain)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	domain->domain.geometry.aperture_start = 0;
 	domain->domain.geometry.aperture_end   = dma_max_address();
 	domain->domain.geometry.force_aperture = true;
 
+	if (iommu) {
+		domain->domain.type = type;
+		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
+		domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+
+		if (dirty_tracking)
+			domain->domain.dirty_ops = &amd_dirty_ops;
+	}
+
 	return &domain->domain;
 }
 
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+{
+	struct iommu_domain *domain;
+
+	domain = do_iommu_domain_alloc(type, NULL, 0);
+	if (IS_ERR(domain))
+		return NULL;
+
+	return domain;
+}
+
+static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev,
+							u32 flags)
+{
+	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+
+	if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	return do_iommu_domain_alloc(type, dev, flags);
+}
+
 static void amd_iommu_domain_free(struct iommu_domain *dom)
 {
 	struct protection_domain *domain;
@@ -2214,6 +2267,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,

 	dev_data->defer_attach = false;
 
+	/*
+	 * Restrict to devices with compatible IOMMU hardware support
+	 * when enforcement of dirty tracking is enabled.
+	 */
+	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
+		return -EINVAL;
+
 	if (dev_data->domain)
 		detach_device(dev);
 
@@ -2332,6 +2392,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
 		return true;
 	case IOMMU_CAP_DEFERRED_FLUSH:
 		return true;
+	case IOMMU_CAP_DIRTY_TRACKING: {
+		struct amd_iommu *iommu = rlookup_amd_iommu(dev);
+
+		return amd_iommu_hd_support(iommu);
+	}
 	default:
 		break;
 	}
@@ -2339,6 +2404,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+					bool enable)
+{
+	struct protection_domain *pdomain = to_pdomain(domain);
+	struct dev_table_entry *dev_table;
+	struct iommu_dev_data *dev_data;
+	bool domain_flush = false;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+	u64 pte_root;
+
+	spin_lock_irqsave(&pdomain->lock, flags);
+	if (!(pdomain->dirty_tracking ^ enable)) {
+		spin_unlock_irqrestore(&pdomain->lock, flags);
+		return 0;
+	}
+
+	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+		iommu = rlookup_amd_iommu(dev_data->dev);
+		if (!iommu)
+			continue;
+
+		dev_table = get_dev_table(iommu);
+		pte_root = dev_table[dev_data->devid].data[0];
+
+		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
+				     pte_root & ~DTE_FLAG_HAD);
+
+		/* Flush device DTE */
+		dev_table[dev_data->devid].data[0] = pte_root;
+		device_flush_dte(dev_data);
+		domain_flush = true;
+	}
+
+	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+	if (domain_flush) {
+		amd_iommu_domain_flush_tlb_pde(pdomain);
+		amd_iommu_domain_flush_complete(pdomain);
+	}
+	pdomain->dirty_tracking = enable;
+	spin_unlock_irqrestore(&pdomain->lock, flags);
+
+	return 0;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+					  unsigned long iova, size_t size,
+					  unsigned long flags,
+					  struct iommu_dirty_bitmap *dirty)
+{
+	struct protection_domain *pdomain = to_pdomain(domain);
+	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+	unsigned long lflags;
+
+	if (!ops || !ops->read_and_clear_dirty)
+		return -EOPNOTSUPP;
+
+	spin_lock_irqsave(&pdomain->lock, lflags);
+	if (!pdomain->dirty_tracking && dirty->bitmap) {
+		spin_unlock_irqrestore(&pdomain->lock, lflags);
+		return -EINVAL;
+	}
+	spin_unlock_irqrestore(&pdomain->lock, lflags);
+
+	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
 static void amd_iommu_get_resv_regions(struct device *dev,
 				       struct list_head *head)
 {
@@ -2461,9 +2593,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 	return true;
 }
 
+const struct iommu_dirty_ops amd_dirty_ops = {
+	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
+	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
+};
+
 const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
 	.domain_alloc = amd_iommu_domain_alloc,
+	.domain_alloc_user = amd_iommu_domain_alloc_user,
 	.probe_device = amd_iommu_probe_device,
 	.release_device = amd_iommu_release_device,
 	.probe_finalize = amd_iommu_probe_finalize,