Unverified Commit bb74bc36 authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!5480 [OLK-6.6] Fix DMA RW sequence disorder issue

Merge Pull Request from: @leoliu-oc 
 
The Zhaoxin KH-40000 platform cannot preserve PCIe transaction ordering for DMA writes issued by the same device when their target addresses are located on different NUMA nodes. Work around this issue by flushing the target DMA write with a subsequent PCIe configuration space read operation.

### Issue
https://gitee.com/openeuler/kernel/issues/I9ARTM

### Test
NA

### Known Issue
NA

### Default config change
NA 
 
Link:https://gitee.com/openeuler/kernel/pulls/5480

 

Reviewed-by: Jason Zeng <jason.zeng@intel.com>
Signed-off-by: Zhang Peng <zhangpeng362@huawei.com>
parents ea2dca63 ef20808d
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -9,4 +9,12 @@ static inline const struct dma_map_ops *get_arch_dma_ops(void)
	return dma_ops;
}

#ifdef CONFIG_PCI

/*
 * Zhaoxin KH-40000 DMA workaround interface.
 *
 * is_zhaoxin_kh40000      - platform flag consulted before installing the
 *                           IOMMU wrappers.
 * kh40000_dma_direct_ops  - dma_map_ops table used for direct-mapped DMA.
 * kh40000_set_iommu_dma_ops() - wrap the DMA ops currently attached to @dev.
 */
extern bool is_zhaoxin_kh40000;
extern const struct dma_map_ops kh40000_dma_direct_ops;
void kh40000_set_iommu_dma_ops(struct device *dev);

#endif /* CONFIG_PCI */

#endif
+1 −0
Original line number Diff line number Diff line
@@ -160,4 +160,5 @@ ifeq ($(CONFIG_X86_64),y)

	obj-$(CONFIG_MMCONF_FAM10H)	+= mmconf-fam10h_64.o
	obj-y				+= vsmp_64.o
	obj-$(CONFIG_PCI)		+= zhaoxin_kh40000.o
endif
+17 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <asm/gart.h>
#include <asm/irq_remapping.h>
#include <asm/early_ioremap.h>
#include <asm/dma-mapping.h>

static void __init early_pci_clear_msi(int bus, int slot, int func)
{
@@ -716,6 +717,18 @@ static void __init apple_airport_reset(int bus, int slot, int func)
	early_iounmap(mmio, BCM4331_MMIO_SIZE);
}

static void quirk_zhaoxin_dma_patch(int num, int slot, int func)
{
	u8 revision;

	revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
	if (revision == 0x10) {
		is_zhaoxin_kh40000 = true;
		dma_ops = &kh40000_dma_direct_ops;
		pr_info("zhaoxin direct dma patch enabled\n");
	}
}

#define QFLAG_APPLY_ONCE 	0x1
#define QFLAG_APPLIED		0x2
#define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -759,6 +772,10 @@ static struct chipset early_qrk[] __initdata = {
		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
	{ PCI_VENDOR_ID_BROADCOM, 0x4331,
	  PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
	{ PCI_VENDOR_ID_ZHAOXIN, 0x1001, PCI_CLASS_BRIDGE_HOST,
	  PCI_BASE_CLASS_BRIDGE, QFLAG_APPLY_ONCE, quirk_zhaoxin_dma_patch },
	{ PCI_VENDOR_ID_ZHAOXIN, 0x345B, PCI_CLASS_BRIDGE_HOST,
	  PCI_BASE_CLASS_BRIDGE, QFLAG_APPLY_ONCE, quirk_zhaoxin_dma_patch },
	{ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, early_pci_clear_msi},
	{}
};
+353 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
#include <linux/device.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/kstrtox.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

#include <asm/dma-mapping.h>

#include "../../../kernel/dma/direct.h"

/* Platform flag; read by callers that decide whether to install the wrappers. */
bool is_zhaoxin_kh40000;

/*
 * Usage:
 *  pass "zhaoxin_patch_bitmask=<value>" on the kernel command line.
 * Value description:
 *  bit 0: enable (1) or disable (0) the NUMA node check. Default: 1.
 */
enum {
	ZHAOXIN_P2CW_NODE_CHECK = BIT(0),
	ZHAOXIN_PATCH_CODE_MAX  = ZHAOXIN_P2CW_NODE_CHECK,
};

/* Value used when no option is given or the given value is rejected. */
#define ZHAOXIN_PATCH_CODE_DEFAULT	ZHAOXIN_P2CW_NODE_CHECK

/* Active workaround bitmask (see the bit description above). */
unsigned long zhaoxin_patch_code = ZHAOXIN_PATCH_CODE_DEFAULT;

/*
 * Parse the "zhaoxin_patch_bitmask=<value>" kernel command-line option.
 *
 * @str: text following the '=' sign.
 *
 * A value that cannot be parsed, or that is larger than
 * ZHAOXIN_PATCH_CODE_MAX, is reported and replaced by the default bitmask.
 *
 * Returns 1 in every case: the option belongs to this handler, and a
 * __setup() handler signals "handled" with a non-zero return value.  Returning
 * 0 would make the kernel treat the option as unknown and forward it to init.
 */
static int __init zhaoxin_patch_code_setup(char *str)
{
	int err = kstrtoul(str, 0, &zhaoxin_patch_code);

	if (err || zhaoxin_patch_code > ZHAOXIN_PATCH_CODE_MAX) {
		pr_err("cmdline 'zhaoxin_patch_bitmask=%s' inappropriate\n", str);
		zhaoxin_patch_code = ZHAOXIN_PATCH_CODE_DEFAULT;
		return 1;
	}

	/* Test the bit with '&'; the previous '|' was unconditionally true. */
	if (zhaoxin_patch_code & ZHAOXIN_P2CW_NODE_CHECK)
		pr_info("zhaoxin dma patch node check is enabled\n");

	return 1;
}
__setup("zhaoxin_patch_bitmask=", zhaoxin_patch_code_setup);

/*
 * Find the PCI device that @dev is, or that @dev descends from.
 *
 * Walks from @dev up through ->parent and returns the first device that is a
 * PCI device, or NULL when the top of the hierarchy is reached without one.
 */
static struct pci_dev *kh40000_get_pci_dev(struct device *dev)
{
	struct device *cur = dev;

	do {
		if (dev_is_pci(cur))
			return to_pci_dev(cur);
		cur = cur->parent;
	} while (cur);

	return NULL;
}

/*
 * Flush a device-to-memory DMA transfer before the CPU looks at the data.
 *
 * @dev:      device that owns the mapping.
 * @paddr:    DMA address of the buffer; an IOVA when @is_iommu is true,
 *            otherwise used directly as the physical address.
 * @dir:      direction of the mapping.
 * @is_iommu: true when @paddr must be translated through the device's IOMMU
 *            DMA domain first.
 *
 * The flush is a single PCI configuration read on the PCI device that @dev is
 * or descends from.  It is skipped when:
 *  - the direction does not include device-to-memory traffic,
 *  - the device has no DMA mask or a mask of at most 32 bits,
 *  - no PCI device is found in the parent chain,
 *  - the IOVA does not translate to a physical address, or
 *  - the node check is enabled and the buffer is on the device's own node.
 */
static void kh40000_sync_single_dma_for_cpu(struct device *dev, dma_addr_t paddr,
		enum dma_data_direction dir, bool is_iommu)
{
	struct pci_dev *pci;
	u8 vid;

	/* check direction */
	if (dir != DMA_FROM_DEVICE && dir != DMA_BIDIRECTIONAL)
		return;

	/*
	 * check dma capability; the mask pointer is validated first so that a
	 * device without a DMA mask is skipped instead of dereferencing NULL
	 */
	if (!dev->dma_mask || *dev->dma_mask <= DMA_BIT_MASK(32))
		return;

	/* check device type */
	pci = kh40000_get_pci_dev(dev);
	if (!pci)
		return;

	/* get real physical address */
	if (is_iommu) {
		struct iommu_domain *domain = iommu_get_dma_domain(dev);

		paddr = iommu_iova_to_phys(domain, paddr);
		if (!paddr)
			return;
	}

	/* check node or not */
	if ((zhaoxin_patch_code & ZHAOXIN_P2CW_NODE_CHECK) &&
	    pfn_to_nid(PFN_DOWN(paddr)) == dev_to_node(dev))
		return;

	/* flush data by one pci read cycle; the value read is not used */
	pci_read_config_byte(pci, PCI_VENDOR_ID, &vid);
}

/* Zhaoxin KH-40000 direct DMA ops */

/*
 * Coherent allocation for direct-mapped devices.  When the coherent DMA mask
 * is wider than 32 bits, __GFP_THISNODE is added to the allocation flags
 * before handing over to dma_direct_alloc().
 */
static void *kh40000_dma_direct_alloc(struct device *dev, size_t size,
		dma_addr_t *addr, gfp_t gfp, unsigned long attrs)
{
	const bool wide_mask = dev->coherent_dma_mask > DMA_BIT_MASK(32);
	gfp_t flags = wide_mask ? (gfp | __GFP_THISNODE) : gfp;

	return dma_direct_alloc(dev, size, addr, flags, attrs);
}

/* Run the KH-40000 flush for @addr, then perform the regular direct unmap. */
static void kh40000_dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const bool via_iommu = false;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, via_iommu);
	dma_direct_unmap_page(dev, addr, size, dir, attrs);
}

static void kh40000_dma_direct_sync_sg_for_cpu(struct device *dev,
		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl,  sg, nents, i)
		kh40000_sync_single_dma_for_cpu(dev, sg_dma_address(sg), dir, 0);

	dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
}

/* Run the KH-40000 flush for @addr, then the regular direct sync-for-CPU. */
static void kh40000_dma_direct_sync_single_for_cpu(struct device *dev,
		dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
	const bool via_iommu = false;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, via_iommu);
	dma_direct_sync_single_for_cpu(dev, addr, size, dir);
}

static void kh40000_dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl,  sg, nents, i)
		kh40000_sync_single_dma_for_cpu(dev, sg_dma_address(sg), dir, 0);

	dma_direct_unmap_sg(dev, sgl, nents, dir, attrs);
}

/*
 * Resource unmap for direct-mapped devices: only the KH-40000 flush is run,
 * no further unmap routine is called.  @size and @attrs are unused.
 */
static void kh40000_dma_direct_unmap_resource(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const bool via_iommu = false;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, via_iommu);
}

const struct dma_map_ops kh40000_dma_direct_ops = {
	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
	.alloc			= kh40000_dma_direct_alloc,
	.sync_sg_for_cpu	= kh40000_dma_direct_sync_sg_for_cpu,
	.unmap_page		= kh40000_dma_direct_unmap_page,
	.sync_single_for_cpu	= kh40000_dma_direct_sync_single_for_cpu,
	.unmap_sg		= kh40000_dma_direct_unmap_sg,
	.unmap_resource		= kh40000_dma_direct_unmap_resource,
	.dma_supported		= dma_direct_supported,
	.free			= dma_direct_free,
	.alloc_pages		= dma_direct_alloc_pages,
	.free_pages		= dma_direct_free_pages,
	.sync_single_for_device	= dma_direct_sync_single_for_device,
	.sync_sg_for_device	= dma_direct_sync_sg_for_device,
	.get_required_mask	= dma_direct_get_required_mask,
	.max_mapping_size	= dma_direct_max_mapping_size,
	.mmap			= dma_direct_mmap,
	.get_sgtable		= dma_direct_get_sgtable,
	.map_page		= dma_direct_map_page,
	.map_sg			= dma_direct_map_sg,
	.map_resource		= dma_direct_map_resource,
};

/*
 * Zhaoxin KH-40000 IOMMU DMA ops.
 *
 * Ops table that the kh40000_iommu_* / kh40000_dma_common_* wrappers below
 * forward to.  It is NULL until kh40000_set_iommu_dma_ops() stores a device's
 * existing ops table here.
 */
static const struct dma_map_ops *iommu_dma_ops;

/* Forward to the saved ->alloc with __GFP_THISNODE added to @gfp. */
static void *kh40000_iommu_dma_alloc(struct device *dev, size_t size,
		dma_addr_t *addr, gfp_t gfp, unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	return ops->alloc(dev, size, addr, gfp | __GFP_THISNODE, attrs);
}

/* Plain pass-through to the saved ->free callback. */
static void kh40000_iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
		dma_addr_t handle, unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	ops->free(dev, size, cpu_addr, handle, attrs);
}

/* Plain pass-through to the saved ->alloc_pages callback. */
static struct page *kh40000_dma_common_alloc_pages(struct device *dev, size_t size,
		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	struct page *page = ops->alloc_pages(dev, size, dma_handle, dir, gfp);

	return page;
}

/* Plain pass-through to the saved ->free_pages callback. */
static void kh40000_dma_common_free_pages(struct device *dev, size_t size, struct page *page,
		dma_addr_t dma_handle, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	ops->free_pages(dev, size, page, dma_handle, dir);
}

/* Plain pass-through to the saved ->alloc_noncontiguous callback. */
static struct sg_table *kh40000_iommu_dma_alloc_noncontiguous(struct device *dev,
		size_t size, enum dma_data_direction dir, gfp_t gfp,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	struct sg_table *table = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);

	return table;
}

/*
 * Plain pass-through to the saved ->free_noncontiguous callback.
 *
 * The call is a statement rather than a `return <expr>;`: a return statement
 * with an expression is not allowed in a function returning void.
 */
static void kh40000_iommu_dma_free_noncontiguous(struct device *dev, size_t size,
		struct sg_table *sgt, enum dma_data_direction dir)
{
	iommu_dma_ops->free_noncontiguous(dev, size, sgt, dir);
}

/* Plain pass-through to the saved ->mmap callback. */
static int kh40000_iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
		void *cpu_addr, dma_addr_t dma_addr, size_t size,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	int ret = ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);

	return ret;
}

/*
 * Run the KH-40000 flush for the IOVA @addr (translated through the IOMMU),
 * then call the saved ->unmap_page callback.
 */
static void kh40000_iommu_dma_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, true);
	ops->unmap_page(dev, addr, size, dir, attrs);
}

/* Plain pass-through to the saved ->get_sgtable callback. */
static int kh40000_iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
		void *cpu_addr, dma_addr_t dma_addr, size_t size,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	int ret = ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);

	return ret;
}

/* Plain pass-through to the saved ->map_page callback. */
static dma_addr_t kh40000_iommu_dma_map_page(struct device *dev, struct page *page,
		unsigned long offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	dma_addr_t handle = ops->map_page(dev, page, offset, size, dir, attrs);

	return handle;
}

static int kh40000_iommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	return iommu_dma_ops->map_sg(dev, sgl, nents, dir, attrs);
}

static void kh40000_iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
		int nelems, enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i)
		kh40000_sync_single_dma_for_cpu(dev, sg_dma_address(sg), dir, 1);
	iommu_dma_ops->unmap_sg(dev, sgl, nelems, dir, attrs);
}

/*
 * Run the KH-40000 flush for the IOVA @addr (translated through the IOMMU),
 * then call the saved ->sync_single_for_cpu callback.
 */
static void kh40000_iommu_dma_sync_single_for_cpu(struct device *dev,
		dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, true);
	ops->sync_single_for_cpu(dev, addr, size, dir);
}

/* Plain pass-through to the saved ->sync_single_for_device callback. */
static void kh40000_iommu_dma_sync_single_for_device(struct device *dev,
		dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	ops->sync_single_for_device(dev, addr, size, dir);
}

static void kh40000_iommu_dma_sync_sg_for_cpu(struct device *dev,
		struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i)
		kh40000_sync_single_dma_for_cpu(dev, sg_dma_address(sg), dir, 1);
	iommu_dma_ops->sync_sg_for_cpu(dev, sgl, nelems, dir);
}

static void kh40000_iommu_dma_sync_sg_for_device(struct device *dev,
		struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir)
{
	iommu_dma_ops->sync_sg_for_device(dev, sgl, nelems, dir);
}

/* Plain pass-through to the saved ->map_resource callback. */
static dma_addr_t kh40000_iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;
	dma_addr_t handle = ops->map_resource(dev, phys, size, dir, attrs);

	return handle;
}

/*
 * Run the KH-40000 flush for the IOVA @addr (translated through the IOMMU),
 * then call the saved ->unmap_resource callback.
 */
static void kh40000_iommu_dma_unmap_resource(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = iommu_dma_ops;

	kh40000_sync_single_dma_for_cpu(dev, addr, dir, true);
	ops->unmap_resource(dev, addr, size, dir, attrs);
}

static unsigned long kh40000_iommu_dma_get_merge_boundary(struct device *dev)
{
	return iommu_dma_ops->get_merge_boundary(dev);
}

static size_t kh40000_iommu_dma_opt_mapping_size(void)
{
	return iommu_dma_ops->opt_mapping_size();
}

const struct dma_map_ops kh40000_dma_iommu_ops = {
	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
	.alloc			= kh40000_iommu_dma_alloc,
	.free			= kh40000_iommu_dma_free,
	.unmap_page		= kh40000_iommu_dma_unmap_page,
	.alloc_pages		= kh40000_dma_common_alloc_pages,
	.free_pages		= kh40000_dma_common_free_pages,
	.alloc_noncontiguous	= kh40000_iommu_dma_alloc_noncontiguous,
	.free_noncontiguous	= kh40000_iommu_dma_free_noncontiguous,
	.mmap			= kh40000_iommu_dma_mmap,
	.get_sgtable		= kh40000_iommu_dma_get_sgtable,
	.map_page		= kh40000_iommu_dma_map_page,
	.map_sg			= kh40000_iommu_dma_map_sg,
	.unmap_sg		= kh40000_iommu_dma_unmap_sg,
	.sync_single_for_cpu	= kh40000_iommu_dma_sync_single_for_cpu,
	.sync_single_for_device	= kh40000_iommu_dma_sync_single_for_device,
	.sync_sg_for_cpu	= kh40000_iommu_dma_sync_sg_for_cpu,
	.sync_sg_for_device	= kh40000_iommu_dma_sync_sg_for_device,
	.map_resource		= kh40000_iommu_dma_map_resource,
	.unmap_resource		= kh40000_iommu_dma_unmap_resource,
	.get_merge_boundary	= kh40000_iommu_dma_get_merge_boundary,
	.opt_mapping_size	= kh40000_iommu_dma_opt_mapping_size,
};

/*
 * Replace the DMA ops of @dev with the KH-40000 wrapper table.
 *
 * The ops table currently attached to @dev is remembered in iommu_dma_ops so
 * the wrappers can forward to it.  Nothing is done when @dev has no per-device
 * ops, or when it already carries the wrapper table: saving the wrapper table
 * as the forwarding target would make every wrapper call itself endlessly.
 */
void kh40000_set_iommu_dma_ops(struct device *dev)
{
	if (!dev->dma_ops)
		return;

	/* already patched, keep the previously saved forwarding target */
	if (dev->dma_ops == &kh40000_dma_iommu_ops)
		return;

	iommu_dma_ops = dev->dma_ops;
	set_dma_ops(dev, &kh40000_dma_iommu_ops);
	pr_info_once("zhaoxin iommu dma patch enabled\n");
}
+4 −0
Original line number Diff line number Diff line
@@ -4418,6 +4418,10 @@ static void intel_iommu_probe_finalize(struct device *dev)
{
	/* Drop any per-device ops, then let the IOMMU core set up DMA ops. */
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
#ifdef CONFIG_X86_64
	/*
	 * When the platform flag is set, hand the device to
	 * kh40000_set_iommu_dma_ops() after its DMA ops have been set up.
	 */
	if (is_zhaoxin_kh40000)
		kh40000_set_iommu_dma_ops(dev);
#endif
}

static void intel_iommu_get_resv_regions(struct device *device,
Loading