Commit 08a2fd23 authored by Ramesh Errabolu, committed by Alex Deucher

drm/amdgpu: Add peer-to-peer support among PCIe connected AMD GPUs



Add support for peer-to-peer communication among AMD GPUs over the PCIe
bus. This support requires the HSA_AMD_P2P config option to be enabled.

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6fbfc3a2
drivers/gpu/drm/amd/amdgpu/amdgpu.h +3 −0
@@ -223,6 +223,9 @@ static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
static const bool __maybe_unused debug_evictions; /* = false */
static const bool __maybe_unused no_system_mem_limit;
#endif
#ifdef CONFIG_HSA_AMD_P2P
extern bool pcie_p2p;
#endif

extern int amdgpu_tmz;
extern int amdgpu_reset_method;
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +1 −0
@@ -48,6 +48,7 @@ enum kfd_mem_attachment_type {
	KFD_MEM_ATT_SHARED,	/* Share kgd_mem->bo or another attachment's */
	KFD_MEM_ATT_USERPTR,	/* SG bo to DMA map pages from a userptr bo */
	KFD_MEM_ATT_DMABUF,	/* DMAbuf to DMA map TTM BOs */
	KFD_MEM_ATT_SG		/* Tag to DMA map SG BOs */
};

struct kfd_mem_attachment {
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +238 −62
@@ -241,6 +241,42 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
	kfree(bo->kfd_bo);
}

/**
 * create_dmamap_sg_bo() - Creates an amdgpu_bo object to reflect information
 * about a USERPTR, DOORBELL or MMIO BO.
 * @adev: Device for which the dmamap BO is being created
 * @mem: BO of the peer device that is being DMA mapped. Provides parameters
 *	 used in building the dmamap BO
 * @bo_out: Output parameter updated with a handle of the dmamap BO
 */
static int
create_dmamap_sg_bo(struct amdgpu_device *adev,
		 struct kgd_mem *mem, struct amdgpu_bo **bo_out)
{
	struct drm_gem_object *gem_obj;
	int ret, align;

	ret = amdgpu_bo_reserve(mem->bo, false);
	if (ret)
		return ret;

	align = 1;
	ret = amdgpu_gem_object_create(adev, mem->bo->tbo.base.size, align,
			AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_CREATE_PREEMPTIBLE,
			ttm_bo_type_sg, mem->bo->tbo.base.resv, &gem_obj);

	amdgpu_bo_unreserve(mem->bo);

	if (ret) {
		pr_err("Error in creating DMA mappable SG BO on domain: %d\n", ret);
		return -EINVAL;
	}

	*bo_out = gem_to_amdgpu_bo(gem_obj);
	(*bo_out)->parent = amdgpu_bo_ref(mem->bo);
	return ret;
}
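
Note that the helper creates the dmamap BO with the parent BO's reservation
object (mem->bo->tbo.base.resv) and takes a reference on the parent via
amdgpu_bo_ref(), so both BOs are locked and evicted together and the parent
cannot be released while a dmamap BO still points at it.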

/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
 *  reservation object.
 *
@@ -446,6 +482,38 @@ static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
	return pte_flags;
}

/**
 * create_sg_table() - Create an sg_table for a contiguous DMA addr range
 * @addr: The starting address to point to
 * @size: Size of memory area in bytes being pointed to
 *
 * Allocates an instance of sg_table and initializes it to point to the memory
 * area specified by the input parameters. The address is assumed to be already
 * DMA mapped, where needed.
 *
 * DOORBELL or MMIO BOs use only one scatterlist node in their sg_table
 * because they are physically contiguous.
 *
 * Return: Initialized instance of SG Table or NULL
 */
static struct sg_table *create_sg_table(uint64_t addr, uint32_t size)
{
	struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);

	if (!sg)
		return NULL;
	if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
		kfree(sg);
		return NULL;
	}
	sg_dma_address(sg->sgl) = addr;
	sg->sgl->length = size;
#ifdef CONFIG_NEED_SG_DMA_LENGTH
	sg->sgl->dma_length = size;
#endif
	return sg;
}
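
For illustration, a minimal usage sketch of the helper above; this is not part
of the patch, and db_dma_addr stands for a hypothetical, already DMA-mapped
doorbell address:

	/* Hypothetical sketch: wrap one physically contiguous, DMA-mapped
	 * doorbell page in a single-entry sg_table, then release it.
	 */
	struct sg_table *sg = create_sg_table(db_dma_addr, PAGE_SIZE);

	if (!sg)
		return -ENOMEM;
	/* ... publish the table, e.g. through ttm->sg, while it is in use ... */
	sg_free_table(sg);
	kfree(sg);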

static int
kfd_mem_dmamap_userptr(struct kgd_mem *mem,
		       struct kfd_mem_attachment *attachment)
@@ -510,6 +578,87 @@ kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment *attachment)
	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

/**
 * kfd_mem_dmamap_sg_bo() - Create DMA mapped sg_table to access DOORBELL or MMIO BO
 * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
 * @attachment: Virtual address attachment of the BO on accessing device
 *
 * An access request from the device that owns the DOORBELL does not require
 * DMA mapping, because the request does not go through the PCIe root complex;
 * it loops back within the device instead. The need to DMA map arises only
 * when accessing a peer device's DOORBELL.
 *
 * In contrast, all access requests for MMIO need to be DMA mapped regardless
 * of device ownership, because access requests for MMIO always go through the
 * PCIe root complex.
 *
 * This is accomplished in two steps:
 *   - Obtain a DMA mapped address of the DOORBELL or MMIO memory that can be
 *         used to update the requesting device's page table
 *   - Signal TTM to mark the memory pointed to by the requesting device's BO
 *         as GPU accessible. This allows updating the requesting device's
 *         page table with entries associated with the DOORBELL or MMIO memory
 *
 * This method is invoked in the following contexts:
 *   - Mapping of a DOORBELL or MMIO BO of the same or a peer device
 *   - Validating an evicted DOORBELL or MMIO BO on a device seeking access
 *
 * Return: 0 if successful, non-zero otherwise
 */
static int
kfd_mem_dmamap_sg_bo(struct kgd_mem *mem,
		     struct kfd_mem_attachment *attachment)
{
	struct ttm_operation_ctx ctx = {.interruptible = true};
	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
	struct amdgpu_device *adev = attachment->adev;
	struct ttm_tt *ttm = bo->tbo.ttm;
	enum dma_data_direction dir;
	dma_addr_t dma_addr;
	bool mmio;
	int ret;

	/* Expect the SG table of the dmamap BO to be NULL */
	mmio = (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP);
	if (unlikely(ttm->sg)) {
		pr_err("SG Table of %d BO for peer device is UNEXPECTEDLY NON-NULL", mmio);
		return -EINVAL;
	}

	dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
			DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
	dma_addr = mem->bo->tbo.sg->sgl->dma_address;
	pr_debug("%d BO size: %d\n", mmio, mem->bo->tbo.sg->sgl->length);
	pr_debug("%d BO address before DMA mapping: %llx\n", mmio, dma_addr);
	dma_addr = dma_map_resource(adev->dev, dma_addr,
			mem->bo->tbo.sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);
	ret = dma_mapping_error(adev->dev, dma_addr);
	if (unlikely(ret))
		return ret;
	pr_debug("%d BO address after DMA mapping: %llx\n", mmio, dma_addr);

	ttm->sg = create_sg_table(dma_addr, mem->bo->tbo.sg->sgl->length);
	if (unlikely(!ttm->sg)) {
		ret = -ENOMEM;
		goto unmap_sg;
	}

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
	if (unlikely(ret))
		goto free_sg;

	return ret;

free_sg:
	sg_free_table(ttm->sg);
	kfree(ttm->sg);
	ttm->sg = NULL;
unmap_sg:
	dma_unmap_resource(adev->dev, dma_addr, mem->bo->tbo.sg->sgl->length,
			   dir, DMA_ATTR_SKIP_CPU_SYNC);
	return ret;
}

static int
kfd_mem_dmamap_attachment(struct kgd_mem *mem,
			  struct kfd_mem_attachment *attachment)
@@ -521,6 +670,8 @@ kfd_mem_dmamap_attachment(struct kgd_mem *mem,
		return kfd_mem_dmamap_userptr(mem, attachment);
	case KFD_MEM_ATT_DMABUF:
		return kfd_mem_dmamap_dmabuf(attachment);
	case KFD_MEM_ATT_SG:
		return kfd_mem_dmamap_sg_bo(mem, attachment);
	default:
		WARN_ON_ONCE(1);
	}
@@ -561,6 +712,50 @@ kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)
	ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

/**
 * kfd_mem_dmaunmap_sg_bo() - Free DMA mapped sg_table of DOORBELL or MMIO BO
 * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
 * @attachment: Virtual address attachment of the BO on accessing device
 *
 * The method performs the following steps:
 *   - Signal TTM to mark the memory pointed to by the BO as GPU inaccessible
 *   - Free the SG table that encapsulates the DMA mapped memory of the
 *          peer device's DOORBELL or MMIO resource
 *
 * This method is invoked in the following contexts:
 *     Unmapping of a DOORBELL or MMIO BO on a device having access to its memory
 *     Eviction of a DOORBELL or MMIO BO on a device having access to its memory
 *
 * Return: void
 */
static void
kfd_mem_dmaunmap_sg_bo(struct kgd_mem *mem,
		       struct kfd_mem_attachment *attachment)
{
	struct ttm_operation_ctx ctx = {.interruptible = true};
	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
	struct amdgpu_device *adev = attachment->adev;
	struct ttm_tt *ttm = bo->tbo.ttm;
	enum dma_data_direction dir;

	if (unlikely(!ttm->sg)) {
		pr_err("SG Table of BO is UNEXPECTEDLY NULL");
		return;
	}

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
	ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);

	dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
				DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
	dma_unmap_resource(adev->dev, ttm->sg->sgl->dma_address,
			ttm->sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);
	sg_free_table(ttm->sg);
	kfree(ttm->sg);
	ttm->sg = NULL;
	bo->tbo.sg = NULL;
}

static void
kfd_mem_dmaunmap_attachment(struct kgd_mem *mem,
			    struct kfd_mem_attachment *attachment)
@@ -574,38 +769,14 @@ kfd_mem_dmaunmap_attachment(struct kgd_mem *mem,
	case KFD_MEM_ATT_DMABUF:
		kfd_mem_dmaunmap_dmabuf(attachment);
		break;
	case KFD_MEM_ATT_SG:
		kfd_mem_dmaunmap_sg_bo(mem, attachment);
		break;
	default:
		WARN_ON_ONCE(1);
	}
}

-static int
-kfd_mem_attach_userptr(struct amdgpu_device *adev, struct kgd_mem *mem,
-		       struct amdgpu_bo **bo)
-{
-	unsigned long bo_size = mem->bo->tbo.base.size;
-	struct drm_gem_object *gobj;
-	int ret;
-
-	ret = amdgpu_bo_reserve(mem->bo, false);
-	if (ret)
-		return ret;
-
-	ret = amdgpu_gem_object_create(adev, bo_size, 1,
-				       AMDGPU_GEM_DOMAIN_CPU,
-				       AMDGPU_GEM_CREATE_PREEMPTIBLE,
-				       ttm_bo_type_sg, mem->bo->tbo.base.resv,
-				       &gobj);
-	amdgpu_bo_unreserve(mem->bo);
-	if (ret)
-		return ret;
-
-	*bo = gem_to_amdgpu_bo(gobj);
-	(*bo)->parent = amdgpu_bo_ref(mem->bo);
-
-	return 0;
-}

static int
kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem,
		      struct amdgpu_bo **bo)
@@ -656,6 +827,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
	uint64_t va = mem->va;
	struct kfd_mem_attachment *attachment[2] = {NULL, NULL};
	struct amdgpu_bo *bo[2] = {NULL, NULL};
	bool same_hive = false;
	int i, ret;

	if (!va) {
@@ -663,6 +835,24 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
		return -EINVAL;
	}

	/* Determine access to VRAM, MMIO and DOORBELL BOs of peer devices
	 *
	 * The access path of MMIO and DOORBELL BOs is always over PCIe.
	 * In contrast, the access path of VRAM BOs depends upon the type of
	 * link that connects the peer device. Access over PCIe is allowed
	 * only if the peer device has a large BAR, whereas access over xGMI
	 * is allowed for both small and large BAR configurations of the
	 * peer device.
	 */
	if ((adev != bo_adev) &&
	    ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
		if (mem->domain == AMDGPU_GEM_DOMAIN_VRAM)
			same_hive = amdgpu_xgmi_same_hive(adev, bo_adev);
		if (!same_hive && !amdgpu_device_is_peer_accessible(bo_adev, adev))
			return -EINVAL;
	}
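
To summarize the gate above, an illustrative decision table (not from the
patch):

	/*
	 * BO type         link to owning device    outcome
	 * -------------   ----------------------   ------------------------
	 * VRAM            same xGMI hive           allowed (same_hive)
	 * VRAM            PCIe, peer accessible    allowed (DMABUF path)
	 * VRAM            PCIe, small BAR/no P2P   -EINVAL
	 * DOORBELL/MMIO   PCIe, peer accessible    allowed (SG BO path)
	 * DOORBELL/MMIO   PCIe, not accessible     -EINVAL
	 */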

	for (i = 0; i <= is_aql; i++) {
		attachment[i] = kzalloc(sizeof(*attachment[i]), GFP_KERNEL);
		if (unlikely(!attachment[i])) {
@@ -673,9 +863,9 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
		pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
			 va + bo_size, vm);

-		if (adev == bo_adev ||
		if ((adev == bo_adev && !(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) ||
		    (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && adev->ram_is_direct_mapped) ||
-		   (mem->domain == AMDGPU_GEM_DOMAIN_VRAM && amdgpu_xgmi_same_hive(adev, bo_adev))) {
		    same_hive) {
			/* Mappings on the local GPU, or VRAM mappings in the
			 * local hive, or userptr mapping IOMMU direct map mode
			 * share the original BO
@@ -691,26 +881,30 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
		} else if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
			/* Create an SG BO to DMA-map userptrs on other GPUs */
			attachment[i]->type = KFD_MEM_ATT_USERPTR;
-			ret = kfd_mem_attach_userptr(adev, mem, &bo[i]);
			ret = create_dmamap_sg_bo(adev, mem, &bo[i]);
			if (ret)
				goto unwind;
-		} else if (mem->domain == AMDGPU_GEM_DOMAIN_GTT &&
-			   mem->bo->tbo.type != ttm_bo_type_sg) {
-			/* GTT BOs use DMA-mapping ability of dynamic-attach
-			 * DMA bufs. TODO: The same should work for VRAM on
-			 * large-BAR GPUs.
-			 */
		/* Handle DOORBELL BOs of peer devices and MMIO BOs of local and peer devices */
		} else if (mem->bo->tbo.type == ttm_bo_type_sg) {
			WARN_ONCE(!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL ||
				    mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP),
				  "Handing invalid SG BO in ATTACH request");
			attachment[i]->type = KFD_MEM_ATT_SG;
			ret = create_dmamap_sg_bo(adev, mem, &bo[i]);
			if (ret)
				goto unwind;
		/* Enable access to GTT and VRAM BOs of peer devices */
		} else if (mem->domain == AMDGPU_GEM_DOMAIN_GTT ||
			   mem->domain == AMDGPU_GEM_DOMAIN_VRAM) {
			attachment[i]->type = KFD_MEM_ATT_DMABUF;
			ret = kfd_mem_attach_dmabuf(adev, mem, &bo[i]);
			if (ret)
				goto unwind;
			pr_debug("Employ DMABUF mechanim to enable peer GPU access\n");
		} else {
-			/* FIXME: Need to DMA-map other BO types:
-			 * large-BAR VRAM, doorbells, MMIO remap
-			 */
-			attachment[i]->type = KFD_MEM_ATT_SHARED;
-			bo[i] = mem->bo;
-			drm_gem_object_get(&bo[i]->tbo.base);
			WARN_ONCE(true, "Handling invalid ATTACH request");
			ret = -EINVAL;
			goto unwind;
		}

		/* Add BO to VM internal data structures */
@@ -1111,24 +1305,6 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
	return ret;
}

-static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size)
-{
-	struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
-
-	if (!sg)
-		return NULL;
-	if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
-		kfree(sg);
-		return NULL;
-	}
-	sg->sgl->dma_address = addr;
-	sg->sgl->length = size;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-	sg->sgl->dma_length = size;
-#endif
-	return sg;
-}

static int process_validate_vms(struct amdkfd_process_info *process_info)
{
	struct amdgpu_vm *peer_vm;
@@ -1497,7 +1673,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
			bo_type = ttm_bo_type_sg;
			if (size > UINT_MAX)
				return -EINVAL;
-			sg = create_doorbell_sg(*offset, size);
			sg = create_sg_table(*offset, size);
			if (!sg)
				return -ENOMEM;
		} else {
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +31 −0
@@ -34,6 +34,7 @@
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
@@ -5499,6 +5500,36 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev,
					&peer_adev->dev, 1, true) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
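
As a worked example of the mask arithmetic above, with purely hypothetical
values: a peer with a 40-bit DMA mask can reach the aperture only if neither
its base nor its limit has address bits set above bit 39.

	/* Hypothetical values, mirroring the check above */
	uint64_t dma_mask     = (1ULL << 40) - 1;     /* peer addresses 40 bits */
	uint64_t address_mask = ~dma_mask;            /* bits the peer cannot drive */
	resource_size_t aper_base  = 0x3800000000ULL; /* 224 GiB, fits in 40 bits */
	resource_size_t aper_limit = aper_base + (16ULL << 30) - 1;

	/* true here: neither end of the aperture exceeds the DMA mask;
	 * a BAR placed at or above 1 TiB (bit 40) would fail the test
	 */
	bool reachable = !(aper_base & address_mask) &&
			 !(aper_limit & address_mask);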

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +10 −0
@@ -802,6 +802,16 @@ MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault (
module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444);
#endif

/**
 * DOC: pcie_p2p (bool)
 * Enable PCIe P2P (requires large-BAR). Default value: true (on)
 */
#ifdef CONFIG_HSA_AMD_P2P
bool pcie_p2p = true;
module_param(pcie_p2p, bool, 0444);
MODULE_PARM_DESC(pcie_p2p, "Enable PCIe P2P (requires large-BAR). (N = off, Y = on(default))");
#endif
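
With CONFIG_HSA_AMD_P2P enabled, the feature can still be disabled at load
time, e.g. with amdgpu.pcie_p2p=0 on the kernel command line; the 0444
permissions expose the current value read-only under
/sys/module/amdgpu/parameters/pcie_p2p at runtime.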

/**
 * DOC: dcfeaturemask (uint)
 * Override display features enabled. See enum DC_FEATURE_MASK in drivers/gpu/drm/amd/include/amd_shared.h.