Commit c5cb0db5 authored by Dave Airlie's avatar Dave Airlie
Browse files

Merge tag 'amd-drm-next-5.12-2021-02-03' of...

Merge tag 'amd-drm-next-5.12-2021-02-03' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-5.12-2021-02-03:

amdgpu:
- Display fixes and cleanups
- Vangogh fixes
- Fix possible race when there are timeouts on two rings
- SR-IOV fixes
- Add missing license
- DCE 10/12 bpc fixes
- Display MALL fixes
- Fix SMU user preference settings persistence
- Fix retry in gem allocate
- Add new PCI DID
- Fix for manual fan speed control on cards where it was problematic
- Fix regression in pinning GTT
- Misc display fixes
- Misc code cleanups

amdkfd:
- Fix config handling
- Fix regression in buffer free

From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210204045717.3823-1-alexander.deucher@amd.com


Signed-off-by: default avatarDave Airlie <airlied@redhat.com>
parents 54c820d0 c915ef89
Loading
Loading
Loading
Loading
+0 −87
Original line number Diff line number Diff line
@@ -47,12 +47,8 @@ int amdgpu_amdkfd_init(void)
	amdgpu_amdkfd_total_mem_size = si.totalram - si.totalhigh;
	amdgpu_amdkfd_total_mem_size *= si.mem_unit;

#ifdef CONFIG_HSA_AMD
	ret = kgd2kfd_init();
	amdgpu_amdkfd_gpuvm_init_mem_limits();
#else
	ret = -ENOENT;
#endif
	kfd_initialized = !ret;

	return ret;
@@ -696,86 +692,3 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)

	return adev->have_atomics_support;
}

#ifndef CONFIG_HSA_AMD
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
{
	return false;
}

void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo)
{
}

int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
{
	return 0;
}

void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
					struct amdgpu_vm *vm)
{
}

struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
{
	return NULL;
}

int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm)
{
	return 0;
}

struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
			      unsigned int asic_type, bool vf)
{
	return NULL;
}

bool kgd2kfd_device_init(struct kfd_dev *kfd,
			 struct drm_device *ddev,
			 const struct kgd2kfd_shared_resources *gpu_resources)
{
	return false;
}

void kgd2kfd_device_exit(struct kfd_dev *kfd)
{
}

void kgd2kfd_exit(void)
{
}

void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
{
}

int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
{
	return 0;
}

int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{
	return 0;
}

int kgd2kfd_post_reset(struct kfd_dev *kfd)
{
	return 0;
}

void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
}

void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
}

void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
}
#endif
+119 −19
Original line number Diff line number Diff line
@@ -94,11 +94,6 @@ enum kgd_engine_type {
	KGD_ENGINE_MAX
};

struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
						       struct mm_struct *mm);
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);

struct amdkfd_process_info {
	/* List head of all VMs that belong to a KFD process */
@@ -132,8 +127,6 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);

int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
				uint32_t vmid, uint64_t gpu_addr,
				uint32_t *ib_cmd, uint32_t ib_len);
@@ -153,6 +146,38 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd);
int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev,
					int queue_bit);

struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
								struct mm_struct *mm);
#if IS_ENABLED(CONFIG_HSA_AMD)
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
#else
static inline
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
{
	return false;
}

static inline
struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
{
	return NULL;
}

static inline
int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
{
	return 0;
}

static inline
int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm)
{
	return 0;
}
#endif
/* Shared API */
int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
				void **mem_obj, uint64_t *gpu_addr,
@@ -215,8 +240,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
					struct file *filp, u32 pasid,
					void **vm, void **process_info,
					struct dma_fence **ef);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
				struct amdgpu_vm *vm);
void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
void amdgpu_amdkfd_gpuvm_release_process_vm(struct kgd_dev *kgd, void *vm);
uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
@@ -236,23 +259,43 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
		struct kgd_mem *mem, void **kptr, uint64_t *size);
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
					    struct dma_fence **ef);

int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
					      struct kfd_vm_fault_info *info);

int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
				      struct dma_buf *dmabuf,
				      uint64_t va, void *vm,
				      struct kgd_mem **mem, uint64_t *size,
				      uint64_t *mmap_offset);

void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo);

int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
				struct tile_config *config);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
				struct amdgpu_vm *vm);
void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo);
#else
static inline
void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
{
}

static inline
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
					struct amdgpu_vm *vm)
{
}

static inline
void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo)
{
}
#endif
/* KGD2KFD callbacks */
int kgd2kfd_quiesce_mm(struct mm_struct *mm);
int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
						struct dma_fence *fence);
#if IS_ENABLED(CONFIG_HSA_AMD)
int kgd2kfd_init(void);
void kgd2kfd_exit(void);
struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
@@ -266,11 +309,68 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_pre_reset(struct kfd_dev *kfd);
int kgd2kfd_post_reset(struct kfd_dev *kfd);
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
int kgd2kfd_quiesce_mm(struct mm_struct *mm);
int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
					       struct dma_fence *fence);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
#else
static inline int kgd2kfd_init(void)
{
	return -ENOENT;
}

static inline void kgd2kfd_exit(void)
{
}

static inline
struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
					unsigned int asic_type, bool vf)
{
	return NULL;
}

static inline
bool kgd2kfd_device_init(struct kfd_dev *kfd, struct drm_device *ddev,
				const struct kgd2kfd_shared_resources *gpu_resources)
{
	return false;
}

static inline void kgd2kfd_device_exit(struct kfd_dev *kfd)
{
}

static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
{
}

static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
{
	return 0;
}

static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{
	return 0;
}

static inline int kgd2kfd_post_reset(struct kfd_dev *kfd)
{
	return 0;
}

static inline
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
}

static inline
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
}

static inline
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
}
#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
+6 −10
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include <linux/sched/task.h>

#include "amdgpu_object.h"
#include "amdgpu_gem.h"
#include "amdgpu_vm.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_dma_buf.h"
@@ -1152,7 +1153,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
	struct sg_table *sg = NULL;
	uint64_t user_addr = 0;
	struct amdgpu_bo *bo;
	struct amdgpu_bo_param bp;
	struct drm_gem_object *gobj;
	u32 domain, alloc_domain;
	u64 alloc_flags;
	int ret;
@@ -1220,19 +1221,14 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
	pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s\n",
			va, size, domain_string(alloc_domain));

	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = 1;
	bp.domain = alloc_domain;
	bp.flags = alloc_flags;
	bp.type = bo_type;
	bp.resv = NULL;
	ret = amdgpu_bo_create(adev, &bp, &bo);
	ret = amdgpu_gem_object_create(adev, size, 1, alloc_domain, alloc_flags,
				       bo_type, NULL, &gobj);
	if (ret) {
		pr_debug("Failed to create BO on domain %s. ret %d\n",
			 domain_string(alloc_domain), ret);
		goto err_bo_create;
	}
	bo = gem_to_amdgpu_bo(gobj);
	if (bo_type == ttm_bo_type_sg) {
		bo->tbo.sg = sg;
		bo->tbo.ttm->sg = sg;
+73 −13
Original line number Diff line number Diff line
@@ -4211,7 +4211,6 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
		case CHIP_NAVI12:
		case CHIP_SIENNA_CICHLID:
		case CHIP_NAVY_FLOUNDER:
		case CHIP_VANGOGH:
			break;
		default:
			goto disabled;
@@ -4461,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
	up_write(&adev->reset_sem);
}

/*
 * to lockup a list of amdgpu devices in a hive safely, if not a hive
 * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
 *
 * unlock won't require roll back.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * if the lockup iteration break in the middle of a hive,
		 * it may means there may has a race issue,
		 * or a hive device locked up independently.
		 * we may be in trouble and may not, so will try to roll back
		 * the lock and give out a warnning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			amdgpu_device_unlock_adev(tmp_adev);
		}
	}
	return -EAGAIN;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;
@@ -4574,11 +4613,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	/*
	 * lock the device before we try to operate the linked list
	 * if didn't get the device lock, don't touch the linked list since
	 * others may iterating it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
					job ? job->base.id : -1);

		/* even we skipped this reset, still need to set the job to guilty */
		if (job)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
@@ -4586,8 +4643,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
			return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
@@ -4598,13 +4653,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
				  job ? job->base.id : -1);
			r = 0;
			goto skip_recovery;
		}

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
@@ -4742,7 +4790,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}
@@ -4792,7 +4840,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4812,7 +4866,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+6 −2
Original line number Diff line number Diff line
@@ -926,8 +926,10 @@ amdgpu_display_user_framebuffer_create(struct drm_device *dev,
				       struct drm_file *file_priv,
				       const struct drm_mode_fb_cmd2 *mode_cmd)
{
	struct drm_gem_object *obj;
	struct amdgpu_framebuffer *amdgpu_fb;
	struct drm_gem_object *obj;
	struct amdgpu_bo *bo;
	uint32_t domains;
	int ret;

	obj = drm_gem_object_lookup(file_priv, mode_cmd->handles[0]);
@@ -938,7 +940,9 @@ amdgpu_display_user_framebuffer_create(struct drm_device *dev,
	}

	/* Handle is imported dma-buf, so cannot be migrated to VRAM for scanout */
	if (obj->import_attach) {
	bo = gem_to_amdgpu_bo(obj);
	domains = amdgpu_display_supported_domains(drm_to_adev(dev), bo->flags);
	if (obj->import_attach && !(domains & AMDGPU_GEM_DOMAIN_GTT)) {
		drm_dbg_kms(dev, "Cannot create framebuffer from imported dma_buf\n");
		return ERR_PTR(-EINVAL);
	}
Loading