Commit ff891a2e authored by Philip Yang's avatar Philip Yang Committed by Alex Deucher
Browse files

drm/amdkfd: check access permisson to restore retry fault



Check range access permission to restore GPU retry fault, if GPU retry
fault on address which belongs to VMA, and VMA has no read or write
permission requested by GPU, failed to restore the address. The vm fault
event will pass back to user space.

Signed-off-by: default avatarPhilip Yang <Philip.Yang@amd.com>
Reviewed-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent f24d991b
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -3345,12 +3345,13 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
 * @adev: amdgpu device pointer
 * @pasid: PASID of the VM
 * @addr: Address of the fault
 * @write_fault: true is write fault, false is read fault
 *
 * Try to gracefully handle a VM fault. Return true if the fault was handled and
 * shouldn't be reported any more.
 */
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    uint64_t addr)
			    uint64_t addr, bool write_fault)
{
	bool is_compute_context = false;
	struct amdgpu_bo *root;
@@ -3375,7 +3376,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
	addr /= AMDGPU_GPU_PAGE_SIZE;

	if (is_compute_context &&
	    !svm_range_restore_pages(adev, pasid, addr)) {
	    !svm_range_restore_pages(adev, pasid, addr, write_fault)) {
		amdgpu_bo_unref(&root);
		return true;
	}
+1 −1
Original line number Diff line number Diff line
@@ -448,7 +448,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
			     struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    uint64_t addr);
			    uint64_t addr, bool write_fault);

void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

+2 −1
Original line number Diff line number Diff line
@@ -93,6 +93,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
				       struct amdgpu_iv_entry *entry)
{
	bool retry_fault = !!(entry->src_data[1] & 0x80);
	bool write_fault = !!(entry->src_data[1] & 0x20);
	struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
	struct amdgpu_task_info task_info;
	uint32_t status = 0;
@@ -121,7 +122,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
		/* Try to handle the recoverable page faults by filling page
		 * tables
		 */
		if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
		if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
			return 1;
	}

+2 −1
Original line number Diff line number Diff line
@@ -507,6 +507,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
				      struct amdgpu_iv_entry *entry)
{
	bool retry_fault = !!(entry->src_data[1] & 0x80);
	bool write_fault = !!(entry->src_data[1] & 0x20);
	uint32_t status = 0, cid = 0, rw = 0;
	struct amdgpu_task_info task_info;
	struct amdgpu_vmhub *hub;
@@ -537,7 +538,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
		/* Try to handle the recoverable page faults by filling page
		 * tables
		 */
		if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
		if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
			return 1;
	}

+28 −1
Original line number Diff line number Diff line
@@ -2400,9 +2400,29 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
		WRITE_ONCE(pdd->faults, pdd->faults + 1);
}

static bool
svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
{
	unsigned long requested = VM_READ;
	struct vm_area_struct *vma;

	if (write_fault)
		requested |= VM_WRITE;

	vma = find_vma(mm, addr << PAGE_SHIFT);
	if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
		pr_debug("address 0x%llx VMA is removed\n", addr);
		return true;
	}

	pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
		vma->vm_flags);
	return (vma->vm_flags & requested) == requested;
}

int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
			uint64_t addr)
			uint64_t addr, bool write_fault)
{
	struct mm_struct *mm = NULL;
	struct svm_range_list *svms;
@@ -2484,6 +2504,13 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
		goto out_unlock_range;
	}

	if (!svm_fault_allowed(mm, addr, write_fault)) {
		pr_debug("fault addr 0x%llx no %s permission\n", addr,
			write_fault ? "write" : "read");
		r = -EPERM;
		goto out_unlock_range;
	}

	best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
	if (best_loc == -1) {
		pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
Loading