Commit 549b1f40 authored by Wang Wensheng's avatar Wang Wensheng
Browse files

mm/sharepool: Implement mg_sp_make_share_u2k()

ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LNGH



---------------------------------------------

This function could map a user memory to kernel vmalloc space.

Signed-off-by: default avatarWang Wensheng <wangwensheng4@huawei.com>
parent e51dd46d
Loading
Loading
Loading
Loading
+298 −1
Original line number Diff line number Diff line
@@ -669,6 +669,13 @@ static void spa_dec_usage(struct sp_area *spa)
		atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size);
	}
}

static inline void check_interrupt_context(void)
{
	if (unlikely(in_interrupt()))
		panic("function can't be used in interrupt context\n");
}

/**
 * mp_sp_group_id_by_pid() - Get the sp_group ID array of a process.
 * @tgid: tgid of target process.
@@ -1115,6 +1122,247 @@ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size,
}
EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);

static int sp_pmd_entry(pmd_t *pmd, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	struct page *page;
	struct sp_walk_data *sp_walk_data = walk->private;

	/*
	 * There exist a scene in DVPP where the pagetable is huge page but its
	 * vma doesn't record it, something like THP.
	 * So we cannot make out whether it is a hugepage map until we access the
	 * pmd here. If mixed size of pages appear, just return an error.
	 */
	if (pmd_huge(*pmd)) {
		if (!sp_walk_data->is_page_type_set) {
			sp_walk_data->is_page_type_set = true;
			sp_walk_data->is_hugepage = true;
		} else if (!sp_walk_data->is_hugepage) {
			return -EFAULT;
		}

		/* To skip pte level walk */
		walk->action = ACTION_CONTINUE;

		page = pmd_page(*pmd);
		get_page(page);
		sp_walk_data->pages[sp_walk_data->page_count++] = page;

		return 0;
	}

	if (!sp_walk_data->is_page_type_set) {
		sp_walk_data->is_page_type_set = true;
		sp_walk_data->is_hugepage = false;
	} else if (sp_walk_data->is_hugepage)
		return -EFAULT;

	sp_walk_data->pmd = pmd;

	return 0;
}

static int sp_pte_entry(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	struct page *page;
	struct sp_walk_data *sp_walk_data = walk->private;
	pmd_t *pmd = sp_walk_data->pmd;

retry:
	if (unlikely(!pte_present(*pte))) {
		swp_entry_t entry;
		spinlock_t *ptl = pte_lockptr(walk->mm, pmd);

		if (pte_none(*pte))
			goto no_page;
		entry = pte_to_swp_entry(*pte);
		if (!is_migration_entry(entry))
			goto no_page;

		pte_unmap_unlock(pte, ptl);
		migration_entry_wait(walk->mm, pmd, addr);
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		goto retry;
	}

	page = pte_page(*pte);
	get_page(page);
	sp_walk_data->pages[sp_walk_data->page_count++] = page;
	return 0;

no_page:
	pr_debug("the page of addr %lx unexpectedly not in RAM\n",
		 (unsigned long)addr);
	return -EFAULT;
}

static int sp_test_walk(unsigned long addr, unsigned long next,
			struct mm_walk *walk)
{
	/*
	 * FIXME: The devmm driver uses remap_pfn_range() but actually there
	 * are associated struct pages, so they should use vm_map_pages() or
	 * similar APIs. Before the driver has been converted to correct APIs
	 * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as
	 * normal VMAs.
	 */
	return 0;
}

static int sp_pte_hole(unsigned long start, unsigned long end,
		       int depth, struct mm_walk *walk)
{
	pr_debug("hole [%lx, %lx) appeared unexpectedly\n",
			(unsigned long)start, (unsigned long)end);
	return -EFAULT;
}

static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask,
			    unsigned long addr, unsigned long next,
			    struct mm_walk *walk)
{
	pte_t pte = huge_ptep_get(ptep);
	struct page *page = pte_page(pte);
	struct sp_walk_data *sp_walk_data;

	if (unlikely(!pte_present(pte))) {
		pr_debug("the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr);
		return -EFAULT;
	}

	sp_walk_data = walk->private;
	get_page(page);
	sp_walk_data->pages[sp_walk_data->page_count++] = page;
	return 0;
}

/*
 * __sp_walk_page_range() - Walk page table with caller specific callbacks.
 * @uva: the start VA of user memory.
 * @size: the size of user memory.
 * @mm: mm struct of the target task.
 * @sp_walk_data: a structure of a page pointer array.
 *
 * the caller must hold mm->mmap_lock
 *
 * Notes for parameter alignment:
 * When size == 0, let it be page_size, so that at least one page is walked.
 *
 * When size > 0, for convenience, usually the parameters of uva and
 * size are not page aligned. There are four different alignment scenarios and
 * we must handler all of them correctly.
 *
 * The basic idea is to align down uva and align up size so all the pages
 * in range [uva, uva + size) are walked. However, there are special cases.
 *
 * Considering a 2M-hugepage addr scenario. Assuming the caller wants to
 * traverse range [1001M, 1004.5M), so uva and size is 1001M and 3.5M
 * accordingly. The aligned-down uva is 1000M and the aligned-up size is 4M.
 * The traverse range will be [1000M, 1004M). Obviously, the final page for
 * [1004M, 1004.5M) is not covered.
 *
 * To fix this problem, we need to walk an additional page, size should be
 * ALIGN(uva+size) - uva_aligned
 */
static int __sp_walk_page_range(unsigned long uva, unsigned long size,
	struct mm_struct *mm, struct sp_walk_data *sp_walk_data)
{
	int ret = 0;
	struct vm_area_struct *vma;
	unsigned long page_nr;
	struct page **pages = NULL;
	bool is_hugepage = false;
	unsigned long uva_aligned;
	unsigned long size_aligned;
	unsigned int page_size = PAGE_SIZE;
	struct mm_walk_ops sp_walk = {};

	/*
	 * Here we also support non share pool memory in this interface
	 * because the caller can't distinguish whether a uva is from the
	 * share pool or not. It is not the best idea to do so, but currently
	 * it simplifies overall design.
	 *
	 * In this situation, the correctness of the parameters is mainly
	 * guaranteed by the caller.
	 */
	vma = find_vma(mm, uva);
	if (!vma) {
		pr_debug("u2k input uva %lx is invalid\n", (unsigned long)uva);
		return -EINVAL;
	}
	if (is_vm_hugetlb_page(vma))
		is_hugepage = true;

	sp_walk.pte_hole = sp_pte_hole;
	sp_walk.test_walk = sp_test_walk;
	if (is_hugepage) {
		sp_walk_data->is_hugepage = true;
		sp_walk.hugetlb_entry = sp_hugetlb_entry;
		page_size = PMD_SIZE;
	} else {
		sp_walk_data->is_hugepage = false;
		sp_walk.pte_entry = sp_pte_entry;
		sp_walk.pmd_entry = sp_pmd_entry;
	}

	sp_walk_data->is_page_type_set = false;
	sp_walk_data->page_count = 0;
	sp_walk_data->page_size = page_size;
	uva_aligned = ALIGN_DOWN(uva, page_size);
	sp_walk_data->uva_aligned = uva_aligned;
	if (size == 0)
		size_aligned = page_size;
	else
		/* special alignment handling */
		size_aligned = ALIGN(uva + size, page_size) - uva_aligned;

	if (uva_aligned + size_aligned < uva_aligned) {
		pr_err_ratelimited("overflow happened in walk page range\n");
		return -EINVAL;
	}

	page_nr = size_aligned / page_size;
	pages = kvmalloc_array(page_nr, sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		pr_err_ratelimited("alloc page array failed in walk page range\n");
		return -ENOMEM;
	}
	sp_walk_data->pages = pages;

	ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned,
			      &sp_walk, sp_walk_data);
	if (ret) {
		while (sp_walk_data->page_count--)
			put_page(pages[sp_walk_data->page_count]);
		kvfree(pages);
		sp_walk_data->pages = NULL;
	}

	if (sp_walk_data->is_hugepage)
		sp_walk_data->uva_aligned = ALIGN_DOWN(uva, PMD_SIZE);

	return ret;
}

static void __sp_walk_page_free(struct sp_walk_data *data)
{
	int i = 0;
	struct page *page;

	while (i < data->page_count) {
		page = data->pages[i++];
		put_page(page);
	}

	kvfree(data->pages);
	/* prevent repeated release */
	data->page_count = 0;
	data->pages = NULL;
}

/**
 * mg_sp_make_share_u2k() - Share user memory of a specified process to kernel.
 * @uva: the VA of shared user memory
@@ -1127,7 +1375,56 @@ EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);
 */
void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid)
{
	int ret = 0;
	struct mm_struct *mm = current->mm;
	void *p = ERR_PTR(-ESRCH);
	struct sp_walk_data sp_walk_data;
	struct vm_struct *area;

	if (!sp_is_enabled())
		return ERR_PTR(-EOPNOTSUPP);

	check_interrupt_context();

	if (mm == NULL) {
		pr_err("u2k: kthread is not allowed\n");
		return ERR_PTR(-EPERM);
	}

	mmap_write_lock(mm);
	ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data);
	if (ret) {
		pr_err_ratelimited("walk page range failed %d\n", ret);
		mmap_write_unlock(mm);
		return ERR_PTR(ret);
	}

	if (sp_walk_data.is_hugepage)
		p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count,
				  VM_MAP, PAGE_KERNEL);
	else
		p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP,
			 PAGE_KERNEL);
	mmap_write_unlock(mm);

	if (!p) {
		pr_err("vmap(huge) in u2k failed\n");
		__sp_walk_page_free(&sp_walk_data);
		return ERR_PTR(-ENOMEM);
	}

	p = p + (uva - sp_walk_data.uva_aligned);

	/*
	 * kva p may be used later in k2u. Since p comes from uva originally,
	 * it's reasonable to add flag VM_USERMAP so that p can be remapped
	 * into userspace again.
	 */
	area = find_vm_area(p);
	area->flags |= VM_USERMAP;

	kvfree(sp_walk_data.pages);
	return p;
}
EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k);