Commit 5d3b64fd authored by Yuchen Tang
Browse files

etmem: add etmem scan feature

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8T1MB?from=project-issue


CVE: NA

-------------------------------------------------

This patch implements the etmem feature.

The etmem scan module communicates with the user-space program through a
registered proc file system entry. It periodically scans the VMA segments
of the target process by walking its page table and checking the access
bit of each page, and then reports the scan results to user space,
so that we can better classify the hotness of pages and further migrate
hot ones to the fast memory tier and cold ones to the slow memory tier.

Signed-off-by: yanxiaodan <yanxiaodan@huawei.com>
Signed-off-by: Feilong Lin <linfeilong@huawei.com>
Signed-off-by: geruijun <geruijun@huawei.com>
Signed-off-by: liubo <liubo254@huawei.com>
Signed-off-by: Yuchen Tang <tangyuchen5@huawei.com>
parent a42298d4
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -35,3 +35,5 @@ proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
proc-$(CONFIG_BOOT_CONFIG)	+= bootconfig.o
proc-$(CONFIG_MEMORY_RELIABLE)	+= mem_reliable.o
obj-$(CONFIG_ETMEM_SCAN)	+= etmem_scan.o
proc-$(CONFIG_ETMEM)		+= etmem_proc.o
+6 −0
Original line number Diff line number Diff line
@@ -3366,6 +3366,9 @@ static const struct pid_entry tgid_base_stuff[] = {
	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_ETMEM
	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
@@ -3718,6 +3721,9 @@ static const struct pid_entry tid_base_stuff[] = {
	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_ETMEM
	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif

fs/proc/etmem_proc.c

0 → 100644
+122 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/etmem.h>

#include "internal.h"

/*
 * Taken by mm_idle_open() while it samples proc_page_scan_operations.owner
 * and tries to pin that module.
 */
static DEFINE_SPINLOCK(scan_lock);

/*
 * Acquire or release scan_lock.
 *
 * The signature matches a file_operations .flock handler because the function
 * is installed as proc_page_scan_operations.flock; mm_idle_open() also calls
 * it directly with NULL for @file and @flock, neither of which is used.
 *
 * @is_lock: non-zero takes the lock, zero releases it.
 *
 * Always returns 0.
 */
static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock)
{
	if (!is_lock) {
		spin_unlock(&scan_lock);
		return 0;
	}

	spin_lock(&scan_lock);
	return 0;
}

/*
 * Hook table for the page-scan backend; will be filled when the kvm_ept_idle
 * module loads.
 *
 * Only .flock is set here.  The mm_idle_* handlers in this file forward to
 * .read, .open, .release and .unlocked_ioctl when those are non-NULL, and use
 * .owner to pin the providing module for the lifetime of an open file.
 * The object is non-const and exported (GPL-only), which is what lets code
 * outside this file populate the remaining members.
 */
struct file_operations proc_page_scan_operations = {
	.flock = page_scan_lock,
};
EXPORT_SYMBOL_GPL(proc_page_scan_operations);

/*
 * read() handler for the idle_pages proc file.
 *
 * Pins the address space stored in file->private_data for the duration of the
 * call and forwards the request to proc_page_scan_operations.read, if set.
 *
 * Returns -ESRCH when no mm is attached to the file or its user count has
 * already dropped to zero, 0 when no read hook is installed, otherwise
 * whatever the hook returns.
 */
static ssize_t mm_idle_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	/* ssize_t, not int: must carry the hook's return value unmodified */
	ssize_t ret = 0;

	if (!mm || !mmget_not_zero(mm))
		return -ESRCH;

	if (proc_page_scan_operations.read)
		ret = proc_page_scan_operations.read(file, buf, count, ppos);

	mmput(mm);
	return ret;
}

/*
 * open() handler for the idle_pages proc file.
 *
 * Steps, in order:
 *   1. require CAP_SYS_ADMIN in the initial user namespace;
 *   2. under scan_lock, pin the module recorded in
 *      proc_page_scan_operations.owner;
 *   3. look up the target mm with proc_mem_open() and stash it in
 *      file->private_data;
 *   4. call the backend's .open hook, if any.
 *
 * Returns 0 on success, -EPERM without the capability, -ENODEV when no
 * backend module is registered or it cannot be pinned, the error encoded by
 * proc_mem_open(), or the non-zero value returned by the backend hook.
 *
 * On every failure path all references taken here are released again:
 * mm_idle_release() is the only other place that drops them, and it does not
 * run for a file whose open failed.
 */
static int mm_idle_open(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = NULL;
	struct module *module = NULL;
	int ret = -ENODEV;

	if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	page_scan_lock(NULL, 1, NULL);
	module = proc_page_scan_operations.owner;
	if (module != NULL && try_module_get(module))
		ret = 0;
	page_scan_lock(NULL, 0, NULL);
	if (ret != 0) {
		/* no scan ko installed, avoid to return valid file */
		return ret;
	}

	mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(mm)) {
		ret = PTR_ERR(mm);
		goto out_put_module;
	}

	file->private_data = mm;

	if (proc_page_scan_operations.open)
		ret = proc_page_scan_operations.open(inode, file);
	if (ret != 0)
		goto out_drop_mm;

	return 0;

out_drop_mm:
	/* Undo step 3 so the mm is not leaked and no stale pointer remains. */
	file->private_data = NULL;
	if (mm)
		mmdrop(mm);
out_put_module:
	module_put(module);
	return ret;
}

/*
 * release() handler for the idle_pages proc file.
 *
 * Flushes the TLB of the attached mm unless mm_kvm() reports a KVM instance
 * for it, runs the backend's .release hook, drops the mm reference held in
 * file->private_data, and finally releases the module reference taken in
 * mm_idle_open().
 *
 * The mm reference is dropped only after the backend hook has returned:
 * the hook receives @file, whose private_data still points at the mm, so the
 * mm must stay valid until the hook is done with it.
 *
 * Returns the backend hook's return value, or 0 when no hook is installed.
 */
static int mm_idle_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;
	int ret = 0;

	if (mm && !mm_kvm(mm))
		flush_tlb_mm(mm);

	if (proc_page_scan_operations.release)
		ret = proc_page_scan_operations.release(inode, file);

	if (mm)
		mmdrop(mm);

	if (proc_page_scan_operations.owner)
		module_put(proc_page_scan_operations.owner);

	return ret;
}

/*
 * ioctl() handler for the idle_pages proc file: a thin forwarder to
 * proc_page_scan_operations.unlocked_ioctl.
 *
 * Returns the backend's result, or 0 when no backend ioctl hook is installed.
 * NOTE(review): reporting success for an unhandled command is unusual
 * (-ENOTTY is the conventional answer) - confirm whether user space relies
 * on the 0.
 */
static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	if (!proc_page_scan_operations.unlocked_ioctl)
		return 0;

	return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg);
}

/*
 * File operations behind the per-task "idle_pages" proc entry.
 *
 * Every handler except .llseek is a wrapper defined in this file that
 * forwards to the matching member of proc_page_scan_operations.
 * No .write handler is provided.
 */
const struct file_operations proc_mm_idle_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= mm_idle_read,
	.open		= mm_idle_open,
	.release	= mm_idle_release,
	.unlocked_ioctl = mm_idle_ioctl,
};

fs/proc/etmem_scan.c

0 → 100644
+1383 −0

File added.

Preview size limit exceeded, changes collapsed.

fs/proc/etmem_scan.h

0 → 100644
+149 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PAGE_IDLE_H
#define _PAGE_IDLE_H

#include <linux/types.h>

/*
 * Scan option flags.
 *
 * The first three reuse the values of existing open(2) file flags.
 * NOTE(review): this header only includes <linux/types.h>, so the O_*
 * constants have to be made visible by whoever includes it.
 */
#define SCAN_HUGE_PAGE		O_NONBLOCK	/* only huge page */
#define SCAN_SKIM_IDLE		O_NOFOLLOW	/* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE         O_NOATIME       /* report pte/pmd dirty bit */

/*
 * The remaining flags are defined on bit values that the original author
 * describes as not used by file flags.
 * NOTE(review): verify VMA_SCAN_FLAG (0x1000) against the O_* values of
 * every supported architecture.
 */
#define SCAN_AS_HUGE		0100000000      /* treat normal page as hugepage in vm */
#define SCAN_IGN_HOST		0200000000      /* ignore host access when scan vm */
#define VM_SCAN_HOST		0400000000      /* scan and add host page for vm hole(internal) */
#define VMA_SCAN_FLAG           0x1000        /* scan the specific vma with flag */

/* Bitwise OR of every scan flag defined above. */
#define ALL_SCAN_FLAGS		(SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \
				SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG)

/*
 * ioctl interface.  All four commands share the magic number 0x66 and are
 * declared with _IOW, i.e. user space passes an unsigned int to the kernel.
 * The names suggest the argument is a mask of scan flags to add or remove -
 * confirm against the ioctl handler in etmem_scan.c.
 */
#define IDLE_SCAN_MAGIC         0x66
#define IDLE_SCAN_ADD_FLAGS	_IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int)
#define IDLE_SCAN_REMOVE_FLAGS	_IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int)
#define VMA_SCAN_ADD_FLAGS      _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int)
#define VMA_SCAN_REMOVE_FLAGS   _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int)

/*
 * Type codes for scan results.
 *
 * PIP_TYPE() extracts the type from bits 4-7 of a value, so every enumerator
 * that is encoded that way must stay below 16; IDLE_PAGE_TYPE_MAX is
 * currently 11.
 */
enum ProcIdlePageType {
	PTE_ACCESSED,	/* 4k page */
	PMD_ACCESSED,	/* 2M page */
	PUD_PRESENT,	/* 1G page */

	/* dirty variants, by name: PTE-level and PMD-level */
	PTE_DIRTY_M,
	PMD_DIRTY_M,

	/* idle (not accessed) variants */
	PTE_IDLE,
	PMD_IDLE,
	PMD_IDLE_PTES,	/* all PTE idle */

	/* holes, by name: ranges with nothing mapped - confirm in etmem_scan.c */
	PTE_HOLE,
	PMD_HOLE,

	/* not a page type: marks a command entry, see PIP_CMD_SET_HVA */
	PIP_CMD,

	/* number of enumerators above; keep last */
	IDLE_PAGE_TYPE_MAX
};

/*
 * Packing helpers for a one-byte value: the upper nibble (bits 4-7) carries
 * the type, the lower nibble (bits 0-3) carries the size/count.
 *
 * Every argument is parenthesised so that expression arguments such as
 * "a | b" are evaluated as a whole before the shift/mask is applied.
 * PIP_COMPOSE() does not mask its inputs; callers pass values that already
 * fit in four bits.
 */
#define PIP_TYPE(a)		(0xf & ((a) >> 4))
#define PIP_SIZE(a)		(0xf & (a))
#define PIP_COMPOSE(type, nr)	(((type) << 4) | (nr))

/* Command value: type PIP_CMD with a zero lower nibble. */
#define PIP_CMD_SET_HVA		PIP_COMPOSE(PIP_CMD, 0)

/*
 * All-ones sentinel, defined only if no other header has provided
 * INVALID_PAGE already.  Parenthesised so the expansion is a single
 * primary expression wherever it is used.
 */
#ifndef INVALID_PAGE
#define INVALID_PAGE (~0UL)
#endif

/*
 * Bit number of the "accessed" flag in a page-table entry.
 * ARM64 hard-codes bit 10 (presumably the PTE access flag - confirm against
 * the arm64 pgtable definitions); every other architecture uses its own
 * _PAGE_BIT_ACCESSED.
 */
#ifdef CONFIG_ARM64
#define _PAGE_MM_BIT_ACCESSED 10
#else
#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED
#endif

#ifdef CONFIG_X86_64
/*
 * EPT entry bits used by the ept_*() helpers below: accessed is bit 8,
 * dirty is bit 9.
 */
#define _PAGE_BIT_EPT_ACCESSED	8
#define _PAGE_BIT_EPT_DIRTY		9
#define _PAGE_EPT_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)

/*
 * Mask of the three lowest bits (value 7); an entry counts as present when
 * any of them is set.
 */
#define _PAGE_EPT_PRESENT	(_AT(pteval_t, 7))

/*
 * Non-zero when any bit of _PAGE_EPT_PRESENT is set in the PTE's flags.
 * The result is the masked flag value converted to int, not a 0/1 boolean.
 */
static inline int ept_pte_present(pte_t a)
{
	int present = pte_flags(a) & _PAGE_EPT_PRESENT;

	return present;
}

/*
 * Non-zero when any bit of _PAGE_EPT_PRESENT is set in the PMD's flags.
 * The result is the masked flag value converted to int, not a 0/1 boolean.
 */
static inline int ept_pmd_present(pmd_t a)
{
	int present = pmd_flags(a) & _PAGE_EPT_PRESENT;

	return present;
}

/*
 * Non-zero when any bit of _PAGE_EPT_PRESENT is set in the PUD's flags.
 * The result is the masked flag value converted to int, not a 0/1 boolean.
 */
static inline int ept_pud_present(pud_t a)
{
	int present = pud_flags(a) & _PAGE_EPT_PRESENT;

	return present;
}

/*
 * Non-zero when any bit of _PAGE_EPT_PRESENT is set in the P4D's flags.
 * The result is the masked flag value converted to int, not a 0/1 boolean.
 */
static inline int ept_p4d_present(p4d_t a)
{
	int present = p4d_flags(a) & _PAGE_EPT_PRESENT;

	return present;
}

/*
 * Non-zero when any bit of _PAGE_EPT_PRESENT is set in the PGD's flags.
 * The result is the masked flag value converted to int, not a 0/1 boolean.
 */
static inline int ept_pgd_present(pgd_t a)
{
	int present = pgd_flags(a) & _PAGE_EPT_PRESENT;

	return present;
}

/*
 * Non-zero when the EPT accessed bit (_PAGE_EPT_ACCESSED) is set in the
 * PTE's flags.  Returns the masked bit itself, not a 0/1 boolean.
 */
static inline int ept_pte_accessed(pte_t a)
{
	int accessed = pte_flags(a) & _PAGE_EPT_ACCESSED;

	return accessed;
}

/*
 * Non-zero when the EPT accessed bit (_PAGE_EPT_ACCESSED) is set in the
 * PMD's flags.  Returns the masked bit itself, not a 0/1 boolean.
 */
static inline int ept_pmd_accessed(pmd_t a)
{
	int accessed = pmd_flags(a) & _PAGE_EPT_ACCESSED;

	return accessed;
}

/*
 * Non-zero when the EPT accessed bit (_PAGE_EPT_ACCESSED) is set in the
 * PUD's flags.  Returns the masked bit itself, not a 0/1 boolean.
 */
static inline int ept_pud_accessed(pud_t a)
{
	int accessed = pud_flags(a) & _PAGE_EPT_ACCESSED;

	return accessed;
}

/*
 * Non-zero when the EPT accessed bit (_PAGE_EPT_ACCESSED) is set in the
 * P4D's flags.  Returns the masked bit itself, not a 0/1 boolean.
 */
static inline int ept_p4d_accessed(p4d_t a)
{
	int accessed = p4d_flags(a) & _PAGE_EPT_ACCESSED;

	return accessed;
}

/*
 * Non-zero when the EPT accessed bit (_PAGE_EPT_ACCESSED) is set in the
 * PGD's flags.  Returns the masked bit itself, not a 0/1 boolean.
 */
static inline int ept_pgd_accessed(pgd_t a)
{
	int accessed = pgd_flags(a) & _PAGE_EPT_ACCESSED;

	return accessed;
}
#endif

/*
 * Hook table defined and exported by fs/proc/etmem_proc.c; declared non-const
 * so that its members can be assigned at run time.
 */
extern struct file_operations proc_page_scan_operations;

/*
 * Status codes; by name they signal that the kernel-side buffer or the
 * caller's buffer is full - confirm usage in etmem_scan.c.
 */
#define PAGE_IDLE_KBUF_FULL	1
#define PAGE_IDLE_BUF_FULL	2
/* Smallest accepted buffer: two 64-bit words plus three bytes (19 bytes). */
#define PAGE_IDLE_BUF_MIN	(sizeof(uint64_t) * 2 + 3)

/* Size in bytes of the kpie[] array in struct page_idle_ctrl. */
#define PAGE_IDLE_KBUF_SIZE	8000

/*
 * State for one page-idle scan.
 *
 * The struct embeds an 8000-byte array, so an instance is roughly two 4K
 * pages large.  Field descriptions marked "presumably" are inferred from the
 * names only; the code that uses them is not part of this header.
 */
struct page_idle_ctrl {
	struct mm_struct *mm;	/* address space being scanned */
	struct kvm *kvm;	/* presumably the VM owning @mm, if any - confirm */

	uint8_t kpie[PAGE_IDLE_KBUF_SIZE];	/* kernel-side byte buffer */
	int pie_read;		/* presumably the current position in kpie[] - confirm */
	int pie_read_max;	/* presumably the upper bound for pie_read - confirm */

	void __user *buf;	/* user-space buffer pointer */
	int buf_size;		/* presumably the size of @buf in bytes - confirm */
	int bytes_copied;	/* presumably bytes already copied to @buf - confirm */

	unsigned long next_hva;		/* GPA for EPT; VA for PT */
	unsigned long gpa_to_hva;
	unsigned long restart_gpa;
	unsigned long last_va;

	unsigned int flags;	/* presumably a mask of the SCAN_* flags - confirm */
};

#endif
Loading