Commit bdc47013 authored by Ze Zuo

mm/mem_sampling.c: Drive NUMA balancing via mem_sampling access data

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ

CVE: NA

--------------------------------

Feed the page access data obtained from mem_sampling into NUMA
balancing as the equivalent of NUMA hinting faults. The existing
per-task and per-group fault stats are now built from the page
access information provided by mem_sampling, so it is no longer
necessary to scan the address space to induce NUMA hinting faults.
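
In outline, each sampled <vaddr, paddr> pair ends up at the same
accounting entry point a hinting fault would reach. The following is
a condensed, illustrative sketch only (the helper name is invented,
and locking, the vaddr-side VMA and policy checks, and the actual
migration are omitted); see do_numa_access() in the diff below for
the real code:

static void sample_to_hint_fault(u64 vaddr, u64 paddr)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(paddr));

	if (!page)
		return;

	count_vm_numa_event(NUMA_HINT_FAULTS);
	/* Same per-task fault accounting a hinting fault would do. */
	task_numa_fault(page_cpupid_last(page), page_to_nid(page), 1, 0);
}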

NUMA balancing registers a callback with mem_sampling to subscribe
to its page access records. The records are then processed through
the task_work framework, where the migration policy is applied.
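
Only a few fields of each record are consumed by this path. The
names below are those used in the diff; the full layout of
struct mem_sampling_record is declared with the callback types in
include/linux/mem_sampling.h, so this excerpt is illustrative:

struct mem_sampling_record {
	/* ... */
	u64	context_id;	/* compared against current->pid */
	u64	virt_addr;	/* sampled virtual address */
	u64	phys_addr;	/* sampled physical address */
	/* ... */
};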

The sampling policy used by NUMA balancing can be switched at
runtime between the original task_tick_numa() scanning and
mem_sampling records via a sysctl interface. The same effect can be
achieved at build time by toggling CONFIG_NUMABALANCING_MEM_SAMPLING.
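
The hunks below only show the static key and the mode variable
(sched_numabalancing_mem_sampling, sysctl_mem_sampling_mode); the
sysctl handler itself falls outside this excerpt. A minimal sketch
of how such a handler could flip the static key, following the usual
sysctl_numa_balancing() pattern (the handler name and the ctl_table
plumbing here are assumptions):

static int sysctl_numabalancing_mem_sampling(struct ctl_table *table,
		int write, void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_numabalancing_mem_sampling);

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0 || !write)
		return err;

	if (state)
		static_branch_enable(&sched_numabalancing_mem_sampling);
	else
		static_branch_disable(&sched_numabalancing_mem_sampling);
	return 0;
}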

Note that THP migration is not supported for now.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
parent 1d536414
+2 −0
@@ -12,6 +12,8 @@
#ifndef __MEM_SAMPLING_H
#define __MEM_SAMPLING_H

DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);

enum mem_sampling_sample_type {
	MEM_SAMPLING_L1D_ACCESS		= 1 << 0,
	MEM_SAMPLING_L1D_MISS		= 1 << 1,
+11 −0
@@ -30,6 +30,7 @@
#endif
#include <linux/sched/grid_qos.h>
#include <linux/bpf_sched.h>
#include <linux/mem_sampling.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
@@ -2967,6 +2968,16 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
	/*
	 * If we are using access hints from hardware (like using
	 * SPE), don't scan the address space.
	 * Note that currently PMD-level page migration is not
	 * supported.
	 */
	if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
		return;
#endif
	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
+13 −0
@@ -1025,6 +1025,19 @@ config MEM_SAMPLING
	  features. It requires at least one hardware pmu (e.g. ARM_SPE_MEM_SAMPLING) to
	  be enabled.

config NUMABALANCING_MEM_SAMPLING
	bool "Use hardware memory samples for numa balancing"
	depends on MEM_SAMPLING && NUMA_BALANCING
	default n
	help
	  This feature relies on hardware memory sampling. When enabled, the
	  NUMA balancing policy uses memory access information obtained from
	  hardware sampling instead of the native software PROT_NONE scheme.
	  Enabling this feature may hurt the performance of some workloads,
	  for example programs whose memory accesses are lightweight.

	  If unsure, say N.

source "mm/damon/Kconfig"

endmenu
+167 −0
@@ -18,6 +18,10 @@
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mem_sampling.h>
#include <linux/mempolicy.h>
#include <linux/task_work.h>
#include <linux/migrate.h>
#include <linux/sched/numa_balancing.h>

struct mem_sampling_ops_struct mem_sampling_ops;

@@ -31,6 +35,15 @@ struct mem_sampling_record_cb_list_entry {
};
LIST_HEAD(mem_sampling_record_cb_list);

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
struct mem_sampling_numa_access_work {
	struct callback_head work;
	u64 vaddr, paddr;
	/*
	 * For debugging: the buffer may be decoded on a CPU other than
	 * the one that handled the sampling interrupt.
	 */
	int cpu;
};
#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */

void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
{
	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
@@ -103,6 +116,158 @@ static void mem_sampling_process(struct mem_sampling_record *record_base, int nr
		mem_sampling_ops.sampling_stop();
}

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

/*
 * Called from task_work context to act upon the page access.
 *
 * Physical address (provided by SPE) is used directly instead
 * of walking the page tables to get to the PTE/page. Hence we
 * don't check if PTE is writable for the TNF_NO_GROUP
 * optimization, which means RO pages are considered for grouping.
 */
static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
{
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	struct page *page = NULL;
	int page_nid = NUMA_NO_NODE;
	int last_cpupid;
	int target_nid;
	int flags = 0;

	if (!mm)
		return;

	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, vaddr);
	if (!vma)
		goto out_unlock;

	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
		goto out_unlock;

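	/*
	 * Read-only file mappings (e.g. shared library text) are not
	 * worth migrating; same filtering as task_numa_work().
	 */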
	if (!vma->vm_mm ||
	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
		goto out_unlock;

	if (!vma_is_accessible(vma))
		goto out_unlock;

	page = pfn_to_online_page(PHYS_PFN(paddr));
	if (!page || is_zone_device_page(page))
		goto out_unlock;

	if (unlikely(!PageLRU(page)))
		goto out_unlock;

	/* TODO: handle PTE-mapped or PMD-mapped THP */
	if (PageCompound(page))
		goto out_unlock;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);

	target_nid = numa_migrate_prep(page, vma, vaddr, page_nid, &flags);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	if (migrate_misplaced_page(page, vma, target_nid)) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else {
		flags |= TNF_MIGRATE_FAIL;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);

out_unlock:
	mmap_read_unlock(mm);
}

static void task_mem_sampling_access_work(struct callback_head *work)
{
	struct mem_sampling_numa_access_work *iwork =
		container_of(work, struct mem_sampling_numa_access_work, work);

	if (iwork->cpu == smp_processor_id())
		do_numa_access(current, iwork->vaddr, iwork->paddr);
	kfree(iwork);
}

static void numa_create_taskwork(u64 vaddr, u64 paddr, int cpu)
{
	struct mem_sampling_numa_access_work *iwork = NULL;

	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
	if (!iwork)
		return;

	iwork->vaddr = vaddr;
	iwork->paddr = paddr;
	iwork->cpu = cpu;

	init_task_work(&iwork->work, task_mem_sampling_access_work);
	if (task_work_add(current, &iwork->work, TWA_RESUME))
		kfree(iwork);	/* task is exiting; avoid leaking the work item */
}

static void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
{
	struct task_struct *p = current;
	u64 vaddr = record->virt_addr;
	u64 paddr = record->phys_addr;

	/* Discard kernel address accesses */
	if (vaddr & (1UL << 63))
		return;

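	/* Drop records that were sampled on behalf of a different task. */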
	if (p->pid != record->context_id)
		return;

	numa_create_taskwork(vaddr, paddr, smp_processor_id());
}

static void numa_balancing_mem_sampling_cb_register(void)
{
	mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb);
}

static void numa_balancing_mem_sampling_cb_unregister(void)
{
	mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb);
}
#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */

static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
{
#ifdef CONFIG_ARM_SPE_MEM_SAMPLING
@@ -112,6 +277,8 @@ static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
#endif
}

DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);

static int sysctl_mem_sampling_mode;

static void __set_mem_sampling_state(bool enabled)