Commit bdc47013 authored by Ze Zuo

mm/mem_sampling.c: Drive NUMA balancing via mem_sampling access data

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ

CVE: NA

--------------------------------

Feed the page access data obtained from mem_sampling into NUMA
balancing as the equivalent of NUMA hinting faults. The existing
per-task and per-group fault stats are now built from the page
access information provided by mem_sampling, so it is no longer
necessary to scan the address space to induce NUMA hinting faults.
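
In outline, each sampled <vaddr, paddr> pair ends up at the same
accounting entry point a hinting fault would reach. The following is
a condensed, illustrative sketch only (the helper name is invented,
and locking, the vaddr-side VMA and policy checks, and the actual
migration are omitted); see do_numa_access() in the diff below for
the real code:

static void sample_to_hint_fault(u64 vaddr, u64 paddr)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(paddr));

	if (!page)
		return;

	count_vm_numa_event(NUMA_HINT_FAULTS);
	/* Same per-task fault accounting a hinting fault would do. */
	task_numa_fault(page_cpupid_last(page), page_to_nid(page), 1, 0);
}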

NUMA balancing registers a callback with mem_sampling to subscribe
to its page access records. The records are then processed through
the task_work framework, where the migration policy is applied.
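
Only a few fields of each record are consumed by this path. The
names below are those used in the diff; the full layout of
struct mem_sampling_record is declared with the callback types in
include/linux/mem_sampling.h, so this excerpt is illustrative:

struct mem_sampling_record {
	/* ... */
	u64	context_id;	/* compared against current->pid */
	u64	virt_addr;	/* sampled virtual address */
	u64	phys_addr;	/* sampled physical address */
	/* ... */
};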

The sampling policy used by NUMA balancing can be switched at
runtime between the original task_tick_numa() scanning and
mem_sampling records via a sysctl interface. The same effect can be
achieved at build time by toggling CONFIG_NUMABALANCING_MEM_SAMPLING.
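
The hunks below only show the static key and the mode variable
(sched_numabalancing_mem_sampling, sysctl_mem_sampling_mode); the
sysctl handler itself falls outside this excerpt. A minimal sketch
of how such a handler could flip the static key, following the usual
sysctl_numa_balancing() pattern (the handler name and the ctl_table
plumbing here are assumptions):

static int sysctl_numabalancing_mem_sampling(struct ctl_table *table,
		int write, void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_numabalancing_mem_sampling);

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0 || !write)
		return err;

	if (state)
		static_branch_enable(&sched_numabalancing_mem_sampling);
	else
		static_branch_disable(&sched_numabalancing_mem_sampling);
	return 0;
}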

Note that THP migration is not supported for now.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
parent 1d536414
+2 −0
@@ -12,6 +12,8 @@
#ifndef __MEM_SAMPLING_H
#define __MEM_SAMPLING_H

DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);

enum mem_sampling_sample_type {
	MEM_SAMPLING_L1D_ACCESS		= 1 << 0,
	MEM_SAMPLING_L1D_MISS		= 1 << 1,
+11 −0
@@ -30,6 +30,7 @@
#endif
#include <linux/sched/grid_qos.h>
#include <linux/bpf_sched.h>
#include <linux/mem_sampling.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
@@ -2967,6 +2968,16 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
	/*
	 * If we are using access hints from hardware (like using
	 * SPE), don't scan the address space.
	 * Note that currently PMD-level page migration is not
	 * supported.
	 */
	if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
		return;
#endif
	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
+13 −0
@@ -1025,6 +1025,19 @@ config MEM_SAMPLING
	  features. It requires at least one hardware pmu (e.g. ARM_SPE_MEM_SAMPLING) to
	  be enabled.

config NUMABALANCING_MEM_SAMPLING
	bool "Use hardware memory samples for numa balancing"
	depends on MEM_SAMPLING && NUMA_BALANCING
	default n
	help
	  This feature relies on hardware memory sampling. When enabled, the
	  NUMA balancing policy uses memory access information obtained from
	  hardware sampling instead of the native software PROT_NONE scheme.
	  Enabling this feature may hurt the performance of some workloads,
	  for example programs whose memory accesses are lightweight.

	  If unsure, say N.

source "mm/damon/Kconfig"

endmenu
+167 −0
@@ -18,6 +18,10 @@
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mem_sampling.h>
#include <linux/mempolicy.h>
#include <linux/task_work.h>
#include <linux/migrate.h>
#include <linux/sched/numa_balancing.h>

struct mem_sampling_ops_struct mem_sampling_ops;

@@ -31,6 +35,15 @@ struct mem_sampling_record_cb_list_entry {
};
LIST_HEAD(mem_sampling_record_cb_list);

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
struct mem_sampling_numa_access_work {
	struct callback_head work;
	u64 vaddr, paddr;
	/*
	 * For debugging: the buffer may be decoded on a CPU other than
	 * the one that handled the sampling interrupt.
	 */
	int cpu;
};
#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */

void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
{
	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
@@ -103,6 +116,158 @@ static void mem_sampling_process(struct mem_sampling_record *record_base, int nr
		mem_sampling_ops.sampling_stop();
}

#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

/*
 * Called from task_work context to act upon the page access.
 *
 * Physical address (provided by SPE) is used directly instead
 * of walking the page tables to get to the PTE/page. Hence we
 * don't check if PTE is writable for the TNF_NO_GROUP
 * optimization, which means RO pages are considered for grouping.
 */
static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
{
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	struct page *page = NULL;
	int page_nid = NUMA_NO_NODE;
	int last_cpupid;
	int target_nid;
	int flags = 0;

	if (!mm)
		return;

	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, vaddr);
	if (!vma)
		goto out_unlock;

	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
		goto out_unlock;

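	/*
	 * Read-only file mappings (e.g. shared library text) are not
	 * worth migrating; same filtering as task_numa_work().
	 */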
	if (!vma->vm_mm ||
	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
		goto out_unlock;

	if (!vma_is_accessible(vma))
		goto out_unlock;

	page = pfn_to_online_page(PHYS_PFN(paddr));
	if (!page || is_zone_device_page(page))
		goto out_unlock;

	if (unlikely(!PageLRU(page)))
		goto out_unlock;

	/* TODO: handle PTE-mapped or PMD-mapped THP */
	if (PageCompound(page))
		goto out_unlock;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);

	target_nid = numa_migrate_prep(page, vma, vaddr, page_nid, &flags);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	if (migrate_misplaced_page(page, vma, target_nid)) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else {
		flags |= TNF_MIGRATE_FAIL;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);

out_unlock:
	mmap_read_unlock(mm);
}

static void task_mem_sampling_access_work(struct callback_head *work)
{
	struct mem_sampling_numa_access_work *iwork =
		container_of(work, struct mem_sampling_numa_access_work, work);

	if (iwork->cpu == smp_processor_id())
		do_numa_access(current, iwork->vaddr, iwork->paddr);
	kfree(iwork);
}

static void numa_create_taskwork(u64 vaddr, u64 paddr, int cpu)
{
	struct mem_sampling_numa_access_work *iwork = NULL;

	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
	if (!iwork)
		return;

	iwork->vaddr = vaddr;
	iwork->paddr = paddr;
	iwork->cpu = cpu;

	init_task_work(&iwork->work, task_mem_sampling_access_work);
	if (task_work_add(current, &iwork->work, TWA_RESUME))
		kfree(iwork);	/* task is exiting; avoid leaking the work item */
}

static void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
{
	struct task_struct *p = current;
	u64 vaddr = record->virt_addr;
	u64 paddr = record->phys_addr;

	/* Discard kernel address accesses */
	if (vaddr & (1UL << 63))
		return;

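	/* Drop records that were sampled on behalf of a different task. */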
	if (p->pid != record->context_id)
		return;

	numa_create_taskwork(vaddr, paddr, smp_processor_id());
}

static void numa_balancing_mem_sampling_cb_register(void)
{
	mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb);
}

static void numa_balancing_mem_sampling_cb_unregister(void)
{
	mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb);
}
#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */

static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
{
#ifdef CONFIG_ARM_SPE_MEM_SAMPLING
@@ -112,6 +277,8 @@ static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
#endif
}

DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);

static int sysctl_mem_sampling_mode;

static void __set_mem_sampling_state(bool enabled)