Commit 1c434a44 authored by Nikita Panov, committed by Denis Darvish
Browse files

mm: per-NUMA node replication core infrastructure

kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBOJU2



-------------------------------------------------

In the current design, modifications of mutable kernel data do not require
synchronization between translation tables, because on 64-bit platforms
all physical memory is already mapped in kernel space and this mapping
is persistent.
In user space, translation table synchronization is quite rare, because
the only case that requires it is a new PUD/P4D allocation.

At the current stage only the PGD layer is replicated for user space.
Please refer to the diagrams below.

TT overview:
                   NODE 0                   NODE 1
              USER      KERNEL         USER      KERNEL
           ---------------------    ---------------------
     PGD   | | | | |   | | | |*|    | | | | |   | | | |*|
           ---------------------    ---------------------
                              |                        |
            -------------------      -------------------
            |                        |
           ---------------------    ---------------------
     PUD   | | | | |   | | |*|*|    | | | | |   | | |*|*|
           ---------------------    ---------------------
                              |                        |
            -------------------      -------------------
            |                        |
           ---------------------    ---------------------
     PMD   |READ-ONLY|MUTABLE  |    |READ-ONLY|MUTABLE  |
           ---------------------    ---------------------
                  |       |                  |     |
                  |       --------------------------
                  |               |          |
                --------       -------      --------
   PHYS         |      |       |     |      |      |
    MEM         --------       -------      --------
                <------>                    <------>
                 NODE 0        Shared        NODE 1
                               between
                               nodes
* - entries unique in each table

TT synchronization:
               NODE 0                    NODE 1
          USER      KERNEL          USER      KERNEL
       ---------------------     ---------------------
 PGD   | | |0| |   | | | | |     | | |0| |   | | | | |
       ---------------------     ---------------------
                              |
                              |
                              |
                              |
                              |  PUD_ALLOC / P4D_ALLOC
                              |
                              |      IN USERSPACE
                              |
                              \/
       ---------------------     ---------------------
 PGD   | | |p| |   | | | | |     | | |p| |   | | | | |
       ---------------------     ---------------------
            |                         |
            |                         |
            ---------------------------
                     |
                    ---------------------
 PUD/P4D            | | | | |   | | | | |
                    ---------------------

Acked-by: Alexander Grubnikov <alexander.grubnikov@huawei.com>
Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
Acked-by: Denis Darvish <darvish.denis@huawei.com>
Co-developed-by: Artem Kuzin <artem.kuzin@huawei.com>
Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com>
Co-developed-by: Nikita Panov <panov.nikita@huawei.com>
Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
parent 86987167
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -981,7 +981,11 @@ struct mm_struct {
#endif
	} __randomize_layout;

#ifdef CONFIG_KERNEL_REPLICATION
	KABI_USE(1, pgd_t **pgd_numa)
#else
	KABI_RESERVE(1)
#endif
	KABI_RESERVE(2)
	KABI_RESERVE(3)
	KABI_RESERVE(4)
+198 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_NUMA_REPLICATION_H
#define _LINUX_NUMA_REPLICATION_H

#ifdef CONFIG_KERNEL_REPLICATION

#include <linux/kabi.h>

/*
 * GCC, when run in a GNU dialect mode (e.g. -std=gnu11), predefines the
 * legacy macro `linux` as 1.  The argument handed to KABI_HIDE_INCLUDE()
 * is macro-expanded before it is used as a header name, so without this
 * guard <linux/mm.h> would be rewritten to <1/mm.h>.
 *
 * A macro's value cannot be stashed in another macro (the replacement
 * list is only expanded at the point of use, so a "saved" copy would end
 * up naming an undefined identifier once the helper is removed).  Use
 * push_macro/pop_macro instead: they restore the previous definition
 * exactly, including the "not defined" state.
 */
#pragma push_macro("linux")
#undef linux

#include KABI_HIDE_INCLUDE(<linux/mm_types.h>)
#include KABI_HIDE_INCLUDE(<linux/nodemask.h>)
#include KABI_HIDE_INCLUDE(<linux/module.h>)
#include KABI_HIDE_INCLUDE(<linux/mm.h>)
#include KABI_HIDE_INCLUDE(<asm/numa_replication.h>)

#pragma pop_macro("linux")

/*
 * Page table level reported by get_propagation_level().  The
 * *_populate_replicated() helpers compare against these values to decide
 * whether an entry is written into every per-node table or only into the
 * table they were handed.
 */
typedef enum {
	NONE,			/* 0 */
	PMD_PROPAGATION,	/* 1 */
	PUD_PROPAGATION,	/* 2 */
	P4D_PROPAGATION,	/* 3 */
	PGD_PROPAGATION,	/* 4 */
} propagation_level_t;

extern nodemask_t replica_nodes;

/*
 * Step @nid through every node set in replica_nodes.  The walk stops as
 * soon as the node iterator hands back MAX_NUMNODES.
 */
#define for_each_memory_node(nid)				\
	for ((nid) = first_node(replica_nodes);			\
	     (nid) != MAX_NUMNODES;				\
	     (nid) = next_node((nid), replica_nodes))

/* pgd_numa[] entry of @mm that belongs to node @nid. */
#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid])
/* Same lookup, for the node reported by numa_node_id(). */
#define this_node_pgd(mm) per_node_pgd(mm, numa_node_id())

/*
 * Tell whether @addr lies inside the inclusive window
 * [PAGE_TABLE_REPLICATION_LEFT, PAGE_TABLE_REPLICATION_RIGHT].
 */
static inline bool numa_addr_has_replica(const void *addr)
{
	unsigned long va = (unsigned long)addr;

	if (va < PAGE_TABLE_REPLICATION_LEFT)
		return false;

	return va <= PAGE_TABLE_REPLICATION_RIGHT;
}

/*
 * Out-of-line replication API, only declared here.  When
 * CONFIG_KERNEL_REPLICATION is disabled, the #else branch of this header
 * supplies inline stubs for most of these; get_propagation_level() and
 * numa_get_memory_node() have no stub there.
 */
void __init numa_replication_init(void);
void __init numa_replicate_kernel_text(void);
void numa_replicate_kernel_rodata(void);
void numa_replication_fini(void);

bool is_text_replicated(void);
/* Level the *_populate_replicated() helpers below compare against. */
propagation_level_t get_propagation_level(void);
void numa_setup_pgd(void);
void __init_or_module *numa_get_replica(void *vaddr, int nid);
int numa_get_memory_node(int nid);
void dump_mm_pgtables(struct mm_struct *mm,
		      unsigned long start, unsigned long end);

/*
 * Walk mm->pgd_numa[] and, for every node set in replica_nodes, point
 * @replica at the entry located at the same in-page byte offset as
 * @table, cast to @table's type.
 *
 * @table:   entry pointer whose in-page offset is mirrored
 * @mm:      mm_struct whose pgd_numa[] array is walked
 * @replica: cursor, same type as @table
 * @nid:     int cursor holding the current node id
 * @offset:  unsigned long scratch; receives the byte offset of @table
 *           within its page
 *
 * @replica is assigned in the loop condition, after @nid has been checked
 * against MAX_NUMNODES.  pgd_numa[] is therefore never indexed with the
 * end-of-iteration sentinel, neither when the walk finishes nor when
 * replica_nodes is empty.
 */
#define for_each_pgtable_replica(table, mm, replica, nid, offset)			\
	for ((nid) = first_node(replica_nodes),						\
	     (offset) = ((unsigned long)(table)) & (~PAGE_MASK);			\
	     (nid) != MAX_NUMNODES &&							\
	     ((replica) = (typeof(table))(((unsigned long)(mm)->pgd_numa[nid]) +	\
					  (offset)), 1);				\
	     (nid) = next_node((nid), replica_nodes))

/*
 * Install @p4dp into the PGD entry @pgdp.  When get_propagation_level()
 * reports PGD_PROPAGATION, the matching entry of every table visited by
 * for_each_pgtable_replica() is populated; otherwise only @pgdp is.
 */
static inline void pgd_populate_replicated(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
{
	unsigned long pg_off;
	pgd_t *node_pgd;
	int node;

	if (get_propagation_level() != PGD_PROPAGATION) {
		pgd_populate(mm, pgdp, p4dp);
		return;
	}

	for_each_pgtable_replica(pgdp, mm, node_pgd, node, pg_off)
		pgd_populate(mm, node_pgd, p4dp);
}

/*
 * Install @pudp into the P4D entry @p4dp.  When get_propagation_level()
 * reports P4D_PROPAGATION, the matching entry of every table visited by
 * for_each_pgtable_replica() is populated; otherwise only @p4dp is.
 */
static inline void p4d_populate_replicated(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
{
	unsigned long pg_off;
	p4d_t *node_p4d;
	int node;

	if (get_propagation_level() != P4D_PROPAGATION) {
		p4d_populate(mm, p4dp, pudp);
		return;
	}

	for_each_pgtable_replica(p4dp, mm, node_p4d, node, pg_off)
		p4d_populate(mm, node_p4d, pudp);
}

/*
 * Install @pmdp into the PUD entry @pudp.  When get_propagation_level()
 * reports PUD_PROPAGATION, the matching entry of every table visited by
 * for_each_pgtable_replica() is populated; otherwise only @pudp is.
 */
static inline void pud_populate_replicated(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
{
	unsigned long pg_off;
	pud_t *node_pud;
	int node;

	if (get_propagation_level() != PUD_PROPAGATION) {
		pud_populate(mm, pudp, pmdp);
		return;
	}

	for_each_pgtable_replica(pudp, mm, node_pud, node, pg_off)
		pud_populate(mm, node_pud, pmdp);
}

/*
 * Install the PTE page @ptep into the PMD entry @pmdp.  When
 * get_propagation_level() reports PMD_PROPAGATION, the matching entry of
 * every table visited by for_each_pgtable_replica() is populated;
 * otherwise only @pmdp is.
 */
static inline void pmd_populate_replicated(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
{
	unsigned long pg_off;
	pmd_t *node_pmd;
	int node;

	if (get_propagation_level() != PMD_PROPAGATION) {
		pmd_populate(mm, pmdp, ptep);
		return;
	}

	for_each_pgtable_replica(pmdp, mm, node_pmd, node, pg_off)
		pmd_populate(mm, node_pmd, ptep);
}

#else

#if defined(linux)
#define tmp_linux_value linux
#undef linux
#endif

#include KABI_HIDE_INCLUDE(<linux/mm.h>)

#if defined(tmp_linux_value)
#define linux tmp_linux_value
#undef tmp_linux_value
#endif

/* Replication disabled: every node shares the single mm->pgd; @nid is ignored. */
#define per_node_pgd(mm, nid) ((mm)->pgd)
#define this_node_pgd(mm) per_node_pgd(mm, 0)

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void numa_setup_pgd(void) { }

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void __init numa_replication_init(void) { }

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void __init numa_replicate_kernel_text(void) { }

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void numa_replicate_kernel_rodata(void) { }

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void numa_replication_fini(void) { }

/* Stub for builds without CONFIG_KERNEL_REPLICATION: always reports false. */
static inline bool numa_addr_has_replica(const void *addr)
{
	return false;
}

/* Stub for builds without CONFIG_KERNEL_REPLICATION: always reports false. */
static inline bool is_text_replicated(void)
{
	return false;
}

/*
 * Stub for builds without CONFIG_KERNEL_REPLICATION: @nid is ignored and
 * the result is simply lm_alias(@vaddr).
 */
static inline void *numa_get_replica(void *vaddr, int nid)
{
	return lm_alias(vaddr);
}

/* No-op stub for builds without CONFIG_KERNEL_REPLICATION. */
static inline void dump_mm_pgtables(struct mm_struct *mm,
				    unsigned long start, unsigned long end) { }

/*
 * Without CONFIG_KERNEL_REPLICATION the replicated populate helpers are
 * plain aliases of the regular populate routines.
 */
#define pgd_populate_replicated pgd_populate
#define p4d_populate_replicated p4d_populate
#define pud_populate_replicated pud_populate
#define pmd_populate_replicated pmd_populate

#endif /*CONFIG_KERNEL_REPLICATION*/
#endif /*_LINUX_NUMA_REPLICATION_H*/
+1 −0
Original line number Diff line number Diff line
@@ -140,6 +140,7 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_KERNEL_REPLICATION) += numa_kernel_replication.o
obj-$(CONFIG_SHARE_POOL) += share_pool.o
obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
obj-$(CONFIG_ETMEM) += etmem.o
+133 −15
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@
#include <linux/sched/sysctl.h>
#include <linux/userswap.h>
#include <linux/dynamic_pool.h>
#include <linux/numa_kernel_replication.h>

#include <trace/events/kmem.h>

@@ -185,6 +186,96 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member)
	trace_rss_stat(mm, member);
}

#ifdef CONFIG_KERNEL_REPLICATION

/*
 * Replication-aware variant: clear the PMD entry that references a PTE
 * page and queue that page for freeing.  With PMD_PROPAGATION the entry
 * is cleared in every table visited by for_each_pgtable_replica(),
 * otherwise only in @pmd.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	/* Fetch the PTE page before any entry pointing at it is wiped. */
	pgtable_t token = pmd_pgtable(*pmd);
	unsigned long pg_off;
	pmd_t *node_pmd;
	int node;

	if (get_propagation_level() != PMD_PROPAGATION) {
		pmd_clear(pmd);
	} else {
		for_each_pgtable_replica(pmd, tlb->mm, node_pmd, node, pg_off)
			pmd_clear(node_pmd);
	}

	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
	/* Keep 'token' referenced even if the helpers above ignore it. */
	(void)token;
}

/*
 * Replication-aware variant: clear the PUD entry that references a PMD
 * table and queue that table for freeing.  With PUD_PROPAGATION the
 * entry is cleared in every table visited by for_each_pgtable_replica(),
 * otherwise only in @pud.
 */
static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				    unsigned long addr)
{
	/* Look the PMD table up before the entry leading to it is wiped. */
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long pg_off;
	pud_t *node_pud;
	int node;

	if (get_propagation_level() != PUD_PROPAGATION) {
		pud_clear(pud);
	} else {
		for_each_pgtable_replica(pud, tlb->mm, node_pud, node, pg_off)
			pud_clear(node_pud);
	}

	pmd_free_tlb(tlb, pmd, addr);
	mm_dec_nr_pmds(tlb->mm);
	/* Keep 'pmd' referenced even if the helpers above ignore it. */
	(void)pmd;
}

/*
 * Replication-aware variant: clear the P4D entry that references a PUD
 * table and queue that table for freeing.  With P4D_PROPAGATION the
 * entry is cleared in every table visited by for_each_pgtable_replica(),
 * otherwise only in @p4d.
 */
static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				    unsigned long addr)
{
	/* Look the PUD table up before the entry leading to it is wiped. */
	pud_t *pud = pud_offset(p4d, addr);
	unsigned long pg_off;
	p4d_t *node_p4d;
	int node;

	if (get_propagation_level() != P4D_PROPAGATION) {
		p4d_clear(p4d);
	} else {
		for_each_pgtable_replica(p4d, tlb->mm, node_p4d, node, pg_off)
			p4d_clear(node_p4d);
	}

	pud_free_tlb(tlb, pud, addr);
	mm_dec_nr_puds(tlb->mm);
	/* Keep 'pud' referenced even if the helpers above ignore it. */
	(void)pud;
}

/*
 * Replication-aware variant: clear the PGD entry that references a P4D
 * table and queue that table for freeing.  With PGD_PROPAGATION the
 * entry is cleared in every table visited by for_each_pgtable_replica(),
 * otherwise only in @pgd.
 */
static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				    unsigned long addr)
{
	/* Look the P4D table up before the entry leading to it is wiped. */
	p4d_t *p4d = p4d_offset(pgd, addr);
	unsigned long pg_off;
	pgd_t *node_pgd;
	int node;

	if (get_propagation_level() != PGD_PROPAGATION) {
		pgd_clear(pgd);
	} else {
		for_each_pgtable_replica(pgd, tlb->mm, node_pgd, node, pg_off)
			pgd_clear(node_pgd);
	}

	p4d_free_tlb(tlb, p4d, addr);
	/*
	 * With 4-level paging selected in Kconfig every helper except
	 * p4d_offset() is empty, which would otherwise trigger an
	 * unused-variable error for 'p4d'.
	 */
	(void)p4d;
}
#else

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
@@ -196,8 +287,43 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
	(void)token;
}

/*
 * Clear the PUD entry @pud and queue the PMD table it referenced for
 * freeing, then drop the mm's PMD counter via mm_dec_nr_pmds().
 *
 * Declared 'static inline' like the sibling __free_pud_range() and
 * __free_p4d_range() helpers.
 */
static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				    unsigned long addr)
{
	/* Look the PMD table up before the entry leading to it is wiped. */
	pmd_t *pmd = pmd_offset(pud, addr);

	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, addr);
	mm_dec_nr_pmds(tlb->mm);
	/* Keep 'pmd' referenced even if the helpers above ignore it. */
	(void)pmd;
}

/*
 * Clear the P4D entry @p4d and queue the PUD table it referenced for
 * freeing, then drop the mm's PUD counter via mm_dec_nr_puds().
 */
static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				    unsigned long addr)
{
	pud_t *pud_table;

	/* Must be looked up before the entry leading to it is wiped. */
	pud_table = pud_offset(p4d, addr);

	p4d_clear(p4d);
	pud_free_tlb(tlb, pud_table, addr);
	mm_dec_nr_puds(tlb->mm);
	/* Keep the local referenced even if the helpers above ignore it. */
	(void)pud_table;
}

/*
 * Clear the PGD entry @pgd and queue the P4D table it referenced for
 * freeing.
 */
static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				    unsigned long addr)
{
	p4d_t *p4d_table;

	/* Must be looked up before the entry leading to it is wiped. */
	p4d_table = p4d_offset(pgd, addr);

	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d_table, addr);
	/* Keep the local referenced even if the helpers above ignore it. */
	(void)p4d_table;
}

#endif

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
@@ -226,10 +352,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
	__free_pmd_range(tlb, pud, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
@@ -260,10 +383,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
	__free_pud_range(tlb, p4d, start);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -294,9 +414,7 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
	__free_p4d_range(tlb, pgd, start);
}

/*
@@ -440,7 +558,7 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		pmd_populate_replicated(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
@@ -6050,7 +6168,7 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
		p4d_free(mm, new);
	} else {
		smp_wmb(); /* See comment in pmd_install() */
		pgd_populate(mm, pgd, new);
		pgd_populate_replicated(mm, pgd, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
@@ -6094,7 +6212,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		smp_wmb(); /* See comment in pmd_install() */
		p4d_populate(mm, p4d, new);
		p4d_populate_replicated(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
	spin_unlock(&mm->page_table_lock);
@@ -6118,7 +6236,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		smp_wmb(); /* See comment in pmd_install() */
		pud_populate(mm, pud, new);
		pud_populate_replicated(mm, pud, new);
	} else {	/* Another has populated it */
		pmd_free(mm, new);
	}
+759 −0

File added.

Preview size limit exceeded, changes collapsed.