Commit 5f9a4f4a authored by Muchun Song's avatar Muchun Song Committed by Linus Torvalds
Browse files

mm: memcontrol: add the missing numa_stat interface for cgroup v2



In the cgroup v1, we have a numa_stat interface.  This is useful for
providing visibility into the numa locality information within an memcg
since the pages are allowed to be allocated from any physical node.  One
of the use cases is evaluating application performance by combining this
information with the application's CPU allocation.  But the cgroup v2 does
not.  So this patch adds the missing information.

Suggested-by: default avatarShakeel Butt <shakeelb@google.com>
Signed-off-by: default avatarMuchun Song <songmuchun@bytedance.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Reviewed-by: default avatarShakeel Butt <shakeelb@google.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Link: https://lkml.kernel.org/r/20200916100030.71698-2-songmuchun@bytedance.com


Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent bd0b230f
Loading
Loading
Loading
Loading
+49 −20
Original line number Diff line number Diff line
@@ -1259,6 +1259,10 @@ PAGE_SIZE multiple when read back.
	can show up in the middle. Don't rely on items remaining in a
	fixed position; use the keys to look up specific values!

	If the entry has no per-node counter(or not show in the
	mempry.numa_stat). We use 'npn'(non-per-node) as the tag
	to indicate that it will not show in the mempry.numa_stat.

	  anon
		Amount of memory used in anonymous mappings such as
		brk(), sbrk(), and mmap(MAP_ANONYMOUS)
@@ -1270,15 +1274,11 @@ PAGE_SIZE multiple when read back.
	  kernel_stack
		Amount of memory allocated to kernel stacks.

	  slab
		Amount of memory used for storing in-kernel data
		structures.

	  percpu
	  percpu(npn)
		Amount of memory used for storing per-cpu kernel
		data structures.

	  sock
	  sock(npn)
		Amount of memory used in network transmission buffers

	  shmem
@@ -1318,11 +1318,9 @@ PAGE_SIZE multiple when read back.
		Part of "slab" that cannot be reclaimed on memory
		pressure.

	  pgfault
		Total number of page faults incurred

	  pgmajfault
		Number of major page faults incurred
	  slab(npn)
		Amount of memory used for storing in-kernel data
		structures.

	  workingset_refault_anon
		Number of refaults of previously evicted anonymous pages.
@@ -1348,37 +1346,68 @@ PAGE_SIZE multiple when read back.
	  workingset_nodereclaim
		Number of times a shadow node has been reclaimed

	  pgrefill
	  pgfault(npn)
		Total number of page faults incurred

	  pgmajfault(npn)
		Number of major page faults incurred

	  pgrefill(npn)
		Amount of scanned pages (in an active LRU list)

	  pgscan
	  pgscan(npn)
		Amount of scanned pages (in an inactive LRU list)

	  pgsteal
	  pgsteal(npn)
		Amount of reclaimed pages

	  pgactivate
	  pgactivate(npn)
		Amount of pages moved to the active LRU list

	  pgdeactivate
	  pgdeactivate(npn)
		Amount of pages moved to the inactive LRU list

	  pglazyfree
	  pglazyfree(npn)
		Amount of pages postponed to be freed under memory pressure

	  pglazyfreed
	  pglazyfreed(npn)
		Amount of reclaimed lazyfree pages

	  thp_fault_alloc
	  thp_fault_alloc(npn)
		Number of transparent hugepages which were allocated to satisfy
		a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
                is not set.

	  thp_collapse_alloc
	  thp_collapse_alloc(npn)
		Number of transparent hugepages which were allocated to allow
		collapsing an existing range of pages. This counter is not
		present when CONFIG_TRANSPARENT_HUGEPAGE is not set.

  memory.numa_stat
	A read-only nested-keyed file which exists on non-root cgroups.

	This breaks down the cgroup's memory footprint into different
	types of memory, type-specific details, and other information
	per node on the state of the memory management system.

	This is useful for providing visibility into the NUMA locality
	information within an memcg since the pages are allowed to be
	allocated from any physical node. One of the use case is evaluating
	application performance by combining this information with the
	application's CPU allocation.

	All memory amounts are in bytes.

	The output format of memory.numa_stat is::

	  type N0=<bytes in node 0> N1=<bytes in node 1> ...

	The entries are ordered to be human readable, and new entries
	can show up in the middle. Don't rely on items remaining in a
	fixed position; use the keys to look up specific values!

	The entries can refer to the memory.stat.

  memory.swap.current
	A read-only single value file which exists on non-root
	cgroups.
+110 −60
Original line number Diff line number Diff line
@@ -1448,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
	return false;
}

struct memory_stat {
	const char *name;
	unsigned int ratio;
	unsigned int idx;
};

static struct memory_stat memory_stats[] = {
	{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
	{ "file", PAGE_SIZE, NR_FILE_PAGES },
	{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
	{ "percpu", 1, MEMCG_PERCPU_B },
	{ "sock", PAGE_SIZE, MEMCG_SOCK },
	{ "shmem", PAGE_SIZE, NR_SHMEM },
	{ "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
	{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
	{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/*
	 * The ratio will be initialized in memory_stats_init(). Because
	 * on some architectures, the macro of HPAGE_PMD_SIZE is not
	 * constant(e.g. powerpc).
	 */
	{ "anon_thp", 0, NR_ANON_THPS },
#endif
	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
	{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
	{ "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
	{ "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
	{ "unevictable", PAGE_SIZE, NR_UNEVICTABLE },

	/*
	 * Note: The slab_reclaimable and slab_unreclaimable must be
	 * together and slab_reclaimable must be in front.
	 */
	{ "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
};

static int __init memory_stats_init(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		if (memory_stats[i].idx == NR_ANON_THPS)
			memory_stats[i].ratio = HPAGE_PMD_SIZE;
#endif
		VM_BUG_ON(!memory_stats[i].ratio);
		VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
	}

	return 0;
}
pure_initcall(memory_stats_init);

static char *memory_stat_format(struct mem_cgroup *memcg)
{
	struct seq_buf s;
@@ -1468,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
	 * Current memory state:
	 */

	seq_buf_printf(&s, "anon %llu\n",
		       (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "kernel_stack %llu\n",
		       (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
		       1024);
	seq_buf_printf(&s, "slab %llu\n",
		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
	seq_buf_printf(&s, "percpu %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
	seq_buf_printf(&s, "sock %llu\n",
		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
		       PAGE_SIZE);
	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

	seq_buf_printf(&s, "shmem %llu\n",
		       (u64)memcg_page_state(memcg, NR_SHMEM) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_mapped %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_dirty %llu\n",
		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
		       PAGE_SIZE);
	seq_buf_printf(&s, "file_writeback %llu\n",
		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
		       PAGE_SIZE);
		size = memcg_page_state(memcg, memory_stats[i].idx);
		size *= memory_stats[i].ratio;
		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_buf_printf(&s, "anon_thp %llu\n",
		       (u64)memcg_page_state(memcg, NR_ANON_THPS) *
		       HPAGE_PMD_SIZE);
#endif

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	seq_buf_printf(&s, "slab_reclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
			       memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
			seq_buf_printf(&s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */

@@ -1521,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
		       memcg_events(memcg, PGFAULT));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
		       memcg_events(memcg, PGMAJFAULT));

	seq_buf_printf(&s, "workingset_refault_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
	seq_buf_printf(&s, "workingset_refault_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
	seq_buf_printf(&s, "workingset_activate_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
	seq_buf_printf(&s, "workingset_activate_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
	seq_buf_printf(&s, "workingset_restore_anon %lu\n",
		       memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
	seq_buf_printf(&s, "workingset_restore_file %lu\n",
		       memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));

	seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
		       memcg_events(memcg, PGREFILL));
	seq_buf_printf(&s, "pgscan %lu\n",
@@ -6374,6 +6389,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
	return 0;
}

#ifdef CONFIG_NUMA
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
	int i;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		int nid;

		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
			continue;

		seq_printf(m, "%s", memory_stats[i].name);
		for_each_node_state(nid, N_MEMORY) {
			u64 size;
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
			size = lruvec_page_state(lruvec, memory_stats[i].idx);
			size *= memory_stats[i].ratio;
			seq_printf(m, " N%d=%llu", nid, size);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif

static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6451,6 +6495,12 @@ static struct cftype memory_files[] = {
		.name = "stat",
		.seq_show = memory_stat_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memory_numa_stat_show,
	},
#endif
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,