Commit dd89a137 authored Mar 29, 2024 by Uladzislau Rezki (Sony) Committed by Peng Zhang Mar 29, 2024

mm: vmalloc: set nr_nodes based on CPUs in a system

mainline inclusion
from mainline-v6.9-rc1
commit 8f33a2ff307248c3e55a7696f60b3658b28edb57
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9CHG1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8f33a2ff307248c3e55a7696f60b3658b28edb57

-------------------------------------------------

The number of nodes used in the alloc/free paths is set based on
num_possible_cpus() in a system.  Note, however, that the upper limit
is fixed and corresponds to 128 nodes.

For 32-bit or single-core systems, access to the global vmap heap is not
balanced.  Such small systems do not suffer from lock contention due to
the low number of CPUs.  In such cases nr_nodes is equal to 1.

Test on AMD Ryzen Threadripper 3970X 32-Core Processor:
sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64

<default perf>
 94.41%     0.89%  [kernel]        [k] _raw_spin_lock
 93.35%    93.07%  [kernel]        [k] native_queued_spin_lock_slowpath
 76.13%     0.28%  [kernel]        [k] __vmalloc_node_range
 72.96%     0.81%  [kernel]        [k] alloc_vmap_area
 56.94%     0.00%  [kernel]        [k] __get_vm_area_node
 41.95%     0.00%  [kernel]        [k] vmalloc
 37.15%     0.01%  [test_vmalloc]  [k] full_fit_alloc_test
 35.17%     0.00%  [kernel]        [k] ret_from_fork_asm
 35.17%     0.00%  [kernel]        [k] ret_from_fork
 35.17%     0.00%  [kernel]        [k] kthread
 35.08%     0.00%  [test_vmalloc]  [k] test_func
 34.45%     0.00%  [test_vmalloc]  [k] fix_size_alloc_test
 28.09%     0.01%  [test_vmalloc]  [k] long_busy_list_alloc_test
 23.53%     0.25%  [kernel]        [k] vfree.part.0
 21.72%     0.00%  [kernel]        [k] remove_vm_area
 20.08%     0.21%  [kernel]        [k] find_unlink_vmap_area
  2.34%     0.61%  [kernel]        [k] free_vmap_area_noflush
<default perf>
   vs
<patch-series perf>
 82.32%     0.22%  [test_vmalloc]  [k] long_busy_list_alloc_test
 63.36%     0.02%  [kernel]        [k] vmalloc
 63.34%     2.64%  [kernel]        [k] __vmalloc_node_range
 30.42%     4.46%  [kernel]        [k] vfree.part.0
 28.98%     2.51%  [kernel]        [k] __alloc_pages_bulk
 27.28%     0.19%  [kernel]        [k] __get_vm_area_node
 26.13%     1.50%  [kernel]        [k] alloc_vmap_area
 21.72%    21.67%  [kernel]        [k] clear_page_rep
 19.51%     2.43%  [kernel]        [k] _raw_spin_lock
 16.61%    16.51%  [kernel]        [k] native_queued_spin_lock_slowpath
 13.40%     2.07%  [kernel]        [k] free_unref_page
 10.62%     0.01%  [kernel]        [k] remove_vm_area
  9.02%     8.73%  [kernel]        [k] insert_vmap_area
  8.94%     0.00%  [kernel]        [k] ret_from_fork_asm
  8.94%     0.00%  [kernel]        [k] ret_from_fork
  8.94%     0.00%  [kernel]        [k] kthread
  8.29%     0.00%  [test_vmalloc]  [k] test_func
  7.81%     0.05%  [test_vmalloc]  [k] full_fit_alloc_test
  5.30%     4.73%  [kernel]        [k] purge_vmap_node
  4.47%     2.65%  [kernel]        [k] free_vmap_area_noflush
<patch-series perf>

This confirms that native_queued_spin_lock_slowpath goes down to
16.51% from 93.07%.

The throughput is ~12x higher:

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    10m51.271s
user    0m0.013s
sys     0m0.187s
urezki@pc638:~$

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    0m51.301s
user    0m0.015s
sys     0m0.040s
urezki@pc638:~$

Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com


Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 8f33a2ff307248c3e55a7696f60b3658b28edb57)
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>

parent 8e63c98d

mm/vmalloc.c

+23 −6

Original line number	Diff line number	Diff line
		@@ -5157,10 +5157,27 @@ static void __init vmap_init_free_space(void)
		static void vmap_init_nodes(void)
		{
		struct vmap_node *vn;
		int i, j;
		int i, n;

		#if BITS_PER_LONG == 64
		/* A high threshold of max nodes is fixed and bound to 128. */
		n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

		if (n > 1) {
		vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
		if (vn) {
		/* Node partition is 16 pages. */
		vmap_zone_size = (1 << 4) * PAGE_SIZE;
		nr_vmap_nodes = n;
		vmap_nodes = vn;
		} else {
		pr_err("Failed to allocate an array. Disable a node layer\n");
		}
		}
		#endif

		for (i = 0; i < nr_vmap_nodes; i++) {
		vn = &vmap_nodes[i];
		for (n = 0; n < nr_vmap_nodes; n++) {
		vn = &vmap_nodes[n];
		vn->busy.root = RB_ROOT;
		INIT_LIST_HEAD(&vn->busy.head);
		spin_lock_init(&vn->busy.lock);
		@@ -5169,9 +5186,9 @@ static void vmap_init_nodes(void)
		INIT_LIST_HEAD(&vn->lazy.head);
		spin_lock_init(&vn->lazy.lock);

		for (j = 0; j < MAX_VA_SIZE_PAGES; j++) {
		INIT_LIST_HEAD(&vn->pool[j].head);
		WRITE_ONCE(vn->pool[j].len, 0);
		for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
		INIT_LIST_HEAD(&vn->pool[i].head);
		WRITE_ONCE(vn->pool[i].len, 0);
		}

		spin_lock_init(&vn->pool_lock);