Commit 091d2493 authored by Uladzislau Rezki (Sony), committed by Peng Zhang
Browse files

mm: vmalloc: remove global vmap_area_root rb-tree

mainline inclusion
from mainline-v6.9-rc1
commit d093602919ad5908532142a048539800fa94a0d1
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9CHG1
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d093602919ad5908532142a048539800fa94a0d1

-------------------------------------------------

Store allocated objects in separate nodes.  A va->va_start address is
converted into the node where the VA should be placed and reside.  The
addr_to_node() function performs this address conversion to determine
the node that contains a VA.

Such an approach balances VAs across nodes; as a result, access becomes
scalable.  The number of nodes in a system depends on the number of CPUs.

Please note:

1. As of now, allocated VAs are bound to node-0. It means the
   patch does not make any difference compared with the current
   behavior;

2. The global vmap_area_lock and vmap_area_root are removed as there
   is no need for them anymore. The vmap_area_list is still kept and
   is _empty_. It is exported for kexec only;

3. The vmallocinfo and vread() have to be reworked to be able to
   handle multiple nodes.

[urezki@gmail.com: mark vmap_init_free_space() with __init tag]
  Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com
[urezki@gmail.com: fix a wrong value passed to __find_vmap_area()]
  Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com
Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com


Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit d093602919ad5908532142a048539800fa94a0d1)
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
parent 53f06cc3
Loading
Loading
Loading
Loading
+174 −68
Original line number Diff line number Diff line
@@ -800,11 +800,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

static struct rb_root purge_vmap_area_root = RB_ROOT;
@@ -844,6 +842,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. This allows access to be balanced across
 * nodes and mitigates lock contention.
 *
 * struct rb_list bundles the containers a node uses to track a set
 * of VAs together with the spinlock taken around accesses to them.
 */
struct rb_list {
	/* rb-tree the VAs are inserted into, looked up in and unlinked from. */
	struct rb_root root;
	/* List head passed to insert_vmap_area() together with root; walked by the list iterators. */
	struct list_head head;
	/* Held by callers while they touch root/head. */
	spinlock_t lock;
};

/*
 * A vmap node. At present it only carries the "busy" set: the VAs that
 * alloc_vmap_area() inserts after an allocation and that free_vmap_area()
 * unlinks again. "single" is the one statically allocated instance.
 */
static struct vmap_node {
	/* Bookkeeping data of this node. */
	struct rb_list busy;
} single;

/* Node table indexed by addr_to_node_id(); starts out pointing at "single". */
static struct vmap_node *vmap_nodes = &single;
/* Number of entries in vmap_nodes; addresses are reduced modulo this value. */
static __read_mostly unsigned int nr_vmap_nodes = 1;
/*
 * Divisor applied to an address before the modulo step. With both values
 * set to 1, addr_to_node_id() evaluates to 0 for every address.
 */
static __read_mostly unsigned int vmap_zone_size = 1;

/*
 * Translate an address into an index into vmap_nodes: the address is
 * first divided by vmap_zone_size to get a zone number, which is then
 * wrapped around the number of nodes.
 */
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
	unsigned long zone = addr / vmap_zone_size;

	return zone % nr_vmap_nodes;
}

/*
 * Return the vmap node that an address maps to, i.e. the entry of
 * vmap_nodes selected by addr_to_node_id().
 */
static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
	unsigned int id = addr_to_node_id(addr);

	return vmap_nodes + id;
}

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
@@ -875,10 +905,11 @@ unsigned long vmalloc_nr_pages(void)
}

/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
static struct vmap_area *
find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
	struct vmap_area *va = NULL;
	struct rb_node *n = vmap_area_root.rb_node;
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

@@ -1624,12 +1655,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
 */
static void free_vmap_area(struct vmap_area *va)
{
	struct vmap_node *vn = addr_to_node(va->va_start);

	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);
	spin_lock(&vn->busy.lock);
	unlink_va(va, &vn->busy.root);
	spin_unlock(&vn->busy.lock);

	/*
	 * Insert/Merge it back to the free tree/list.
@@ -1672,6 +1705,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
				int node, gfp_t gfp_mask,
				unsigned long va_flags)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	unsigned long freed;
	unsigned long addr;
@@ -1717,9 +1751,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
	va->vm = NULL;
	va->flags = va_flags;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);
	vn = addr_to_node(va->va_start);

	spin_lock(&vn->busy.lock);
	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
	spin_unlock(&vn->busy.lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
@@ -1943,26 +1979,61 @@ static void free_unmap_vmap_area(struct vmap_area *va)

struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	int i, j;

	/*
	 * addr_to_node_id(addr) converts an address to the index of the
	 * node where a VA is located. If a VA spans several zones and the
	 * passed addr is not the same as va->va_start, which is not common,
	 * we may need to scan extra nodes. See an example:
	 *
	 *      <--va-->
	 * -|-----|-----|-----|-----|-
	 *     1     2     0     1
	 *
	 * The VA resides in node 1 whereas it spans 1 and 2. If the passed
	 * addr is within the second node we should do extra work. This is
	 * a rare corner case, but on the other hand it has to be covered.
	 */
	i = j = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[i];

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr, &vmap_area_root);
	spin_unlock(&vmap_area_lock);
		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;
	} while ((i = (i + 1) % nr_vmap_nodes) != j);

	return NULL;
}

static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	int i, j;

	i = j = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[i];

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr, &vmap_area_root);
		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		if (va)
		unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);
			unlink_va(va, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;
	} while ((i = (i + 1) % nr_vmap_nodes) != j);

	return NULL;
}

/*** Per cpu kva allocator ***/
@@ -2164,6 +2235,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_node *vn;
	struct vmap_block *tmp;
	struct xarray *xa;

@@ -2171,9 +2243,10 @@ static void free_vmap_block(struct vmap_block *vb)
	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	spin_lock(&vmap_area_lock);
	unlink_va(vb->va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);
	vn = addr_to_node(vb->va->va_start);
	spin_lock(&vn->busy.lock);
	unlink_va(vb->va, &vn->busy.root);
	spin_unlock(&vn->busy.lock);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
@@ -2597,9 +2670,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
			      unsigned long flags, const void *caller)
{
	spin_lock(&vmap_area_lock);
	struct vmap_node *vn = addr_to_node(va->va_start);

	spin_lock(&vn->busy.lock);
	setup_vmalloc_vm_locked(vm, va, flags, caller);
	spin_unlock(&vmap_area_lock);
	spin_unlock(&vn->busy.lock);
}

static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -3792,6 +3867,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	struct vm_struct *vm;
	char *vaddr;
@@ -3805,8 +3881,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)

	remains = count;

	spin_lock(&vmap_area_lock);
	va = find_vmap_area_exceed_addr((unsigned long)addr);
	/* Hooked to node_0 so far. */
	vn = addr_to_node(0);
	spin_lock(&vn->busy.lock);

	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
	if (!va)
		goto finished_zero;

@@ -3814,7 +3893,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
	if ((unsigned long)addr + remains <= va->va_start)
		goto finished_zero;

	list_for_each_entry_from(va, &vmap_area_list, list) {
	list_for_each_entry_from(va, &vn->busy.head, list) {
		size_t copied;

		if (remains == 0)
@@ -3873,12 +3952,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
	}

finished_zero:
	spin_unlock(&vmap_area_lock);
	spin_unlock(&vn->busy.lock);
	/* zero-fill memory holes */
	return count - remains + zero_iter(iter, remains);
finished:
	/* Nothing remains, or We couldn't copy/zero everything. */
	spin_unlock(&vmap_area_lock);
	spin_unlock(&vn->busy.lock);

	return count - remains;
}
@@ -4212,14 +4291,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
	}

	/* insert all vm's */
	spin_lock(&vmap_area_lock);
	for (area = 0; area < nr_vms; area++) {
		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
		struct vmap_node *vn = addr_to_node(vas[area]->va_start);

		spin_lock(&vn->busy.lock);
		insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
				 pcpu_get_vm_areas);
		spin_unlock(&vn->busy.lock);
	}
	spin_unlock(&vmap_area_lock);

	/*
	 * Mark allocated areas as accessible. Do it now as a best-effort
@@ -4330,31 +4410,32 @@ bool vmalloc_dump_obj(void *object)
{
	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
	const void *caller;
	struct vm_struct *vm;
	struct vmap_area *va;
	struct vmap_node *vn;
	unsigned long addr;
	unsigned int nr_pages;
	bool success = false;

	if (!spin_trylock(&vmap_area_lock))
		return false;
	va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
	if (!va) {
		spin_unlock(&vmap_area_lock);
		return false;
	vn = addr_to_node((unsigned long)objp);

	if (spin_trylock(&vn->busy.lock)) {
		va = __find_vmap_area((unsigned long)objp, &vn->busy.root);

		if (va && va->vm) {
			addr = (unsigned long)va->vm->addr;
			caller = va->vm->caller;
			nr_pages = va->vm->nr_pages;
			success = true;
		}

	vm = va->vm;
	if (!vm) {
		spin_unlock(&vmap_area_lock);
		return false;
		spin_unlock(&vn->busy.lock);
	}
	addr = (unsigned long)vm->addr;
	caller = vm->caller;
	nr_pages = vm->nr_pages;
	spin_unlock(&vmap_area_lock);

	if (success)
		pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
			nr_pages, addr, caller);
	return true;

	return success;
}
#endif

@@ -4558,25 +4639,26 @@ EXPORT_SYMBOL(remap_vmalloc_hugepage_range);

#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
	__acquires(&vmap_purge_lock)
	__acquires(&vmap_area_lock)
{
	struct vmap_node *vn = addr_to_node(0);

	mutex_lock(&vmap_purge_lock);
	spin_lock(&vmap_area_lock);
	spin_lock(&vn->busy.lock);

	return seq_list_start(&vmap_area_list, *pos);
	return seq_list_start(&vn->busy.head, *pos);
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &vmap_area_list, pos);
	struct vmap_node *vn = addr_to_node(0);
	return seq_list_next(p, &vn->busy.head, pos);
}

static void s_stop(struct seq_file *m, void *p)
	__releases(&vmap_area_lock)
	__releases(&vmap_purge_lock)
{
	spin_unlock(&vmap_area_lock);
	struct vmap_node *vn = addr_to_node(0);

	spin_unlock(&vn->busy.lock);
	mutex_unlock(&vmap_purge_lock);
}

@@ -4619,9 +4701,11 @@ static void show_purge_info(struct seq_file *m)

static int s_show(struct seq_file *m, void *p)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	struct vm_struct *v;

	vn = addr_to_node(0);
	va = list_entry(p, struct vmap_area, list);

	if (!va->vm) {
@@ -4675,7 +4759,7 @@ static int s_show(struct seq_file *m, void *p)
	 * As a final step, dump "unpurged" areas.
	 */
final:
	if (list_is_last(&va->list, &vmap_area_list))
	if (list_is_last(&va->list, &vn->busy.head))
		show_purge_info(m);

	return 0;
@@ -4702,11 +4786,12 @@ module_init(proc_vmalloc_init);

#endif

static void vmap_init_free_space(void)
static void __init vmap_init_free_space(void)
{
	unsigned long vmap_start = 1;
	const unsigned long vmap_end = ULONG_MAX;
	struct vmap_area *busy, *free;
	struct vmap_area *free;
	struct vm_struct *busy;

	/*
	 *     B     F     B     B     B     F
@@ -4714,12 +4799,12 @@ static void vmap_init_free_space(void)
	 *  |           The KVA space           |
	 *  |<--------------------------------->|
	 */
	list_for_each_entry(busy, &vmap_area_list, list) {
		if (busy->va_start - vmap_start > 0) {
	for (busy = vmlist; busy; busy = busy->next) {
		if ((unsigned long) busy->addr - vmap_start > 0) {
			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
			if (!WARN_ON_ONCE(!free)) {
				free->va_start = vmap_start;
				free->va_end = busy->va_start;
				free->va_end = (unsigned long) busy->addr;

				insert_vmap_area_augment(free, NULL,
					&free_vmap_area_root,
@@ -4727,7 +4812,7 @@ static void vmap_init_free_space(void)
			}
		}

		vmap_start = busy->va_end;
		vmap_start = (unsigned long) busy->addr + busy->size;
	}

	if (vmap_end - vmap_start > 0) {
@@ -4743,9 +4828,23 @@ static void vmap_init_free_space(void)
	}
}

static void vmap_init_nodes(void)
{
	struct vmap_node *vn;
	int i;

	for (i = 0; i < nr_vmap_nodes; i++) {
		vn = &vmap_nodes[i];
		vn->busy.root = RB_ROOT;
		INIT_LIST_HEAD(&vn->busy.head);
		spin_lock_init(&vn->busy.lock);
	}
}

void __init vmalloc_init(void)
{
	struct vmap_area *va;
	struct vmap_node *vn;
	struct vm_struct *tmp;
	int i;

@@ -4767,6 +4866,11 @@ void __init vmalloc_init(void)
		xa_init(&vbq->vmap_blocks);
	}

	/*
	 * Setup nodes before importing vmlist.
	 */
	vmap_init_nodes();

	/* Import existing vmlist entries. */
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
@@ -4776,7 +4880,9 @@ void __init vmalloc_init(void)
		va->va_start = (unsigned long)tmp->addr;
		va->va_end = va->va_start + tmp->size;
		va->vm = tmp;
		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);

		vn = addr_to_node(va->va_start);
		insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
	}

	/*