Commit 5af6807b authored by Alexei Starovoitov's avatar Alexei Starovoitov Committed by Daniel Borkmann
Browse files

bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().



Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
objects into free_by_rcu_ttrace list where they are waiting for RCU
task trace grace period to be freed into slab.

The life cycle of objects:
alloc: dequeue free_llist
free: enqeueu free_llist
free_rcu: enqueue free_by_rcu -> waiting_for_gp
free_llist above high watermark -> free_by_rcu_ttrace
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Acked-by: default avatarHou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230706033447.54696-13-alexei.starovoitov@gmail.com
parent f76faa65
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
/* kmalloc/kfree equivalent: */
void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);

/* kmem_cache_alloc/free equivalent: */
void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_cache_raw_free(void *ptr);
void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);

+126 −3
Original line number Diff line number Diff line
@@ -101,6 +101,15 @@ struct bpf_mem_cache {
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
@@ -346,6 +355,69 @@ static void free_bulk(struct bpf_mem_cache *c)
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
@@ -360,6 +432,8 @@ static void bpf_mem_refill(struct irq_work *work)
		alloc_bulk(c, c->batch, NUMA_NO_NODE);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -488,6 +562,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -500,8 +577,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
	 * still execute. Wait for it now before we freeing percpu caches.
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -510,7 +587,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier_tasks_trace();
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
@@ -563,6 +641,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		/* objcg is the same across cpus */
		if (c->objcg)
@@ -579,6 +658,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (c->objcg)
@@ -663,6 +743,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
		irq_work_raise(c);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);
	local_irq_restore(flags);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
@@ -696,6 +797,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	int idx;

	if (!ptr)
		return;

	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
	if (idx < 0)
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;
@@ -712,6 +827,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'