bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu(). (5af6807b) · Commits · EulixOS / Software / Kernel

include/linux/bpf_mem_alloc.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
		/*
		 * kmalloc/kfree equivalent.
		 * The allocator handle and the object are both passed by pointer;
		 * bpf_mem_alloc() returns the allocated object.
		 * bpf_mem_free_rcu() has the same signature as bpf_mem_free().
		 */
		void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
		void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
		void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);

		/*
		 * kmem_cache_alloc/free equivalent.
		 * Same calling convention as above, without a size argument.
		 */
		void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
		void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
		void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
		void bpf_mem_cache_raw_free(void *ptr);
		void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);

kernel/bpf/memalloc.c

+126 −3

Original line number	Diff line number	Diff line
		@@ -101,6 +101,15 @@ struct bpf_mem_cache {
		bool draining;
		struct bpf_mem_cache *tgt;

		/* list of objects to be freed after RCU GP */
		struct llist_head free_by_rcu;
		struct llist_node *free_by_rcu_tail;
		struct llist_head waiting_for_gp;
		struct llist_node *waiting_for_gp_tail;
		struct rcu_head rcu;
		atomic_t call_rcu_in_progress;
		struct llist_head free_llist_extra_rcu;

		/* list of objects to be freed after RCU tasks trace GP */
		struct llist_head free_by_rcu_ttrace;
		struct llist_head waiting_for_gp_ttrace;
		@@ -346,6 +355,69 @@ static void free_bulk(struct bpf_mem_cache *c)
		do_call_rcu_ttrace(tgt);
		}

		/*
		 * RCU callback scheduled by check_free_by_rcu().
		 *
		 * Takes everything parked on waiting_for_gp, splices it onto the target
		 * cache's free_by_rcu_ttrace list and kicks do_call_rcu_ttrace() so the
		 * objects also pass through an RCU tasks trace grace period. In every
		 * case call_rcu_in_progress is cleared on the way out so that a new
		 * batch may be submitted.
		 */
		static void __free_by_rcu(struct rcu_head *head)
		{
			struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
			struct bpf_mem_cache *tgt = c->tgt;
			struct llist_node *batch = llist_del_all(&c->waiting_for_gp);

			if (batch) {
				/* waiting_for_gp_tail was recorded when the batch was handed over */
				llist_add_batch(batch, c->waiting_for_gp_tail,
						&tgt->free_by_rcu_ttrace);

				/* The regular RCU GP has elapsed; continue with RCU tasks trace */
				do_call_rcu_ttrace(tgt);
			}

			atomic_set(&c->call_rcu_in_progress, 0);
		}

		/*
		 * Push objects queued by unit_free_rcu() towards an RCU grace period.
		 *
		 * Steps, in order:
		 *  1. fold anything parked on free_llist_extra_rcu into free_by_rcu;
		 *  2. return if free_by_rcu is empty;
		 *  3. if a previous batch is still in flight (call_rcu_in_progress was
		 *     already set), only ask RCU to hurry and return;
		 *  4. otherwise move the whole free_by_rcu list to waiting_for_gp and
		 *     either free it immediately (cache is draining) or schedule
		 *     __free_by_rcu() via call_rcu_hurry().
		 */
		static void check_free_by_rcu(struct bpf_mem_cache *c)
		{
			/* List cursors: llist_for_each_safe() iterates with node pointers */
			struct llist_node *llnode, *t;
			unsigned long flags;

			/* drain free_llist_extra_rcu */
			if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
				inc_active(c, &flags);
				llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
					/* first node pushed onto an empty list is its tail */
					if (__llist_add(llnode, &c->free_by_rcu))
						c->free_by_rcu_tail = llnode;
				dec_active(c, flags);
			}

			if (llist_empty(&c->free_by_rcu))
				return;

			if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
				/*
				 * Instead of kmalloc-ing new rcu_head and triggering 10k
				 * call_rcu() to hit rcutree.qhimark and force RCU to notice
				 * the overload just ask RCU to hurry up. There could be many
				 * objects in free_by_rcu list.
				 * This hint reduces memory consumption for an artificial
				 * benchmark from 2 Gbyte to 150 Mbyte.
				 */
				rcu_request_urgent_qs_task(current);
				return;
			}

			/* The previous batch must have been consumed by __free_by_rcu() */
			WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

			/* Hand the whole list, together with its recorded tail, over */
			inc_active(c, &flags);
			WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
			c->waiting_for_gp_tail = c->free_by_rcu_tail;
			dec_active(c, flags);

			if (unlikely(READ_ONCE(c->draining))) {
				/* Cache is being torn down: free now instead of queueing a callback */
				free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
				atomic_set(&c->call_rcu_in_progress, 0);
			} else {
				call_rcu_hurry(&c->rcu, __free_by_rcu);
			}
		}

		static void bpf_mem_refill(struct irq_work *work)
		{
		struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
		@@ -360,6 +432,8 @@ static void bpf_mem_refill(struct irq_work *work)
		alloc_bulk(c, c->batch, NUMA_NO_NODE);
		else if (cnt > c->high_watermark)
		free_bulk(c);

		check_free_by_rcu(c);
		}

		static void notrace irq_work_raise(struct bpf_mem_cache *c)
		@@ -488,6 +562,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
		free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
		free_all(__llist_del_all(&c->free_llist), percpu);
		free_all(__llist_del_all(&c->free_llist_extra), percpu);
		free_all(__llist_del_all(&c->free_by_rcu), percpu);
		free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
		free_all(llist_del_all(&c->waiting_for_gp), percpu);
		}

		static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
		@@ -500,8 +577,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)

		static void free_mem_alloc(struct bpf_mem_alloc *ma)
		{
		/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
		* still execute. Wait for it now before we freeing percpu caches.
		/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
		* might still execute. Wait for them.
		*
		* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
		* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
		@@ -510,7 +587,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
		* rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
		* using rcu_trace_implies_rcu_gp() as well.
		*/
		rcu_barrier_tasks_trace();
		rcu_barrier(); /* wait for __free_by_rcu */
		rcu_barrier_tasks_trace(); /* wait for __free_rcu */
		if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
		free_mem_alloc_no_barrier(ma);
		@@ -563,6 +641,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
		irq_work_sync(&c->refill_work);
		drain_mem_cache(c);
		rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
		rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		/* objcg is the same across cpus */
		if (c->objcg)
		@@ -579,6 +658,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
		irq_work_sync(&c->refill_work);
		drain_mem_cache(c);
		rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
		rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		}
		if (c->objcg)
		@@ -663,6 +743,27 @@ static void notrace unit_free(struct bpf_mem_cache c, void ptr)
		irq_work_raise(c);
		}

		/*
		 * Queue the object at @ptr on cache @c so that it is released only after
		 * an RCU grace period.
		 *
		 * The object's llist_node header sits LLIST_NODE_SZ bytes before @ptr.
		 * With interrupts disabled the node goes onto free_by_rcu when this is
		 * the only active user of @c, otherwise onto free_llist_extra_rcu.
		 * irq_work is raised afterwards unless a call_rcu() batch is already in
		 * flight for @c.
		 */
		static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
		{
			struct llist_node *llnode = ptr - LLIST_NODE_SZ;
			unsigned long flags;

			/*
			 * Load the cache pointer stored in the header slot. Casting llnode
			 * itself would make c->tgt alias the object being freed, which
			 * __free_by_rcu() would then dereference as a bpf_mem_cache.
			 * NOTE(review): relies on the allocation path having written the
			 * owning cache pointer into this slot - not visible in this hunk.
			 */
			c->tgt = *(struct bpf_mem_cache **)llnode;

			local_irq_save(flags);
			if (local_inc_return(&c->active) == 1) {
				/* first node pushed onto an empty list is its tail */
				if (__llist_add(llnode, &c->free_by_rcu))
					c->free_by_rcu_tail = llnode;
			} else {
				/* c->active was already raised: use the extra list instead */
				llist_add(llnode, &c->free_llist_extra_rcu);
			}
			local_dec(&c->active);
			local_irq_restore(flags);

			if (!atomic_read(&c->call_rcu_in_progress))
				irq_work_raise(c);
		}

		/* Called from BPF program or from sys_bpf syscall.
		* In both cases migration is disabled.
		*/
		@@ -696,6 +797,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc ma, void ptr)
		unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
		}

		/*
		 * RCU-deferred counterpart of bpf_mem_free().
		 *
		 * @ma:  allocator the object belongs to
		 * @ptr: object to free; NULL is ignored
		 *
		 * The size class is derived from ksize() of the underlying allocation,
		 * which starts LLIST_NODE_SZ bytes before @ptr. If no cache index matches
		 * (idx < 0) the call returns without doing anything.
		 */
		void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
		{
			int idx;

			if (!ptr)
				return;

			idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
			if (idx < 0)
				return;

			unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
		}

		void notrace bpf_mem_cache_alloc(struct bpf_mem_alloc ma)
		{
		void *ret;
		@@ -712,6 +827,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc ma, void ptr)
		unit_free(this_cpu_ptr(ma->cache), ptr);
		}

		/*
		 * RCU-deferred counterpart of bpf_mem_cache_free().
		 *
		 * @ma:  allocator the object belongs to
		 * @ptr: object to free; NULL is ignored
		 *
		 * Hands the object to unit_free_rcu() on this CPU's ma->cache.
		 */
		void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
		{
			if (!ptr)
				return;

			unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
		}

		/* Directly does a kfree() without putting 'ptr' back to the free_llist
		* for reuse and without waiting for a rcu_tasks_trace gp.
		* The caller must first go through the rcu_tasks_trace gp for 'ptr'