Commit 3cb278e7 authored by Joel Fernandes (Google)'s avatar Joel Fernandes (Google) Committed by Paul E. McKenney
Browse files

rcu: Make call_rcu() lazy to save power



Implement timer-based RCU callback batching (also known as lazy
callbacks). With this we save about 5-10% of power consumed due
to RCU requests that happen when system is lightly loaded or idle.

By default, all async callbacks (queued via call_rcu) are marked
lazy. An alternate API call_rcu_hurry() is provided for the few users,
for example synchronize_rcu(), that need the old behavior.

The batch is flushed whenever a certain amount of time has passed, or
the batch on a particular CPU grows too big. Also memory pressure will
flush it in a future patch.

To handle several corner cases automagically (such as rcu_barrier() and
hotplug), we re-use bypass lists which were originally introduced to
address lock contention, to handle lazy CBs as well. The bypass list
length has the lazy CB length included in it. A separate lazy CB length
counter is also introduced to keep track of the number of lazy CBs.

[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ]
[ paulmck: Apply Zqiang feedback. ]
[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]

Suggested-by: default avatarPaul McKenney <paulmck@kernel.org>
Acked-by: default avatarFrederic Weisbecker <frederic@kernel.org>
Signed-off-by: default avatarJoel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: default avatarPaul E. McKenney <paulmck@kernel.org>
parent b8f7aca3
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_LAZY
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
#else
static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
	call_rcu(head, func);
}
#endif

/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
+8 −0
Original line number Diff line number Diff line
@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
	  Say N here if you hate read-side memory barriers.
	  Take the default if you are unsure.

config RCU_LAZY
	bool "RCU callback lazy invocation functionality"
	depends on RCU_NOCB_CPU
	default n
	help
	  To save power, batch RCU callbacks and flush after delay, memory
	  pressure, or callback list growing too big.

endmenu # "RCU Subsystem"
+8 −0
Original line number Diff line number Diff line
@@ -474,6 +474,14 @@ enum rcutorture_type {
	INVALID_RCU_FLAVOR
};

#if defined(CONFIG_RCU_LAZY)
unsigned long rcu_lazy_get_jiffies_till_flush(void);
void rcu_lazy_set_jiffies_till_flush(unsigned long j);
#else
static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
#endif

#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
			    unsigned long *gp_seq);
+1 −1
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {

void rcu_barrier(void)
{
	wait_rcu_gp(call_rcu);
	wait_rcu_gp(call_rcu_hurry);
}
EXPORT_SYMBOL(rcu_barrier);

+83 −46
Original line number Diff line number Diff line
@@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
	raw_spin_unlock_rcu_node(rnp);
}

/**
 * call_rcu() - Queue an RCU callback for invocation after a grace period.
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.  However, the callback function
 * might well execute concurrently with RCU read-side critical sections
 * that started after call_rcu() was invoked.
 *
 * RCU read-side critical sections are delimited by rcu_read_lock()
 * and rcu_read_unlock(), and may be nested.  In addition, but only in
 * v5.0 and later, regions of code across which interrupts, preemption,
 * or softirqs have been disabled also serve as RCU read-side critical
 * sections.  This includes hardware interrupt handlers, softirq handlers,
 * and NMI handlers.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical section.  On systems with more
 * than one CPU, this means that when "func()" is invoked, each CPU is
 * guaranteed to have executed a full memory barrier since the end of its
 * last RCU read-side critical section whose beginning preceded the call
 * to call_rcu().  It also means that each CPU executing an RCU read-side
 * critical section that continues beyond the start of "func()" must have
 * executed a memory barrier after the call_rcu() but before the beginning
 * of that RCU read-side critical section.  Note that these guarantees
 * include CPUs that are offline, idle, or executing in user mode, as
 * well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
 * resulting RCU callback function "func()", then both CPU A and CPU B are
 * guaranteed to execute a full memory barrier during the time interval
 * between the call to call_rcu() and the invocation of "func()" -- even
 * if CPU A and CPU B are the same CPU (but again only if the system has
 * more than one CPU).
 *
 * Implementation of these memory-ordering guarantees is described here:
 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func)
static void
__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
{
	static atomic_t doublefrees;
	unsigned long flags;
@@ -2809,7 +2770,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
	}

	check_cb_ovld(rdp);
	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
		return; // Enqueued onto ->nocb_bypass, so just leave.
	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
	rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2831,8 +2792,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(call_rcu);

#ifdef CONFIG_RCU_LAZY
/**
 * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
 * flush all lazy callbacks (including the new one) to the main ->cblist while
 * doing so.
 *
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.
 *
 * Use this API instead of call_rcu() if you don't want the callback to be
 * invoked after very long periods of time, which can happen on systems without
 * memory pressure and on systems which are lightly loaded or mostly idle.
 * This function will cause callbacks to be invoked sooner than later at the
 * expense of extra power. Other than that, this function is identical to, and
 * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
 * ordering and other functionality.
 */
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
	return __call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
#endif

/**
 * call_rcu() - Queue an RCU callback for invocation after a grace period.
 * By default the callbacks are 'lazy' and are kept hidden from the main
 * ->cblist to prevent starting of grace periods too soon.
 * If you desire grace periods to start very soon, use call_rcu_hurry().
 *
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.  However, the callback function
 * might well execute concurrently with RCU read-side critical sections
 * that started after call_rcu() was invoked.
 *
 * RCU read-side critical sections are delimited by rcu_read_lock()
 * and rcu_read_unlock(), and may be nested.  In addition, but only in
 * v5.0 and later, regions of code across which interrupts, preemption,
 * or softirqs have been disabled also serve as RCU read-side critical
 * sections.  This includes hardware interrupt handlers, softirq handlers,
 * and NMI handlers.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical section.  On systems with more
 * than one CPU, this means that when "func()" is invoked, each CPU is
 * guaranteed to have executed a full memory barrier since the end of its
 * last RCU read-side critical section whose beginning preceded the call
 * to call_rcu().  It also means that each CPU executing an RCU read-side
 * critical section that continues beyond the start of "func()" must have
 * executed a memory barrier after the call_rcu() but before the beginning
 * of that RCU read-side critical section.  Note that these guarantees
 * include CPUs that are offline, idle, or executing in user mode, as
 * well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
 * resulting RCU callback function "func()", then both CPU A and CPU B are
 * guaranteed to execute a full memory barrier during the time interval
 * between the call to call_rcu() and the invocation of "func()" -- even
 * if CPU A and CPU B are the same CPU (but again only if the system has
 * more than one CPU).
 *
 * Implementation of these memory-ordering guarantees is described here:
 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
}
EXPORT_SYMBOL_GPL(call_rcu);

/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
@@ -3507,7 +3544,7 @@ void synchronize_rcu(void)
		if (rcu_gp_is_expedited())
			synchronize_rcu_expedited();
		else
			wait_rcu_gp(call_rcu);
			wait_rcu_gp(call_rcu_hurry);
		return;
	}

@@ -3910,7 +3947,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
	 * if it's fully lazy.
	 */
	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
		atomic_inc(&rcu_state.barrier_cpu_count);
@@ -4336,7 +4373,7 @@ void rcutree_migrate_callbacks(int cpu)
	my_rdp = this_cpu_ptr(&rcu_data);
	my_rnp = my_rdp->mynode;
	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
	/* Leverage recent GPs and set GP for new callbacks. */
	needwake = rcu_advance_cbs(my_rnp, rdp) ||
Loading