Commit 7192e563 authored by Alex Kogan, committed by Wei Li
Browse files

locking/qspinlock: Introduce starvation avoidance into CNA

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8T8XV

Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-5-alex.kogan@oracle.com



--------------------------------

Keep track of the time the thread at the head of the secondary queue
has been waiting, and force inter-node handoff once this time passes
a preset threshold. The default value for the threshold (1ms) can be
overridden with the new kernel boot command-line option
"qspinlock.numa_spinlock_threshold_ns".

Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Wei Li <liwei391@huawei.com>
parent 62cfb35f
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -4714,6 +4714,14 @@
			[KNL] Number of legacy pty's. Overwrites compiled-in
			default number.

	qspinlock.numa_spinlock_threshold_ns=	[NUMA, PV_OPS]
			Set the time threshold in nanoseconds for
			intra-node lock hand-offs before the
			NUMA-aware spinlock is forced to be passed to
			a thread on another NUMA node. Smaller values
			result in a more fair, but less performant spinlock,
			and vice versa. The default value is 1000000 (=1ms).

	quiet		[KNL] Disable most log messages

	r128=		[HW,DRM]
+65 −16
Original line number Diff line number Diff line
@@ -4,6 +4,8 @@
#endif

#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/moduleparam.h>

/*
 * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
@@ -37,19 +39,39 @@
 * gradually filter the primary queue, leaving only waiters running on the same
 * preferred NUMA node.
 *
 * We change the NUMA node preference after a waiter at the head of the
 * secondary queue spins for a certain amount of time (1ms, by default).
 * We do that by flushing the secondary queue into the head of the primary queue,
 * effectively changing the preference to the NUMA node of the waiter at the head
 * of the secondary queue at the time of the flush.
 *
 * For more details, see https://arxiv.org/abs/1810.05600.
 *
 * Authors: Alex Kogan <alex.kogan@oracle.com>
 *          Dave Dice <dave.dice@oracle.com>
 */

/*
 * Sentinel stored in cna_node::start_time by cna_wait_head_or_lock() to tell
 * cna_lock_handoff() that the secondary queue must be flushed into the
 * primary queue (inter-node handoff forced by the starvation threshold).
 */
#define FLUSH_SECONDARY_QUEUE	1

/* Per-CPU CNA queue node; overlays struct mcs_spinlock (must be first). */
struct cna_node {
	struct mcs_spinlock	mcs;		/* base MCS node */
	u16			numa_node;	/* current NUMA-node preference (may be inherited at handoff) */
	u16			real_numa_node;	/* the CPU's actual NUMA node; numa_node is reset to this in cna_init_node() */
	u32			encoded_tail;	/* self */
	u64			start_time;	/* local_clock() when this node created the secondary queue; 0 iff that queue is empty */
};

/*
 * Starvation-avoidance threshold: once the waiter at the head of the
 * secondary queue has waited this long, the lock is forced to another
 * NUMA node. Tunable via qspinlock.numa_spinlock_threshold_ns= (0644).
 */
static ulong numa_spinlock_threshold_ns = 1000000;   /* 1ms, by default */
module_param(numa_spinlock_threshold_ns, ulong, 0644);

/*
 * Has the waiter at the head of the secondary queue been waiting longer
 * than numa_spinlock_threshold_ns? Compares against local_clock(), the
 * same clock that stamped cn->start_time in cna_splice_next().
 */
static inline bool intra_node_threshold_reached(struct cna_node *cn)
{
	u64 deadline = cn->start_time + numa_spinlock_threshold_ns;

	return local_clock() > deadline;
}

static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
@@ -92,6 +114,7 @@ static __always_inline void cna_init_node(struct mcs_spinlock *node)
	struct cna_node *cn = (struct cna_node *)node;

	cn->numa_node = cn->real_numa_node;
	cn->start_time = 0;
}

/*
@@ -191,8 +214,14 @@ static void cna_splice_next(struct mcs_spinlock *node,

	/* stick `next` on the secondary queue tail */
	if (node->locked <= 1) { /* if secondary queue is empty */
		struct cna_node *cn = (struct cna_node *)node;

		/* create secondary queue */
		next->next = next;

		cn->start_time = local_clock();
		/* secondary queue is not empty iff start_time != 0 */
		WARN_ON(!cn->start_time);
	} else {
		/* add to the tail of the secondary queue */
		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
@@ -240,12 +269,18 @@ static int cna_order_queue(struct mcs_spinlock *node)
/*
 * Called while this node is at the head of the primary queue, before we
 * start spinning on the lock word. If the secondary queue's head has been
 * starved past the threshold, flag a flush for cna_lock_handoff();
 * otherwise, use the time we would spend spin-waiting on
 * _Q_LOCKED_PENDING_MASK to sort the queues with cna_order_queue().
 *
 * Always returns 0: we tell the caller we did not acquire the lock, so it
 * proceeds to wait for it as usual.
 */
static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
						 struct mcs_spinlock *node)
{
	struct cna_node *cn = (struct cna_node *)node;

	if (cn->start_time && intra_node_threshold_reached(cn)) {
		cn->start_time = FLUSH_SECONDARY_QUEUE;
		return 0;
	}

	while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
		cpu_relax();

	return 0; /* we lied; we didn't wait, go do so now */
}
@@ -253,11 +288,11 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
static inline void cna_lock_handoff(struct mcs_spinlock *node,
				 struct mcs_spinlock *next)
{
	struct cna_node *cn = (struct cna_node *)node;
	u32 val = 1;

	if (cn->start_time != FLUSH_SECONDARY_QUEUE) {
		if (node->locked > 1) {
		struct cna_node *cn = (struct cna_node *)node;

			val = node->locked;	/* preserve secondary queue */

			/*
@@ -271,6 +306,20 @@ static inline void cna_lock_handoff(struct mcs_spinlock *node,
			 * preference even if the next waiter is on a different node.
			 */
			((struct cna_node *)next)->numa_node = cn->numa_node;

			((struct cna_node *)next)->start_time = cn->start_time;
		}
	} else {
		/*
		 * We decided to flush the secondary queue;
		 * this can only happen if that queue is not empty.
		 */
		WARN_ON(node->locked <= 1);
		/*
		 * Splice the secondary queue onto the primary queue and pass the lock
		 * to the longest waiting remote waiter.
		 */
		next = cna_splice_head(NULL, 0, node, next);
	}

	arch_mcs_lock_handoff(&next->locked, val);