Commit cc74ca30 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'

Yury Norov says:

====================
sched: cpumask: improve on cpumask_local_spread() locality

cpumask_local_spread() currently checks local node for presence of i'th
CPU, and then if it finds nothing makes a flat search among all non-local
CPUs. We can do it better by checking CPUs per NUMA hops.

This has significant performance implications on NUMA machines, for example
when using NUMA-aware allocated memory together with NUMA-aware IRQ
affinity hints.

Performance tests from patch 8 of this series for mellanox network
driver show:

  TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
  Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121

  +-------------------------+-----------+------------------+------------------+
  |                         | BW (Gbps) | TX side CPU util | RX side CPU util |
  +-------------------------+-----------+------------------+------------------+
  | Baseline                | 52.3      | 6.4 %            | 17.9 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on TX side only | 52.6      | 5.2 %            | 18.5 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on RX side only | 94.9      | 11.9 %           | 27.2 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on both sides   | 95.1      | 8.4 %            | 27.3 %           |
  +-------------------------+-----------+------------------+------------------+

  Bottleneck in RX side is released, reached linerate (~1.8x speedup).
  ~30% less cpu util on TX.
====================

Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 383d9f87 2ac4980c
Loading
Loading
Loading
Loading
+16 −2
Original line number Diff line number Diff line
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
static int comp_irqs_request(struct mlx5_core_dev *dev)
{
	struct mlx5_eq_table *table = dev->priv.eq_table;
	const struct cpumask *prev = cpu_none_mask;
	const struct cpumask *mask;
	int ncomp_eqs = table->num_comp_eqs;
	u16 *cpus;
	int ret;
	int cpu;
	int i;

	ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
		ret = -ENOMEM;
		goto free_irqs;
	}
	for (i = 0; i < ncomp_eqs; i++)
		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);

	i = 0;
	rcu_read_lock();
	for_each_numa_hop_mask(mask, dev->priv.numa_node) {
		for_each_cpu_andnot(cpu, mask, prev) {
			cpus[i] = cpu;
			if (++i == ncomp_eqs)
				goto spread_done;
		}
		prev = mask;
	}
spread_done:
	rcu_read_unlock();
	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
	kfree(cpus);
	if (ret < 0)
+20 −0
Original line number Diff line number Diff line
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
				nr_cpumask_bits, cpumask_check(cpu));
}

/**
 * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
 * @srcp1: the cpumask pointer
 * @srcp2: the cpumask pointer
 * @srcp3: the cpumask pointer
 * @cpu: the N'th cpu to find, starting from 0
 *
 * Returns >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
							const struct cpumask *srcp2,
							const struct cpumask *srcp3)
{
	return find_nth_and_andnot_bit(cpumask_bits(srcp1),
					cpumask_bits(srcp2),
					cpumask_bits(srcp3),
					nr_cpumask_bits, cpumask_check(cpu));
}

#define CPU_BITS_NONE						\
{								\
	[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL			\
+33 −0
Original line number Diff line number Diff line
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
				unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
					unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
					const unsigned long *addr3, unsigned long size,
					unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
					 const unsigned long *addr2, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
	return __find_nth_andnot_bit(addr1, addr2, size, n);
}

/**
 * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
 *			     excluding those set in 3rd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @addr3: The 3rd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns @size.
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
					const unsigned long *addr2,
					const unsigned long *addr3,
					unsigned long size, unsigned long n)
{
	if (n >= size)
		return size;

	if (small_const_nbits(size)) {
		unsigned long val =  *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);

		return val ? fns(val, n) : size;
	}

	return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
}

#ifndef find_first_and_bit
/**
 * find_first_and_bit - find the first set bit in both memory regions
+33 −0
Original line number Diff line number Diff line
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
	return cpumask_of_node(cpu_to_node(cpu));
}

#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
	return cpumask_nth(cpu, cpus);
}

static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
	return ERR_PTR(-EOPNOTSUPP);
}
#endif	/* CONFIG_NUMA */

/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 */
#define for_each_numa_hop_mask(mask, node)				       \
	for (unsigned int __hops = 0;					       \
	     mask = (node != NUMA_NO_NODE || __hops) ?			       \
		     sched_numa_hop_mask(node, __hops) :		       \
		     cpu_online_mask,					       \
	     !IS_ERR_OR_NULL(mask);					       \
	     __hops++)

#endif /* _LINUX_TOPOLOGY_H */
+90 −0
Original line number Diff line number Diff line
@@ -3,6 +3,8 @@
 * Scheduler topology setup/handling methods
 */

#include <linux/bsearch.h>

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
	return found;
}

struct __cmp_key {
	const struct cpumask *cpus;
	struct cpumask ***masks;
	int node;
	int cpu;
	int w;
};

static int hop_cmp(const void *a, const void *b)
{
	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
	struct cpumask **cur_hop = *(struct cpumask ***)b;
	struct __cmp_key *k = (struct __cmp_key *)a;

	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
		return 1;

	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
	if (k->w <= k->cpu)
		return 0;

	return -1;
}

/*
 * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
 *                             closest to @cpu from @cpumask.
 * cpumask: cpumask to find a cpu from
 * cpu: Nth cpu to find
 *
 * returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
	struct cpumask ***hop_masks;
	int hop, ret = nr_cpu_ids;

	rcu_read_lock();

	k.masks = rcu_dereference(sched_domains_numa_masks);
	if (!k.masks)
		goto unlock;

	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
	hop = hop_masks	- k.masks;

	ret = hop ?
		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
		cpumask_nth_and(cpu, cpus, k.masks[0][node]);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);

/**
 * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
 *                         @node
 * @node: The node to count hops from.
 * @hops: Include CPUs up to that many hops away. 0 means local node.
 *
 * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
 * @node, an error value otherwise.
 *
 * Requires rcu_lock to be held. Returned cpumask is only valid within that
 * read-side section, copy it if required beyond that.
 *
 * Note that not all hops are equal in distance; see sched_init_numa() for how
 * distances and masks are handled.
 * Also note that this is a reflection of sched_domains_numa_masks, which may change
 * during the lifetime of the system (offline nodes are taken out of the masks).
 */
const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
	struct cpumask ***masks;

	if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
		return ERR_PTR(-EINVAL);

	masks = rcu_dereference(sched_domains_numa_masks);
	if (!masks)
		return ERR_PTR(-EBUSY);

	return masks[hops][node];
}
EXPORT_SYMBOL_GPL(sched_numa_hop_mask);

#endif /* CONFIG_NUMA */

static int __sdt_alloc(const struct cpumask *cpu_map)
Loading