Commit e987af45 authored by Linus Torvalds
Pull percpu updates from Dennis Zhou:
 "One bigger change to percpu_counter's API, allowing init and destroy
  of multiple counters via percpu_counter_init_many() and
  percpu_counter_destroy_many(). This is used to begin remediating a
  performance regression with the percpu rss stats.

  Additionally, it seems larger core-count machines are feeling the
  burden of percpu's single-threaded allocation path. Mateusz is
  thinking about it and I will spend some time on it too.

  percpu:

   - A couple of cleanups by Baoquan He and Bibo Mao. The only behavior
     change is to start printing error messages for failed atomic
     allocations too, while we are still under the warn limit.

  percpu_counter:

   - Shakeel introduced percpu counters into mm_struct, which put
     percpu allocations on the hot path [1]. Originally I spent some
     time trying to improve the percpu allocator, but instead preferred
     what Mateusz Guzik proposed: grouping at the allocation site with
     percpu_counter_init_many(). This allows a single percpu allocation
     to be shared by the counters. I like this approach because it
     gives the allocations a shared lifetime. Additionally, I believe
     many inits have higher-level synchronization requirements, like
     percpu_counter does against HOTPLUG_CPU. Therefore we can group
     these optimizations together"

Link: https://lore.kernel.org/linux-mm/20221024052841.3291983-1-shakeelb@google.com/ [1]
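
A minimal usage sketch of the grouped API described above. This is a
hypothetical consumer (NR_STATS and struct foo are illustrative, not
kernel API); the real conversion is in the kernel/fork.c diff below,
where mm_struct's rss_stat counters become one group:

	/* Hypothetical consumer of the grouped percpu_counter API. */
	#define NR_STATS	4

	struct foo {
		struct percpu_counter stat[NR_STATS];
	};

	static int foo_init(struct foo *f)
	{
		/* One percpu allocation backs all NR_STATS counters. */
		return percpu_counter_init_many(f->stat, 0, GFP_KERNEL,
						NR_STATS);
	}

	static void foo_destroy(struct foo *f)
	{
		/*
		 * A single call tears the group down; no partial-unwind
		 * loop is needed because init_many is all-or-nothing.
		 */
		percpu_counter_destroy_many(f->stat, NR_STATS);
	}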

* tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu:
  kernel/fork: group allocation/free of per-cpu counters for mm struct
  pcpcntr: add group allocation/free
  mm/percpu.c: print error message too if atomic alloc failed
  mm/percpu.c: optimize the code in pcpu_setup_first_chunk() a little bit
  mm/percpu.c: remove redundant check
  mm/percpu: Remove some local variables in pcpu_populate_pte
parents 0fe2b86c 14ef95be
include/linux/percpu_counter.h: +34 −7
@@ -30,17 +30,28 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
 			       struct lock_class_key *key);
 
-#define percpu_counter_init(fbc, value, gfp)				\
+#define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
-		__percpu_counter_init(fbc, value, gfp, &__key);		\
+		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
+					   &__key);			\
 	})
 
-void percpu_counter_destroy(struct percpu_counter *fbc);
+#define percpu_counter_init(fbc, value, gfp)				\
+	percpu_counter_init_many(fbc, value, gfp, 1)
+
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
+static inline void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	percpu_counter_destroy_many(fbc, 1);
+}
 
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
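
A side note on the macro above: the GNU C statement-expression form
gives each percpu_counter_init_many() call site its own static
lock_class_key, so lockdep can classify the counters' locks per
allocation site. A standalone sketch of that pattern (illustrative
names only, not kernel API; builds with GCC/Clang, which support
statement expressions):

	#include <stdio.h>

	struct key { int id; };

	/*
	 * Each textual expansion declares its own block-scoped static
	 * object, so every call site yields a distinct key address.
	 */
	#define register_site()						\
		({							\
			static struct key __key;			\
			&__key;						\
		})

	int main(void)
	{
		struct key *a = register_site();
		struct key *b = register_site();

		printf("distinct call sites: %s\n", a != b ? "yes" : "no");
		return 0;
	}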
@@ -116,11 +127,27 @@ struct percpu_counter {
 	s64 count;
 };
 
+static inline int percpu_counter_init_many(struct percpu_counter *fbc,
+					   s64 amount, gfp_t gfp,
+					   u32 nr_counters)
+{
+	u32 i;
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].count = amount;
+
+	return 0;
+}
+
 static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
 				      gfp_t gfp)
 {
-	fbc->count = amount;
-	return 0;
+	return percpu_counter_init_many(fbc, amount, gfp, 1);
 }
 
+static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
+					       u32 nr_counters)
+{
+}
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
kernel/fork.c: +4 −11
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1309,8 +1304,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+				     NR_MM_COUNTERS))
 		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
@@ -1318,8 +1313,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
lib/percpu_counter.c: +43 −19
@@ -151,48 +151,72 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
 			       struct lock_class_key *key)
 {
 	unsigned long flags __maybe_unused;
+	size_t counter_size;
+	s32 __percpu *counters;
+	u32 i;
 
-	raw_spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu_gfp(s32, gfp);
-	if (!fbc->counters)
+	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
+	counters = __alloc_percpu_gfp(nr_counters * counter_size,
+				      __alignof__(*counters), gfp);
+	if (!counters) {
+		fbc[0].counters = NULL;
 		return -ENOMEM;
+	}
 
-	debug_percpu_counter_activate(fbc);
+	for (i = 0; i < nr_counters; i++) {
+		raw_spin_lock_init(&fbc[i].lock);
+		lockdep_set_class(&fbc[i].lock, key);
+#ifdef CONFIG_HOTPLUG_CPU
+		INIT_LIST_HEAD(&fbc[i].list);
+#endif
+		fbc[i].count = amount;
+		fbc[i].counters = (void *)counters + (i * counter_size);
+
+		debug_percpu_counter_activate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
-	INIT_LIST_HEAD(&fbc->list);
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_add(&fbc->list, &percpu_counters);
+	for (i = 0; i < nr_counters; i++)
+		list_add(&fbc[i].list, &percpu_counters);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(__percpu_counter_init_many);
 
-void percpu_counter_destroy(struct percpu_counter *fbc)
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
 {
 	unsigned long flags __maybe_unused;
+	u32 i;
+
+	if (WARN_ON_ONCE(!fbc))
+		return;
 
-	if (!fbc->counters)
+	if (!fbc[0].counters)
 		return;
 
-	debug_percpu_counter_deactivate(fbc);
+	for (i = 0; i < nr_counters; i++)
+		debug_percpu_counter_deactivate(&fbc[i]);
 
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_del(&fbc->list);
+	for (i = 0; i < nr_counters; i++)
+		list_del(&fbc[i].list);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
+
+	free_percpu(fbc[0].counters);
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].counters = NULL;
 }
-EXPORT_SYMBOL(percpu_counter_destroy);
+EXPORT_SYMBOL(percpu_counter_destroy_many);
 
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
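
The size computation above packs nr_counters s32 slots into one percpu
region: counter_size is sizeof(s32) rounded up to its alignment, and
counter i's slot lives at byte offset i * counter_size. A userspace
sketch of the same carving logic, with plain malloc() standing in for
the kernel's per-CPU allocator __alloc_percpu_gfp():

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Same rounding as the kernel's ALIGN() for power-of-two 'a'. */
	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		const uint32_t nr_counters = 4;
		size_t counter_size = ALIGN(sizeof(int32_t), _Alignof(int32_t));
		void *base = malloc(nr_counters * counter_size);

		if (!base)
			return 1;

		for (uint32_t i = 0; i < nr_counters; i++) {
			/* Counter i's slot, as in fbc[i].counters =
			 * (void *)counters + (i * counter_size).
			 */
			int32_t *slot = (int32_t *)((char *)base + i * counter_size);

			*slot = 0;
			printf("counter %u at offset %zu\n", i, i * counter_size);
		}

		free(base);
		return 0;
	}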
mm/percpu.c: +28 −41
@@ -1890,13 +1890,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 fail:
 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
 
-	if (!is_atomic && do_warn && warn_limit) {
+	if (do_warn && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
-		dump_stack();
+		if (!is_atomic)
+			dump_stack();
 		if (!--warn_limit)
 			pr_info("limit reached, disable warning\n");
 	}
+
 	if (is_atomic) {
 		/* see the flag handling in pcpu_balance_workfn() */
 		pcpu_atomic_alloc_failed = true;
@@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 {
 	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
 	size_t static_size, dyn_size;
-	struct pcpu_chunk *chunk;
 	unsigned long *group_offsets;
 	size_t *group_sizes;
 	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
-	int map_size;
 	unsigned long tmp_addr;
 	size_t alloc_size;
 
@@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
-	PCPU_SETUP_BUG_ON(!ai->dyn_size);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
 	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
 			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
@@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_atom_size = ai->atom_size;
-	pcpu_chunk_struct_size = struct_size(chunk, populated,
+	pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
 					     BITS_TO_LONGS(pcpu_unit_pages));
 
 	pcpu_stats_save_ai(ai);
@@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	dyn_size = ai->dyn_size - (static_size - ai->static_size);
 
 	/*
-	 * Initialize first chunk.
-	 * If the reserved_size is non-zero, this initializes the reserved
-	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
-	 * and the dynamic region is initialized here.  The first chunk,
-	 * pcpu_first_chunk, will always point to the chunk that serves
-	 * the dynamic region.
+	 * Initialize first chunk:
+	 * This chunk is broken up into 3 parts:
+	 *		< static | [reserved] | dynamic >
+	 * - static - there is no backing chunk because these allocations can
+	 *   never be freed.
+	 * - reserved (pcpu_reserved_chunk) - exists primarily to serve
+	 *   allocations from module load.
+	 * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
+	 *   chunk.
 	 */
 	tmp_addr = (unsigned long)base_addr + static_size;
-	map_size = ai->reserved_size ?: dyn_size;
-	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-
-	/* init dynamic chunk if necessary */
-	if (ai->reserved_size) {
-		pcpu_reserved_chunk = chunk;
-
-		tmp_addr = (unsigned long)base_addr + static_size +
-			   ai->reserved_size;
-		map_size = dyn_size;
-		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-	}
+	if (ai->reserved_size)
+		pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
+						ai->reserved_size);
+	tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
+	pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
 
-	/* link the first chunk in */
-	pcpu_first_chunk = chunk;
 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
@@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
 	pmd_t *pmd;
 
 	if (pgd_none(*pgd)) {
-		p4d_t *new;
-
-		new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
-		if (!new)
+		p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+		if (!p4d)
 			goto err_alloc;
-		pgd_populate(&init_mm, pgd, new);
+		pgd_populate(&init_mm, pgd, p4d);
 	}
 
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
-		pud_t *new;
-
-		new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
-		if (!new)
+		pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+		if (!pud)
 			goto err_alloc;
-		p4d_populate(&init_mm, p4d, new);
+		p4d_populate(&init_mm, p4d, pud);
 	}
 
 	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
-		pmd_t *new;
-
-		new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
-		if (!new)
+		pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+		if (!pmd)
 			goto err_alloc;
-		pud_populate(&init_mm, pud, new);
+		pud_populate(&init_mm, pud, pmd);
 	}
 
 	pmd = pmd_offset(pud, addr);