Commit af961f80 authored by Vlastimil Babka

Merge branch 'slab/for-6.1/slub_debug_waste' into slab/for-next

A patch from Feng Tang that enhances the existing debugfs alloc_traces
file for kmalloc caches with information about how much space is wasted
by allocations that need less space than the particular kmalloc cache
provides.
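
A rough usage sketch (editorial addition, not part of the commit): on a
kernel built with CONFIG_SLUB_DEBUG and booted with something like
slub_debug=U,kmalloc-* so that SLAB_STORE_USER is enabled for the kmalloc
caches, the new waste figures appear in the per-cache debugfs file; the
cache name below is only an example.

    # mount debugfs if it is not already mounted
    mount -t debugfs none /sys/kernel/debug
    # print the per-call-site summary lines, including the new
    # waste=<total>/<per-object> field, for the kmalloc-1k cache
    grep waste= /sys/kernel/debug/slab/kmalloc-1k/alloc_traces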
parents 0bdcef54 6edf2576
+21 −12
@@ -400,21 +400,30 @@ information:
    allocated objects. The output is sorted by frequency of each trace.

    Information in the output:
    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
    Number of objects, allocating function, possible memory wastage of
    kmalloc objects(total/per-object), minimal/average/maximal jiffies
    since alloc, pid range of the allocating processes, cpu mask of
    allocating cpus, numa node mask of origins of memory, and stack trace.

    Example:::

    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
	__slab_alloc+0x6d/0x90
	kmem_cache_alloc_trace+0x2eb/0x300
	populate_error_injection_list+0x97/0x110
	init_error_injection+0x1b/0x71
	do_one_initcall+0x5f/0x2d0
	kernel_init_freeable+0x26f/0x2d7
	kernel_init+0xe/0x118
	ret_from_fork+0x22/0x30

    338 pci_alloc_dev+0x2c/0xa0 waste=521872/1544 age=290837/291891/293509 pid=1 cpus=106 nodes=0-1
        __kmem_cache_alloc_node+0x11f/0x4e0
        kmalloc_trace+0x26/0xa0
        pci_alloc_dev+0x2c/0xa0
        pci_scan_single_device+0xd2/0x150
        pci_scan_slot+0xf7/0x2d0
        pci_scan_child_bus_extend+0x4e/0x360
        acpi_pci_root_create+0x32e/0x3b0
        pci_acpi_scan_root+0x2b9/0x2d0
        acpi_pci_root_add.cold.11+0x110/0xb0a
        acpi_bus_attach+0x262/0x3f0
        device_for_each_child+0xb7/0x110
        acpi_dev_for_each_child+0x77/0xa0
        acpi_bus_attach+0x108/0x3f0
        device_for_each_child+0xb7/0x110
        acpi_dev_for_each_child+0x77/0xa0
        acpi_bus_attach+0x108/0x3f0
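
    (Editorial note, not part of the documentation patch: in the new example
    above, waste=521872/1544 is total/per-object wastage, matching the
    " waste=%lu/%lu" format added to slab_debugfs_show() later in this diff:
    each of the 338 objects comes from a kmalloc cache whose object size is
    1544 bytes larger than the originally requested size, so
    338 * 1544 = 521872 bytes are wasted in total for this call site.)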

2. free_traces::

+2 −0
@@ -29,6 +29,8 @@
#define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
#define SLAB_POISON		((slab_flags_t __force)0x00000800U)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC		((slab_flags_t __force)0x00001000U)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
/* Use GFP_DMA memory */
+2 −1
@@ -661,7 +661,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags, useroffset, usersize);
	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
								usersize);
	kasan_cache_create_kmalloc(s);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
+117 −37
@@ -194,11 +194,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */

/* Structure holding parameters for get_partial() call chain */
struct partial_context {
	struct slab **slab;
	gfp_t flags;
	unsigned int orig_size;
};

static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}

static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
			(s->flags & SLAB_KMALLOC));
}

void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -790,6 +803,39 @@ static void print_slab_info(const struct slab *slab)
	       folio_flags(folio, 0));
}

/*
 * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
 * family will round up the real request size to these fixed ones, so
 * there could be an extra area than what is requested. Save the original
 * request size in the meta data area, for better debug and sanity check.
 */
static inline void set_orig_size(struct kmem_cache *s,
				void *object, unsigned int orig_size)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return;

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	*(unsigned int *)p = orig_size;
}

static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return s->object_size;

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	return *(unsigned int *)p;
}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
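
(Editorial sketch, not part of the diff: on a SLAB_KMALLOC cache with
SLAB_STORE_USER set, the helpers above index into the per-object debug
metadata, which is laid out roughly as follows; compare print_trailer()
and check_pad_bytes() further down in this patch.)

    object data / red zone / free pointer       <- up to get_info_end(s)
    struct track (alloc), struct track (free)   <- 2 * sizeof(struct track)
    unsigned int orig_size                       <- original kmalloc request size
    kasan metadata, then padding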
@@ -849,6 +895,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (slub_debug_orig_size(s))
		off += sizeof(unsigned int);

	off += kasan_metadata_size(s);

	if (off != size_from_object(s))
@@ -982,7 +1031,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 *	C. Padding to reach required alignment boundary or at minimum
 *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 *	D. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
@@ -1000,10 +1050,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned long off = get_info_end(s);	/* The end of info */

	if (s->flags & SLAB_STORE_USER)
	if (s->flags & SLAB_STORE_USER) {
		/* We also have user information there */
		off += 2 * sizeof(struct track);

		if (s->flags & SLAB_KMALLOC)
			off += sizeof(unsigned int);
	}

	off += kasan_metadata_size(s);

	if (size_from_object(s) == off)
@@ -1298,7 +1352,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
}

static noinline int alloc_debug_processing(struct kmem_cache *s,
					struct slab *slab, void *object)
			struct slab *slab, void *object, int orig_size)
{
	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!alloc_consistency_checks(s, slab, object))
@@ -1307,6 +1361,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,

	/* Success. Perform special debug activities for allocs */
	trace(s, slab, object, 1);
	set_orig_size(s, object, orig_size);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

@@ -1575,7 +1630,7 @@ static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object) { return 0; }
	struct slab *slab, void *object, int orig_size) { return 0; }

static inline void free_debug_processing(
	struct kmem_cache *s, struct slab *slab,
@@ -2004,7 +2059,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
 * it to full list if it was the last free object.
 */
static void *alloc_single_from_partial(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab)
		struct kmem_cache_node *n, struct slab *slab, int orig_size)
{
	void *object;

@@ -2014,7 +2069,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
	slab->freelist = get_freepointer(s, object);
	slab->inuse++;

	if (!alloc_debug_processing(s, slab, object)) {
	if (!alloc_debug_processing(s, slab, object, orig_size)) {
		remove_partial(n, slab);
		return NULL;
	}
@@ -2033,7 +2088,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
 * and put the slab to the partial (or full) list.
 */
static void *alloc_single_from_new_slab(struct kmem_cache *s,
					struct slab *slab)
					struct slab *slab, int orig_size)
{
	int nid = slab_nid(slab);
	struct kmem_cache_node *n = get_node(s, nid);
@@ -2045,7 +2100,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
	slab->freelist = get_freepointer(s, object);
	slab->inuse = 1;

	if (!alloc_debug_processing(s, slab, object))
	if (!alloc_debug_processing(s, slab, object, orig_size))
		/*
		 * It's not really expected that this would fail on a
		 * freshly allocated slab, but a concurrent memory
@@ -2123,7 +2178,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct slab **ret_slab, gfp_t gfpflags)
			      struct partial_context *pc)
{
	struct slab *slab, *slab2;
	void *object = NULL;
@@ -2143,11 +2198,12 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		if (!pfmemalloc_match(slab, gfpflags))
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (kmem_cache_debug(s)) {
			object = alloc_single_from_partial(s, n, slab);
			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object)
				break;
			continue;
@@ -2158,7 +2214,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			break;

		if (!object) {
			*ret_slab = slab;
			*pc->slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
@@ -2182,14 +2238,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 */
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
			     struct slab **ret_slab)
static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(flags);
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	void *object;
	unsigned int cpuset_mems_cookie;

@@ -2217,15 +2272,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, flags) &&
			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, ret_slab, flags);
				object = get_partial_node(s, n, pc);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
@@ -2246,8 +2301,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
/*
 * Get a partial slab, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
			 struct slab **ret_slab)
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
	void *object;
	int searchnode = node;
@@ -2255,11 +2309,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();

	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
	object = get_partial_node(s, get_node(s, searchnode), pc);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, ret_slab);
	return get_any_partial(s, pc);
}
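
(Editorial note, not part of the diff: struct partial_context, introduced
near the top of this patch, bundles the gfp flags, the output slab pointer
and the original request size, so orig_size can reach
alloc_debug_processing() without adding a parameter to every function in
the get_partial() call chain; the ___slab_alloc() hunk below shows the one
place where pc.flags, pc.slab and pc.orig_size are filled in.)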

#ifdef CONFIG_PREEMPTION
@@ -2980,11 +3034,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 * already disabled (which is the case for bulk allocation).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;

	stat(s, ALLOC_SLOWPATH);

@@ -3098,7 +3153,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,

new_objects:

	freelist = get_partial(s, gfpflags, node, &slab);
	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	freelist = get_partial(s, node, &pc);
	if (freelist)
		goto check_new_slab;

@@ -3114,7 +3172,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
	stat(s, ALLOC_SLAB);

	if (kmem_cache_debug(s)) {
		freelist = alloc_single_from_new_slab(s, slab);
		freelist = alloc_single_from_new_slab(s, slab, orig_size);

		if (unlikely(!freelist))
			goto new_objects;
@@ -3146,6 +3204,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
		 */
		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

@@ -3188,7 +3247,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 * pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

@@ -3201,7 +3260,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
	slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -3286,7 +3345,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c);
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

@@ -3753,7 +3812,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			 * of re-populating per CPU c->freelist
			 */
			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
					    _RET_IP_, c);
					    _RET_IP_, c, s->object_size);
			if (unlikely(!p[i]))
				goto error;

@@ -4156,12 +4215,17 @@ static int calculate_sizes(struct kmem_cache *s)
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER)
	if (flags & SLAB_STORE_USER) {
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

		/* Save the original kmalloc request size */
		if (flags & SLAB_KMALLOC)
			size += sizeof(unsigned int);
	}
#endif

	kasan_cache_create(s, &size, &s->flags);
@@ -4945,6 +5009,7 @@ struct location {
	depot_stack_handle_t handle;
	unsigned long count;
	unsigned long addr;
	unsigned long waste;
	long long sum_time;
	long min_time;
	long max_time;
@@ -4991,13 +5056,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}

static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track)
				const struct track *track,
				unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
@@ -5015,11 +5082,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
		if (pos == end)
			break;

		caddr = t->loc[pos].addr;
		chandle = t->loc[pos].handle;
		if ((track->addr == caddr) && (handle == chandle)) {

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
			(waste == cwaste)) {

			l->count++;
			if (track->when) {
				l->sum_time += age;
@@ -5044,6 +5113,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
				waste < cwaste)
			end = pos;
		else
			start = pos;
	}
@@ -5067,6 +5139,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
@@ -5079,13 +5152,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
		unsigned long *obj_map)
{
	void *addr = slab_address(slab);
	bool is_alloc = (alloc == TRACK_ALLOC);
	void *p;

	__fill_map(obj_map, s, slab);

	for_each_object(p, s, addr, slab->objects)
		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
			add_location(t, s, get_track(s, p, alloc));
			add_location(t, s, get_track(s, p, alloc),
				     is_alloc ? get_orig_size(s, p) :
						s->object_size);
}
#endif  /* CONFIG_DEBUG_FS   */
#endif	/* CONFIG_SLUB_DEBUG */
@@ -5956,6 +6032,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
		else
			seq_puts(seq, "<not-available>");

		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),