Commit 6edf2576 authored by Feng Tang, committed by Vlastimil Babka

mm/slub: enable debugging memory wasting of kmalloc

kmalloc's API family is critical for mm, and one of its characteristics
is that it rounds up the requested size to one of a set of fixed sizes
(mostly powers of 2). So when a user requests memory for '2^n + 1' bytes,
2^(n+1) bytes may actually be allocated, which means around 50% of the
memory can be wasted in the worst case.
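
To make the rounding concrete, here is a minimal user-space sketch of the
behaviour described above (round_up_pow2() and the 8-byte minimum bucket
are assumptions for illustration only; the real size-class selection is
done by kmalloc_index()/kmalloc_slab() inside the kernel):

    #include <stdio.h>

    /* illustration only: round a request up to the next power-of-2 bucket */
    static size_t round_up_pow2(size_t size)
    {
        size_t bucket = 8;    /* assumed smallest kmalloc bucket */

        while (bucket < size)
            bucket <<= 1;
        return bucket;
    }

    int main(void)
    {
        size_t reqs[] = { 1032, 2049, 2240 };

        for (int i = 0; i < 3; i++) {
            size_t got = round_up_pow2(reqs[i]);

            printf("request %zu -> bucket %zu, waste %zu bytes\n",
                   reqs[i], got, got - reqs[i]);
        }
        return 0;
    }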

The wastage is not a big issue for requests that are allocated and freed
quickly, but it may cause problems for objects with a longer lifetime.

We hit a kernel boot OOM panic (v5.10), and the dumped slab info showed:

    [   26.062145] kmalloc-2k            814056KB     814056KB

Debugging showed a huge number of 'struct iova_magazine' objects, whose
size is 1032 bytes (1024 + 8), so each allocation wastes 1016 bytes.
Though the issue was solved by provisioning the right (bigger) amount of
RAM, it is still worth optimizing the size (either by using a
kmalloc-friendly size or by creating a dedicated slab for it).
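
To put rough numbers on this particular case (assuming most of the
kmalloc-2k usage above comes from 'struct iova_magazine'):

    per-object waste:  2048 - 1032 = 1016 bytes (~49% of each 2KB slot)
    objects:           814056KB / 2KB ≈ 407028
    total waste:       407028 * 1016 bytes ≈ 394MB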

And the lkml archive has another crash-kernel OOM case [1] from back in
2019 which seems to be related to a similar slab-waste situation, as the
log looks alike:

    [    4.332648] iommu: Adding device 0000:20:02.0 to group 16
    [    4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0
    ...
    [    4.857565] kmalloc-2048           59164KB      59164KB

The crash kernel only has 256M of memory, and 59M is pretty big here.
(Note: the related code has been changed and optimised in recent kernels
[2]; these logs are only quoted to demonstrate the problem, and a patch
changing the size to 1024 bytes has also been merged.)

So add a way to track each kmalloc's memory waste info, and leverage the
existing SLUB debug framework (specifically SLAB_STORE_USER) to show the
call stack of the original allocation, so that users can evaluate the
waste situation, identify hot spots and optimize accordingly, for better
memory utilization.
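
Note that alloc_traces only carries the stack and waste data when
allocation tracking is active; one way to turn it on (just an example of
usage, not something this patch changes) is booting with the slub_debug
parameter, e.g.:

    slub_debug=U

which enables SLAB_STORE_USER for all caches (slub_debug also accepts a
comma-separated cache name list to restrict the flags to specific caches;
see the slub.rst kernel documentation for the exact syntax).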

The waste info is integrated into the existing interface
'/sys/kernel/debug/slab/kmalloc-xx/alloc_traces'; one example from
'kmalloc-4k' after boot is:

 126 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] waste=233856/1856 age=280763/281414/282065 pid=1330 cpus=32 nodes=1
     __kmem_cache_alloc_node+0x11f/0x4e0
     __kmalloc_node+0x4e/0x140
     ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe]
     ixgbe_init_interrupt_scheme+0x2ae/0xc90 [ixgbe]
     ixgbe_probe+0x165f/0x1d20 [ixgbe]
     local_pci_probe+0x78/0xc0
     work_for_cpu_fn+0x26/0x40
     ...

which means that in the 'kmalloc-4k' slab there are 126 requests of
2240 bytes each that got a 4KB slot (wasting 1856 bytes each and
233856 bytes in total), all from ixgbe_alloc_q_vector().
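
The two numbers after 'waste=' are the total and the per-object waste;
with the 2240-byte request size quoted above they work out as:

    per-object waste:  4096 - 2240 = 1856 bytes
    total waste:       126 * 1856  = 233856 bytes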

And when the system runs some real workload like multiple docker
instances, the waste can become even more severe.
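
As a quick way to skim a cache for its worst offenders, a user-space
sketch like the one below could be used (hypothetical helper, not part of
this patch; the default path and cache name are only examples, and both
debugfs and SLAB_STORE_USER have to be enabled):

    #include <stdio.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] :
            "/sys/kernel/debug/slab/kmalloc-4k/alloc_traces";
        char line[1024];
        FILE *f = fopen(path, "r");

        if (!f) {
            perror(path);
            return 1;
        }

        /* print only the trace header lines that report kmalloc waste */
        while (fgets(line, sizeof(line), f))
            if (strstr(line, " waste="))
                fputs(line, stdout);

        fclose(f);
        return 0;
    }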

[1]. https://lkml.org/lkml/2019/8/12/266
[2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/



[Thanks Hyeonggon for pointing out several bugs in the sorting/format]
[Thanks Vlastimil for suggesting a way to reduce the memory usage of
 orig_size and keep it only for kmalloc objects]

Signed-off-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
parent 1f04b07d
+21 −12
@@ -400,21 +400,30 @@ information:
    allocated objects. The output is sorted by frequency of each trace.

    Information in the output:
    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
    Number of objects, allocating function, possible memory wastage of
    kmalloc objects(total/per-object), minimal/average/maximal jiffies
    since alloc, pid range of the allocating processes, cpu mask of
    allocating cpus, numa node mask of origins of memory, and stack trace.

    Example:::

    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
	__slab_alloc+0x6d/0x90
	kmem_cache_alloc_trace+0x2eb/0x300
	populate_error_injection_list+0x97/0x110
	init_error_injection+0x1b/0x71
	do_one_initcall+0x5f/0x2d0
	kernel_init_freeable+0x26f/0x2d7
	kernel_init+0xe/0x118
	ret_from_fork+0x22/0x30

    338 pci_alloc_dev+0x2c/0xa0 waste=521872/1544 age=290837/291891/293509 pid=1 cpus=106 nodes=0-1
        __kmem_cache_alloc_node+0x11f/0x4e0
        kmalloc_trace+0x26/0xa0
        pci_alloc_dev+0x2c/0xa0
        pci_scan_single_device+0xd2/0x150
        pci_scan_slot+0xf7/0x2d0
        pci_scan_child_bus_extend+0x4e/0x360
        acpi_pci_root_create+0x32e/0x3b0
        pci_acpi_scan_root+0x2b9/0x2d0
        acpi_pci_root_add.cold.11+0x110/0xb0a
        acpi_bus_attach+0x262/0x3f0
        device_for_each_child+0xb7/0x110
        acpi_dev_for_each_child+0x77/0xa0
        acpi_bus_attach+0x108/0x3f0
        device_for_each_child+0xb7/0x110
        acpi_dev_for_each_child+0x77/0xa0
        acpi_bus_attach+0x108/0x3f0

2. free_traces::

+2 −0
@@ -29,6 +29,8 @@
#define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
#define SLAB_POISON		((slab_flags_t __force)0x00000800U)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC		((slab_flags_t __force)0x00001000U)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
/* Use GFP_DMA memory */
+2 −1
@@ -649,7 +649,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags, useroffset, usersize);
	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
								usersize);
	kasan_cache_create_kmalloc(s);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
+117 −37
@@ -194,11 +194,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */

/* Structure holding parameters for get_partial() call chain */
struct partial_context {
	struct slab **slab;
	gfp_t flags;
	unsigned int orig_size;
};

static inline bool kmem_cache_debug(struct kmem_cache *s)
{
	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}

static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
			(s->flags & SLAB_KMALLOC));
}

void *fixup_red_left(struct kmem_cache *s, void *p)
{
	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -785,6 +798,39 @@ static void print_slab_info(const struct slab *slab)
	       folio_flags(folio, 0));
}

/*
 * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
 * family will round up the real request size to these fixed ones, so
 * there could be an extra area than what is requested. Save the original
 * request size in the meta data area, for better debug and sanity check.
 */
static inline void set_orig_size(struct kmem_cache *s,
				void *object, unsigned int orig_size)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return;

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	*(unsigned int *)p = orig_size;
}

static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
{
	void *p = kasan_reset_tag(object);

	if (!slub_debug_orig_size(s))
		return s->object_size;

	p += get_info_end(s);
	p += sizeof(struct track) * 2;

	return *(unsigned int *)p;
}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
@@ -844,6 +890,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (slub_debug_orig_size(s))
		off += sizeof(unsigned int);

	off += kasan_metadata_size(s);

	if (off != size_from_object(s))
@@ -977,7 +1026,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 *	C. Padding to reach required alignment boundary or at minimum
 *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 *	D. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
@@ -995,10 +1045,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned long off = get_info_end(s);	/* The end of info */

	if (s->flags & SLAB_STORE_USER)
	if (s->flags & SLAB_STORE_USER) {
		/* We also have user information there */
		off += 2 * sizeof(struct track);

		if (s->flags & SLAB_KMALLOC)
			off += sizeof(unsigned int);
	}

	off += kasan_metadata_size(s);

	if (size_from_object(s) == off)
@@ -1293,7 +1347,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
}

static noinline int alloc_debug_processing(struct kmem_cache *s,
					struct slab *slab, void *object)
			struct slab *slab, void *object, int orig_size)
{
	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!alloc_consistency_checks(s, slab, object))
@@ -1302,6 +1356,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,

	/* Success. Perform special debug activities for allocs */
	trace(s, slab, object, 1);
	set_orig_size(s, object, orig_size);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

@@ -1570,7 +1625,7 @@ static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object) { return 0; }
	struct slab *slab, void *object, int orig_size) { return 0; }

static inline void free_debug_processing(
	struct kmem_cache *s, struct slab *slab,
@@ -2013,7 +2068,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
 * it to full list if it was the last free object.
 */
static void *alloc_single_from_partial(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab)
		struct kmem_cache_node *n, struct slab *slab, int orig_size)
{
	void *object;

@@ -2023,7 +2078,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
	slab->freelist = get_freepointer(s, object);
	slab->inuse++;

	if (!alloc_debug_processing(s, slab, object)) {
	if (!alloc_debug_processing(s, slab, object, orig_size)) {
		remove_partial(n, slab);
		return NULL;
	}
@@ -2042,7 +2097,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
 * and put the slab to the partial (or full) list.
 */
static void *alloc_single_from_new_slab(struct kmem_cache *s,
					struct slab *slab)
					struct slab *slab, int orig_size)
{
	int nid = slab_nid(slab);
	struct kmem_cache_node *n = get_node(s, nid);
@@ -2054,7 +2109,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
	slab->freelist = get_freepointer(s, object);
	slab->inuse = 1;

	if (!alloc_debug_processing(s, slab, object))
	if (!alloc_debug_processing(s, slab, object, orig_size))
		/*
		 * It's not really expected that this would fail on a
		 * freshly allocated slab, but a concurrent memory
@@ -2132,7 +2187,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct slab **ret_slab, gfp_t gfpflags)
			      struct partial_context *pc)
{
	struct slab *slab, *slab2;
	void *object = NULL;
@@ -2152,11 +2207,12 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		if (!pfmemalloc_match(slab, gfpflags))
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (kmem_cache_debug(s)) {
			object = alloc_single_from_partial(s, n, slab);
			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object)
				break;
			continue;
@@ -2167,7 +2223,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			break;

		if (!object) {
			*ret_slab = slab;
			*pc->slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
@@ -2191,14 +2247,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 */
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
			     struct slab **ret_slab)
static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(flags);
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	void *object;
	unsigned int cpuset_mems_cookie;

@@ -2226,15 +2281,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, flags) &&
			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, ret_slab, flags);
				object = get_partial_node(s, n, pc);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
@@ -2255,8 +2310,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
/*
 * Get a partial slab, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
			 struct slab **ret_slab)
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
	void *object;
	int searchnode = node;
@@ -2264,11 +2318,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();

	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
	object = get_partial_node(s, get_node(s, searchnode), pc);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, ret_slab);
	return get_any_partial(s, pc);
}

#ifdef CONFIG_PREEMPTION
@@ -2989,11 +3043,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 * already disabled (which is the case for bulk allocation).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;

	stat(s, ALLOC_SLOWPATH);

@@ -3107,7 +3162,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,

new_objects:

	freelist = get_partial(s, gfpflags, node, &slab);
	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	freelist = get_partial(s, node, &pc);
	if (freelist)
		goto check_new_slab;

@@ -3123,7 +3181,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
	stat(s, ALLOC_SLAB);

	if (kmem_cache_debug(s)) {
		freelist = alloc_single_from_new_slab(s, slab);
		freelist = alloc_single_from_new_slab(s, slab, orig_size);

		if (unlikely(!freelist))
			goto new_objects;
@@ -3155,6 +3213,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
		 */
		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

@@ -3197,7 +3256,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 * pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

@@ -3210,7 +3269,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
	slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -3295,7 +3354,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c);
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

@@ -3793,7 +3852,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			 * of re-populating per CPU c->freelist
			 */
			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
					    _RET_IP_, c);
					    _RET_IP_, c, s->object_size);
			if (unlikely(!p[i]))
				goto error;

@@ -4196,12 +4255,17 @@ static int calculate_sizes(struct kmem_cache *s)
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER)
	if (flags & SLAB_STORE_USER) {
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

		/* Save the original kmalloc request size */
		if (flags & SLAB_KMALLOC)
			size += sizeof(unsigned int);
	}
#endif

	kasan_cache_create(s, &size, &s->flags);
@@ -5146,6 +5210,7 @@ struct location {
	depot_stack_handle_t handle;
	unsigned long count;
	unsigned long addr;
	unsigned long waste;
	long long sum_time;
	long min_time;
	long max_time;
@@ -5192,13 +5257,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}

static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track)
				const struct track *track,
				unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
@@ -5216,11 +5283,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
		if (pos == end)
			break;

		caddr = t->loc[pos].addr;
		chandle = t->loc[pos].handle;
		if ((track->addr == caddr) && (handle == chandle)) {

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
			(waste == cwaste)) {

			l->count++;
			if (track->when) {
				l->sum_time += age;
@@ -5245,6 +5314,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
				waste < cwaste)
			end = pos;
		else
			start = pos;
	}
@@ -5268,6 +5340,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
@@ -5280,13 +5353,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
		unsigned long *obj_map)
{
	void *addr = slab_address(slab);
	bool is_alloc = (alloc == TRACK_ALLOC);
	void *p;

	__fill_map(obj_map, s, slab);

	for_each_object(p, s, addr, slab->objects)
		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
			add_location(t, s, get_track(s, p, alloc));
			add_location(t, s, get_track(s, p, alloc),
				     is_alloc ? get_orig_size(s, p) :
						s->object_size);
}
#endif  /* CONFIG_DEBUG_FS   */
#endif	/* CONFIG_SLUB_DEBUG */
@@ -6156,6 +6232,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
		else
			seq_puts(seq, "<not-available>");

		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),