Commit 45ff8b47 authored by Dave Chinner's avatar Dave Chinner Committed by Dave Chinner
Browse files

xfs: can't use kmem_zalloc() for attribute buffers



Because heap allocation of 64kB buffers will fail:

....
 XFS: fs_mark(8414) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8417) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8409) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8428) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8430) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8437) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8433) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8406) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8412) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8432) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8424) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
....

I'd use kvmalloc() instead, but....

- 48.19% xfs_attr_create_intent
  - 46.89% xfs_attri_init
     - kvmalloc_node
	- 46.04% __kmalloc_node
	   - kmalloc_large_node
	      - 45.99% __alloc_pages
		 - 39.39% __alloc_pages_slowpath.constprop.0
		    - 38.89% __alloc_pages_direct_compact
		       - 38.71% try_to_compact_pages
			  - compact_zone_order
			  - compact_zone
			     - 21.09% isolate_migratepages_block
				  10.31% PageHuge
				  5.82% set_pfnblock_flags_mask
				  0.86% get_pfnblock_flags_mask
			     - 4.48% __reset_isolation_suitable
				  4.44% __reset_isolation_pfn
			     - 3.56% __pageblock_pfn_to_page
				  1.33% pfn_to_online_page
			       2.83% get_pfnblock_flags_mask
			     - 0.87% migrate_pages
				  0.86% compaction_alloc
			       0.84% find_suitable_fallback
		 - 6.60% get_page_from_freelist
		      4.99% clear_page_erms
		    - 1.19% _raw_spin_lock_irqsave
		       - do_raw_spin_lock
			    __pv_queued_spin_lock_slowpath
	- 0.86% __vmalloc_node_range
	     0.65% __alloc_pages_bulk

.... this is just yet another reminder of how much kvmalloc() sucks.
So lift xlog_cil_kvmalloc(), rename it to xlog_kvmalloc() and use
that instead....

We also clean up the attribute name and value lengths as they no
longer need to be rounded out to sizes compatible with log vectors.

Signed-off-by: default avatarDave Chinner <dchinner@redhat.com>
Reviewed-by: default avatarAllison Henderson <allison.henderson@oracle.com>
Reviewed-by: default avatarDarrick J. Wong <djwong@kernel.org>
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parent 51e6104f
Loading
Loading
Loading
Loading
+15 −20
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ xfs_attri_item_free(
	struct xfs_attri_log_item	*attrip)
{
	kmem_free(attrip->attri_item.li_lv_shadow);
	kmem_free(attrip);
	kvfree(attrip);
}

/*
@@ -119,11 +119,11 @@ xfs_attri_item_format(
			sizeof(struct xfs_attri_log_format));
	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME,
			attrip->attri_name,
			xlog_calc_iovec_len(attrip->attri_name_len));
			attrip->attri_name_len);
	if (attrip->attri_value_len > 0)
		xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
				attrip->attri_value,
				xlog_calc_iovec_len(attrip->attri_value_len));
				attrip->attri_value_len);
}

/*
@@ -163,26 +163,21 @@ xfs_attri_init(

{
	struct xfs_attri_log_item	*attrip;
	uint32_t			name_vec_len = 0;
	uint32_t			value_vec_len = 0;
	uint32_t			buffer_size;

	if (name_len)
		name_vec_len = xlog_calc_iovec_len(name_len);
	if (value_len)
		value_vec_len = xlog_calc_iovec_len(value_len);

	buffer_size = name_vec_len + value_vec_len;
	uint32_t			buffer_size = name_len + value_len;

	if (buffer_size) {
		attrip = kmem_zalloc(sizeof(struct xfs_attri_log_item) +
				    buffer_size, KM_NOFS);
		if (attrip == NULL)
			return NULL;
		/*
		 * This could be over 64kB in length, so we have to use
		 * kvmalloc() for this. But kvmalloc() utterly sucks, so we
		 * use own version.
		 */
		attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) +
					buffer_size);
	} else {
		attrip = kmem_cache_zalloc(xfs_attri_cache,
		attrip = kmem_cache_alloc(xfs_attri_cache,
					GFP_NOFS | __GFP_NOFAIL);
	}
	memset(attrip, 0, sizeof(struct xfs_attri_log_item));

	attrip->attri_name_len = name_len;
	if (name_len)
@@ -195,7 +190,7 @@ xfs_attri_init(
	if (value_len)
		attrip->attri_value = ((char *)attrip) +
				sizeof(struct xfs_attri_log_item) +
				name_vec_len;
				name_len;
	else
		attrip->attri_value = NULL;

+1 −34
Original line number Diff line number Diff line
@@ -134,39 +134,6 @@ xlog_cil_iovec_space(
			sizeof(uint64_t));
}

/*
 * shadow buffers can be large, so we need to use kvmalloc() here to ensure
 * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
 * back to vmalloc, so we can't actually do anything useful with gfp flags to
 * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
 * direct reclaim and compaction in the slow path, both of which are
 * horrendously expensive. We just want kmalloc to fail fast and fall back to
 * vmalloc if it can't get somethign straight away from the free lists or buddy
 * allocator. Hence we have to open code kvmalloc outselves here.
 *
 * Also, we are in memalloc_nofs_save task context here, so despite the use of
 * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
 * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
 * just all pretend this is a GFP_KERNEL context operation....
 */
static inline void *
xlog_cil_kvmalloc(
	size_t		buf_size)
{
	gfp_t		flags = GFP_KERNEL;
	void		*p;

	flags &= ~__GFP_DIRECT_RECLAIM;
	flags |= __GFP_NOWARN | __GFP_NORETRY;
	do {
		p = kmalloc(buf_size, flags);
		if (!p)
			p = vmalloc(buf_size);
	} while (!p);

	return p;
}

/*
 * Allocate or pin log vector buffers for CIL insertion.
 *
@@ -283,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
			 * storage.
			 */
			kmem_free(lip->li_lv_shadow);
			lv = xlog_cil_kvmalloc(buf_size);
			lv = xlog_kvmalloc(buf_size);

			memset(lv, 0, xlog_cil_iovec_space(niovecs));

+34 −0
Original line number Diff line number Diff line
@@ -651,4 +651,38 @@ xlog_valid_lsn(
	return valid;
}

/*
 * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
 * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
 * to fall back to vmalloc, so we can't actually do anything useful with gfp
 * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
 * will do direct reclaim and compaction in the slow path, both of which are
 * horrendously expensive. We just want kmalloc to fail fast and fall back to
 * vmalloc if it can't get somethign straight away from the free lists or
 * buddy allocator. Hence we have to open code kvmalloc outselves here.
 *
 * This assumes that the caller uses memalloc_nofs_save task context here, so
 * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
 * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
 * allocations, so lets just all pretend this is a GFP_KERNEL context
 * operation....
 */
static inline void *
xlog_kvmalloc(
	size_t		buf_size)
{
	gfp_t		flags = GFP_KERNEL;
	void		*p;

	flags &= ~__GFP_DIRECT_RECLAIM;
	flags |= __GFP_NOWARN | __GFP_NORETRY;
	do {
		p = kmalloc(buf_size, flags);
		if (!p)
			p = vmalloc(buf_size);
	} while (!p);

	return p;
}

#endif	/* __XFS_LOG_PRIV_H__ */