Commit 8b9ed79c authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull x86 asm updates from Borislav Petkov:

 - Move the 32-bit memmove() asm implementation out-of-line in order to
   fix a 32-bit full LTO build failure with clang where it would fail at
   register allocation.

   Move it to an asm file and clean it up while at it, similar to what
   has been already done on 64-bit

* tag 'x86_asm_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mem: Move memmove to out of line assembler
parents fc4c9f45 bce5a1e8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
        lib-y += checksum_32.o
        lib-y += strstr_32.o
        lib-y += string_32.o
        lib-y += memmove_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
        lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
+0 −187
Original line number Diff line number Diff line
@@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count)
	return __memset(s, c, count);
}
EXPORT_SYMBOL(memset);

__visible void *memmove(void *dest, const void *src, size_t n)
{
	int d0,d1,d2,d3,d4,d5;
	char *ret = dest;

	__asm__ __volatile__(
		/* Handle more 16 bytes in loop */
		"cmp $0x10, %0\n\t"
		"jb	1f\n\t"

		/* Decide forward/backward copy mode */
		"cmp %2, %1\n\t"
		"jb	2f\n\t"

		/*
		 * movs instruction have many startup latency
		 * so we handle small size by general register.
		 */
		"cmp  $680, %0\n\t"
		"jb 3f\n\t"
		/*
		 * movs instruction is only good for aligned case.
		 */
		"mov %1, %3\n\t"
		"xor %2, %3\n\t"
		"and $0xff, %3\n\t"
		"jz 4f\n\t"
		"3:\n\t"
		"sub $0x10, %0\n\t"

		/*
		 * We gobble 16 bytes forward in each loop.
		 */
		"3:\n\t"
		"sub $0x10, %0\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov 1*4(%1), %4\n\t"
		"mov  %3, 0*4(%2)\n\t"
		"mov  %4, 1*4(%2)\n\t"
		"mov 2*4(%1), %3\n\t"
		"mov 3*4(%1), %4\n\t"
		"mov  %3, 2*4(%2)\n\t"
		"mov  %4, 3*4(%2)\n\t"
		"lea  0x10(%1), %1\n\t"
		"lea  0x10(%2), %2\n\t"
		"jae 3b\n\t"
		"add $0x10, %0\n\t"
		"jmp 1f\n\t"

		/*
		 * Handle data forward by movs.
		 */
		".p2align 4\n\t"
		"4:\n\t"
		"mov -4(%1, %0), %3\n\t"
		"lea -4(%2, %0), %4\n\t"
		"shr $2, %0\n\t"
		"rep movsl\n\t"
		"mov %3, (%4)\n\t"
		"jmp 11f\n\t"
		/*
		 * Handle data backward by movs.
		 */
		".p2align 4\n\t"
		"6:\n\t"
		"mov (%1), %3\n\t"
		"mov %2, %4\n\t"
		"lea -4(%1, %0), %1\n\t"
		"lea -4(%2, %0), %2\n\t"
		"shr $2, %0\n\t"
		"std\n\t"
		"rep movsl\n\t"
		"mov %3,(%4)\n\t"
		"cld\n\t"
		"jmp 11f\n\t"

		/*
		 * Start to prepare for backward copy.
		 */
		".p2align 4\n\t"
		"2:\n\t"
		"cmp  $680, %0\n\t"
		"jb 5f\n\t"
		"mov %1, %3\n\t"
		"xor %2, %3\n\t"
		"and $0xff, %3\n\t"
		"jz 6b\n\t"

		/*
		 * Calculate copy position to tail.
		 */
		"5:\n\t"
		"add %0, %1\n\t"
		"add %0, %2\n\t"
		"sub $0x10, %0\n\t"

		/*
		 * We gobble 16 bytes backward in each loop.
		 */
		"7:\n\t"
		"sub $0x10, %0\n\t"

		"mov -1*4(%1), %3\n\t"
		"mov -2*4(%1), %4\n\t"
		"mov  %3, -1*4(%2)\n\t"
		"mov  %4, -2*4(%2)\n\t"
		"mov -3*4(%1), %3\n\t"
		"mov -4*4(%1), %4\n\t"
		"mov  %3, -3*4(%2)\n\t"
		"mov  %4, -4*4(%2)\n\t"
		"lea  -0x10(%1), %1\n\t"
		"lea  -0x10(%2), %2\n\t"
		"jae 7b\n\t"
		/*
		 * Calculate copy position to head.
		 */
		"add $0x10, %0\n\t"
		"sub %0, %1\n\t"
		"sub %0, %2\n\t"

		/*
		 * Move data from 8 bytes to 15 bytes.
		 */
		".p2align 4\n\t"
		"1:\n\t"
		"cmp $8, %0\n\t"
		"jb 8f\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov 1*4(%1), %4\n\t"
		"mov -2*4(%1, %0), %5\n\t"
		"mov -1*4(%1, %0), %1\n\t"

		"mov  %3, 0*4(%2)\n\t"
		"mov  %4, 1*4(%2)\n\t"
		"mov  %5, -2*4(%2, %0)\n\t"
		"mov  %1, -1*4(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		 * Move data from 4 bytes to 7 bytes.
		 */
		".p2align 4\n\t"
		"8:\n\t"
		"cmp $4, %0\n\t"
		"jb 9f\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov -1*4(%1, %0), %4\n\t"
		"mov  %3, 0*4(%2)\n\t"
		"mov  %4, -1*4(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		 * Move data from 2 bytes to 3 bytes.
		 */
		".p2align 4\n\t"
		"9:\n\t"
		"cmp $2, %0\n\t"
		"jb 10f\n\t"
		"movw 0*2(%1), %%dx\n\t"
		"movw -1*2(%1, %0), %%bx\n\t"
		"movw %%dx, 0*2(%2)\n\t"
		"movw %%bx, -1*2(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		 * Move data for 1 byte.
		 */
		".p2align 4\n\t"
		"10:\n\t"
		"cmp $1, %0\n\t"
		"jb 11f\n\t"
		"movb (%1), %%cl\n\t"
		"movb %%cl, (%2)\n\t"
		".p2align 4\n\t"
		"11:"
		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
		  "=r" (d3),"=r" (d4), "=r"(d5)
		:"0" (n),
		 "1" (src),
		 "2" (dest)
		:"memory");

	return ret;

}
EXPORT_SYMBOL(memmove);
+200 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/export.h>

SYM_FUNC_START(memmove)
/*
 * void *memmove(void *dest_in, const void *src_in, size_t n)
 * -mregparm=3 passes these in registers:
 * dest_in: %eax
 * src_in: %edx
 * n: %ecx
 * See also: arch/x86/entry/calling.h for description of the calling convention.
 *
 * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
 * in %esi.
 */
.set dest_in, %eax
.set dest, %edi
.set src_in, %edx
.set src, %esi
.set n, %ecx
.set tmp0, %edx
.set tmp0w, %dx
.set tmp1, %ebx
.set tmp1w, %bx
.set tmp2, %eax
.set tmp3b, %cl

/*
 * Save all callee-saved registers, because this function is going to clobber
 * all of them:
 */
	pushl	%ebp
	movl	%esp, %ebp	// set standard frame pointer

	pushl	%ebx
	pushl	%edi
	pushl	%esi
	pushl	%eax		// save 'dest_in' parameter [eax] as the return value

	movl src_in, src
	movl dest_in, dest

	/* Handle more 16 bytes in loop */
	cmpl	$0x10, n
	jb	.Lmove_16B

	/* Decide forward/backward copy mode */
	cmpl	dest, src
	jb	.Lbackwards_header

	/*
	 * movs instruction have many startup latency
	 * so we handle small size by general register.
	 */
	cmpl	$680, n
	jb	.Ltoo_small_forwards
	/* movs instruction is only good for aligned case. */
	movl	src, tmp0
	xorl	dest, tmp0
	andl	$0xff, tmp0
	jz	.Lforward_movs
.Ltoo_small_forwards:
	subl	$0x10, n

	/* We gobble 16 bytes forward in each loop. */
.Lmove_16B_forwards_loop:
	subl	$0x10, n
	movl	0*4(src), tmp0
	movl	1*4(src), tmp1
	movl	tmp0, 0*4(dest)
	movl	tmp1, 1*4(dest)
	movl	2*4(src), tmp0
	movl	3*4(src), tmp1
	movl	tmp0, 2*4(dest)
	movl	tmp1, 3*4(dest)
	leal	0x10(src), src
	leal	0x10(dest), dest
	jae	.Lmove_16B_forwards_loop
	addl	$0x10, n
	jmp	.Lmove_16B

	/* Handle data forward by movs. */
.p2align 4
.Lforward_movs:
	movl	-4(src, n), tmp0
	leal	-4(dest, n), tmp1
	shrl	$2, n
	rep	movsl
	movl	tmp0, (tmp1)
	jmp	.Ldone

	/* Handle data backward by movs. */
.p2align 4
.Lbackwards_movs:
	movl	(src), tmp0
	movl	dest, tmp1
	leal	-4(src, n), src
	leal	-4(dest, n), dest
	shrl	$2, n
	std
	rep	movsl
	movl	tmp0,(tmp1)
	cld
	jmp	.Ldone

	/* Start to prepare for backward copy. */
.p2align 4
.Lbackwards_header:
	cmpl	$680, n
	jb	.Ltoo_small_backwards
	movl	src, tmp0
	xorl	dest, tmp0
	andl	$0xff, tmp0
	jz	.Lbackwards_movs

	/* Calculate copy position to tail. */
.Ltoo_small_backwards:
	addl	n, src
	addl	n, dest
	subl	$0x10, n

	/* We gobble 16 bytes backward in each loop. */
.Lmove_16B_backwards_loop:
	subl	$0x10, n

	movl	-1*4(src), tmp0
	movl	-2*4(src), tmp1
	movl	tmp0, -1*4(dest)
	movl	tmp1, -2*4(dest)
	movl	-3*4(src), tmp0
	movl	-4*4(src), tmp1
	movl	tmp0, -3*4(dest)
	movl	tmp1, -4*4(dest)
	leal	-0x10(src), src
	leal	-0x10(dest), dest
	jae	.Lmove_16B_backwards_loop
	/* Calculate copy position to head. */
	addl	$0x10, n
	subl	n, src
	subl	n, dest

	/* Move data from 8 bytes to 15 bytes. */
.p2align 4
.Lmove_16B:
	cmpl	$8, n
	jb	.Lmove_8B
	movl	0*4(src), tmp0
	movl	1*4(src), tmp1
	movl	-2*4(src, n), tmp2
	movl	-1*4(src, n), src

	movl	tmp0, 0*4(dest)
	movl	tmp1, 1*4(dest)
	movl	tmp2, -2*4(dest, n)
	movl	src, -1*4(dest, n)
	jmp	.Ldone

	/* Move data from 4 bytes to 7 bytes. */
.p2align 4
.Lmove_8B:
	cmpl	$4, n
	jb	.Lmove_4B
	movl	0*4(src), tmp0
	movl	-1*4(src, n), tmp1
	movl	tmp0, 0*4(dest)
	movl	tmp1, -1*4(dest, n)
	jmp	.Ldone

	/* Move data from 2 bytes to 3 bytes. */
.p2align 4
.Lmove_4B:
	cmpl	$2, n
	jb	.Lmove_1B
	movw	0*2(src), tmp0w
	movw	-1*2(src, n), tmp1w
	movw	tmp0w, 0*2(dest)
	movw	tmp1w, -1*2(dest, n)
	jmp	.Ldone

	/* Move data for 1 byte. */
.p2align 4
.Lmove_1B:
	cmpl	$1, n
	jb	.Ldone
	movb	(src), tmp3b
	movb	tmp3b, (dest)
.p2align 4
.Ldone:
	popl	dest_in	// restore 'dest_in' [eax] as the return value
	/* Restore all callee-saved registers: */
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	RET
SYM_FUNC_END(memmove)
EXPORT_SYMBOL(memmove)
+22 −0
Original line number Diff line number Diff line
@@ -105,6 +105,8 @@ static void memcpy_test(struct kunit *test)
#undef TEST_OP
}

static unsigned char larger_array [2048];

static void memmove_test(struct kunit *test)
{
#define TEST_OP "memmove"
@@ -179,6 +181,26 @@ static void memmove_test(struct kunit *test)
	ptr = &overlap.data[2];
	memmove(ptr, overlap.data, 5);
	compare("overlapping write", overlap, overlap_expected);

	/* Verify larger overlapping moves. */
	larger_array[256] = 0xAAu;
	/*
	 * Test a backwards overlapping memmove first. 256 and 1024 are
	 * important for i386 to use rep movsl.
	 */
	memmove(larger_array, larger_array + 256, 1024);
	KUNIT_ASSERT_EQ(test, larger_array[0], 0xAAu);
	KUNIT_ASSERT_EQ(test, larger_array[256], 0x00);
	KUNIT_ASSERT_NULL(test,
		memchr(larger_array + 1, 0xaa, ARRAY_SIZE(larger_array) - 1));
	/* Test a forwards overlapping memmove. */
	larger_array[0] = 0xBBu;
	memmove(larger_array + 256, larger_array, 1024);
	KUNIT_ASSERT_EQ(test, larger_array[0], 0xBBu);
	KUNIT_ASSERT_EQ(test, larger_array[256], 0xBBu);
	KUNIT_ASSERT_NULL(test, memchr(larger_array + 1, 0xBBu, 256 - 1));
	KUNIT_ASSERT_NULL(test,
		memchr(larger_array + 257, 0xBBu, ARRAY_SIZE(larger_array) - 257));
#undef TEST_OP
}