Merge tag 'x86_asm_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (8b9ed79c) · Commits · EulixOS / Software / Kernel

arch/x86/lib/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
		lib-y += checksum_32.o
		lib-y += strstr_32.o
		lib-y += string_32.o
		lib-y += memmove_32.o
		ifneq ($(CONFIG_X86_CMPXCHG64),y)
		lib-y += cmpxchg8b_emu.o atomic64_386_32.o
		endif

arch/x86/lib/memcpy_32.c

+0 −187

Original line number	Diff line number	Diff line
		@@ -17,190 +17,3 @@ __visible void memset(void s, int c, size_t count)
		return __memset(s, c, count);
		}
		EXPORT_SYMBOL(memset);

		__visible void memmove(void dest, const void *src, size_t n)
		{
		int d0,d1,d2,d3,d4,d5;
		char *ret = dest;

		__asm__ __volatile__(
		/* Handle more 16 bytes in loop */
		"cmp $0x10, %0\n\t"
		"jb 1f\n\t"

		/* Decide forward/backward copy mode */
		"cmp %2, %1\n\t"
		"jb 2f\n\t"

		/*
		* movs instruction have many startup latency
		* so we handle small size by general register.
		*/
		"cmp $680, %0\n\t"
		"jb 3f\n\t"
		/*
		* movs instruction is only good for aligned case.
		*/
		"mov %1, %3\n\t"
		"xor %2, %3\n\t"
		"and $0xff, %3\n\t"
		"jz 4f\n\t"
		"3:\n\t"
		"sub $0x10, %0\n\t"

		/*
		* We gobble 16 bytes forward in each loop.
		*/
		"3:\n\t"
		"sub $0x10, %0\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov 1*4(%1), %4\n\t"
		"mov %3, 0*4(%2)\n\t"
		"mov %4, 1*4(%2)\n\t"
		"mov 2*4(%1), %3\n\t"
		"mov 3*4(%1), %4\n\t"
		"mov %3, 2*4(%2)\n\t"
		"mov %4, 3*4(%2)\n\t"
		"lea 0x10(%1), %1\n\t"
		"lea 0x10(%2), %2\n\t"
		"jae 3b\n\t"
		"add $0x10, %0\n\t"
		"jmp 1f\n\t"

		/*
		* Handle data forward by movs.
		*/
		".p2align 4\n\t"
		"4:\n\t"
		"mov -4(%1, %0), %3\n\t"
		"lea -4(%2, %0), %4\n\t"
		"shr $2, %0\n\t"
		"rep movsl\n\t"
		"mov %3, (%4)\n\t"
		"jmp 11f\n\t"
		/*
		* Handle data backward by movs.
		*/
		".p2align 4\n\t"
		"6:\n\t"
		"mov (%1), %3\n\t"
		"mov %2, %4\n\t"
		"lea -4(%1, %0), %1\n\t"
		"lea -4(%2, %0), %2\n\t"
		"shr $2, %0\n\t"
		"std\n\t"
		"rep movsl\n\t"
		"mov %3,(%4)\n\t"
		"cld\n\t"
		"jmp 11f\n\t"

		/*
		* Start to prepare for backward copy.
		*/
		".p2align 4\n\t"
		"2:\n\t"
		"cmp $680, %0\n\t"
		"jb 5f\n\t"
		"mov %1, %3\n\t"
		"xor %2, %3\n\t"
		"and $0xff, %3\n\t"
		"jz 6b\n\t"

		/*
		* Calculate copy position to tail.
		*/
		"5:\n\t"
		"add %0, %1\n\t"
		"add %0, %2\n\t"
		"sub $0x10, %0\n\t"

		/*
		* We gobble 16 bytes backward in each loop.
		*/
		"7:\n\t"
		"sub $0x10, %0\n\t"

		"mov -1*4(%1), %3\n\t"
		"mov -2*4(%1), %4\n\t"
		"mov %3, -1*4(%2)\n\t"
		"mov %4, -2*4(%2)\n\t"
		"mov -3*4(%1), %3\n\t"
		"mov -4*4(%1), %4\n\t"
		"mov %3, -3*4(%2)\n\t"
		"mov %4, -4*4(%2)\n\t"
		"lea -0x10(%1), %1\n\t"
		"lea -0x10(%2), %2\n\t"
		"jae 7b\n\t"
		/*
		* Calculate copy position to head.
		*/
		"add $0x10, %0\n\t"
		"sub %0, %1\n\t"
		"sub %0, %2\n\t"

		/*
		* Move data from 8 bytes to 15 bytes.
		*/
		".p2align 4\n\t"
		"1:\n\t"
		"cmp $8, %0\n\t"
		"jb 8f\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov 1*4(%1), %4\n\t"
		"mov -2*4(%1, %0), %5\n\t"
		"mov -1*4(%1, %0), %1\n\t"

		"mov %3, 0*4(%2)\n\t"
		"mov %4, 1*4(%2)\n\t"
		"mov %5, -2*4(%2, %0)\n\t"
		"mov %1, -1*4(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		* Move data from 4 bytes to 7 bytes.
		*/
		".p2align 4\n\t"
		"8:\n\t"
		"cmp $4, %0\n\t"
		"jb 9f\n\t"
		"mov 0*4(%1), %3\n\t"
		"mov -1*4(%1, %0), %4\n\t"
		"mov %3, 0*4(%2)\n\t"
		"mov %4, -1*4(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		* Move data from 2 bytes to 3 bytes.
		*/
		".p2align 4\n\t"
		"9:\n\t"
		"cmp $2, %0\n\t"
		"jb 10f\n\t"
		"movw 0*2(%1), %%dx\n\t"
		"movw -1*2(%1, %0), %%bx\n\t"
		"movw %%dx, 0*2(%2)\n\t"
		"movw %%bx, -1*2(%2, %0)\n\t"
		"jmp 11f\n\t"

		/*
		* Move data for 1 byte.
		*/
		".p2align 4\n\t"
		"10:\n\t"
		"cmp $1, %0\n\t"
		"jb 11f\n\t"
		"movb (%1), %%cl\n\t"
		"movb %%cl, (%2)\n\t"
		".p2align 4\n\t"
		"11:"
		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
		"=r" (d3),"=r" (d4), "=r"(d5)
		:"0" (n),
		"1" (src),
		"2" (dest)
		:"memory");

		return ret;

		}
		EXPORT_SYMBOL(memmove);

arch/x86/lib/memmove_32.S

0 → 100644

+200 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */

		#include <linux/linkage.h>
		#include <asm/export.h>

		SYM_FUNC_START(memmove)
		/*
		* void memmove(void dest_in, const void *src_in, size_t n)
		* -mregparm=3 passes these in registers:
		* dest_in: %eax
		* src_in: %edx
		* n: %ecx
		* See also: arch/x86/entry/calling.h for description of the calling convention.
		*
		* n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
		* in %esi.
		*/
		.set dest_in, %eax
		.set dest, %edi
		.set src_in, %edx
		.set src, %esi
		.set n, %ecx
		.set tmp0, %edx
		.set tmp0w, %dx
		.set tmp1, %ebx
		.set tmp1w, %bx
		.set tmp2, %eax
		.set tmp3b, %cl

		/*
		* Save all callee-saved registers, because this function is going to clobber
		* all of them:
		*/
		pushl %ebp
		movl %esp, %ebp // set standard frame pointer

		pushl %ebx
		pushl %edi
		pushl %esi
		pushl %eax // save 'dest_in' parameter [eax] as the return value

		movl src_in, src
		movl dest_in, dest

		/* Handle more 16 bytes in loop */
		cmpl $0x10, n
		jb .Lmove_16B

		/* Decide forward/backward copy mode */
		cmpl dest, src
		jb .Lbackwards_header

		/*
		* movs instruction have many startup latency
		* so we handle small size by general register.
		*/
		cmpl $680, n
		jb .Ltoo_small_forwards
		/* movs instruction is only good for aligned case. */
		movl src, tmp0
		xorl dest, tmp0
		andl $0xff, tmp0
		jz .Lforward_movs
		.Ltoo_small_forwards:
		subl $0x10, n

		/* We gobble 16 bytes forward in each loop. */
		.Lmove_16B_forwards_loop:
		subl $0x10, n
		movl 0*4(src), tmp0
		movl 1*4(src), tmp1
		movl tmp0, 0*4(dest)
		movl tmp1, 1*4(dest)
		movl 2*4(src), tmp0
		movl 3*4(src), tmp1
		movl tmp0, 2*4(dest)
		movl tmp1, 3*4(dest)
		leal 0x10(src), src
		leal 0x10(dest), dest
		jae .Lmove_16B_forwards_loop
		addl $0x10, n
		jmp .Lmove_16B

		/* Handle data forward by movs. */
		.p2align 4
		.Lforward_movs:
		movl -4(src, n), tmp0
		leal -4(dest, n), tmp1
		shrl $2, n
		rep movsl
		movl tmp0, (tmp1)
		jmp .Ldone

		/* Handle data backward by movs. */
		.p2align 4
		.Lbackwards_movs:
		movl (src), tmp0
		movl dest, tmp1
		leal -4(src, n), src
		leal -4(dest, n), dest
		shrl $2, n
		std
		rep movsl
		movl tmp0,(tmp1)
		cld
		jmp .Ldone

		/* Start to prepare for backward copy. */
		.p2align 4
		.Lbackwards_header:
		cmpl $680, n
		jb .Ltoo_small_backwards
		movl src, tmp0
		xorl dest, tmp0
		andl $0xff, tmp0
		jz .Lbackwards_movs

		/* Calculate copy position to tail. */
		.Ltoo_small_backwards:
		addl n, src
		addl n, dest
		subl $0x10, n

		/* We gobble 16 bytes backward in each loop. */
		.Lmove_16B_backwards_loop:
		subl $0x10, n

		movl -1*4(src), tmp0
		movl -2*4(src), tmp1
		movl tmp0, -1*4(dest)
		movl tmp1, -2*4(dest)
		movl -3*4(src), tmp0
		movl -4*4(src), tmp1
		movl tmp0, -3*4(dest)
		movl tmp1, -4*4(dest)
		leal -0x10(src), src
		leal -0x10(dest), dest
		jae .Lmove_16B_backwards_loop
		/* Calculate copy position to head. */
		addl $0x10, n
		subl n, src
		subl n, dest

		/* Move data from 8 bytes to 15 bytes. */
		.p2align 4
		.Lmove_16B:
		cmpl $8, n
		jb .Lmove_8B
		movl 0*4(src), tmp0
		movl 1*4(src), tmp1
		movl -2*4(src, n), tmp2
		movl -1*4(src, n), src

		movl tmp0, 0*4(dest)
		movl tmp1, 1*4(dest)
		movl tmp2, -2*4(dest, n)
		movl src, -1*4(dest, n)
		jmp .Ldone

		/* Move data from 4 bytes to 7 bytes. */
		.p2align 4
		.Lmove_8B:
		cmpl $4, n
		jb .Lmove_4B
		movl 0*4(src), tmp0
		movl -1*4(src, n), tmp1
		movl tmp0, 0*4(dest)
		movl tmp1, -1*4(dest, n)
		jmp .Ldone

		/* Move data from 2 bytes to 3 bytes. */
		.p2align 4
		.Lmove_4B:
		cmpl $2, n
		jb .Lmove_1B
		movw 0*2(src), tmp0w
		movw -1*2(src, n), tmp1w
		movw tmp0w, 0*2(dest)
		movw tmp1w, -1*2(dest, n)
		jmp .Ldone

		/* Move data for 1 byte. */
		.p2align 4
		.Lmove_1B:
		cmpl $1, n
		jb .Ldone
		movb (src), tmp3b
		movb tmp3b, (dest)
		.p2align 4
		.Ldone:
		popl dest_in // restore 'dest_in' [eax] as the return value
		/* Restore all callee-saved registers: */
		popl %esi
		popl %edi
		popl %ebx
		popl %ebp

		RET
		SYM_FUNC_END(memmove)
		EXPORT_SYMBOL(memmove)

lib/memcpy_kunit.c

+22 −0

Original line number	Diff line number	Diff line
		@@ -105,6 +105,8 @@ static void memcpy_test(struct kunit *test)
		#undef TEST_OP
		}

		static unsigned char larger_array [2048];

		static void memmove_test(struct kunit *test)
		{
		#define TEST_OP "memmove"
		@@ -179,6 +181,26 @@ static void memmove_test(struct kunit *test)
		ptr = &overlap.data[2];
		memmove(ptr, overlap.data, 5);
		compare("overlapping write", overlap, overlap_expected);

		/* Verify larger overlapping moves. */
		larger_array[256] = 0xAAu;
		/*
		* Test a backwards overlapping memmove first. 256 and 1024 are
		* important for i386 to use rep movsl.
		*/
		memmove(larger_array, larger_array + 256, 1024);
		KUNIT_ASSERT_EQ(test, larger_array[0], 0xAAu);
		KUNIT_ASSERT_EQ(test, larger_array[256], 0x00);
		KUNIT_ASSERT_NULL(test,
		memchr(larger_array + 1, 0xaa, ARRAY_SIZE(larger_array) - 1));
		/* Test a forwards overlapping memmove. */
		larger_array[0] = 0xBBu;
		memmove(larger_array + 256, larger_array, 1024);
		KUNIT_ASSERT_EQ(test, larger_array[0], 0xBBu);
		KUNIT_ASSERT_EQ(test, larger_array[256], 0xBBu);
		KUNIT_ASSERT_NULL(test, memchr(larger_array + 1, 0xBBu, 256 - 1));
		KUNIT_ASSERT_NULL(test,
		memchr(larger_array + 257, 0xBBu, ARRAY_SIZE(larger_array) - 257));
		#undef TEST_OP
		}