arm64: Import latest memcpy()/memmove() implementation (28513304) · Commits · EulixOS / Software / Kernel

arch/arm64/lib/Makefile

+1 −1

Original line number	Diff line number	Diff line
		# SPDX-License-Identifier: GPL-2.0
		lib-y := clear_user.o delay.o copy_from_user.o \
		copy_to_user.o copy_in_user.o copy_page.o \
		clear_page.o csum.o memchr.o memcpy.o memmove.o \
		clear_page.o csum.o memchr.o memcpy.o \
		memset.o memcmp.o strcmp.o strncmp.o strlen.o \
		strnlen.o strchr.o strrchr.o tishift.o

arch/arm64/lib/memcpy.S

+229 −43

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-only */
		/*
		* Copyright (C) 2013 ARM Ltd.
		* Copyright (C) 2013 Linaro.
		* Copyright (c) 2012-2020, Arm Limited.
		*
		* This code is based on glibc cortex strings work originally authored by
		* Linaro; the original code can be found @
		*
		* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		* files/head:/src/aarch64/
		* Adapted from the original at:
		* https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
		*/

		#include <linux/linkage.h>
		#include <asm/assembler.h>
		#include <asm/cache.h>

		/*
		* Copy a buffer from src to dest (alignment handled by the hardware)
		*
		* Assumptions:
		*
		* ARMv8-a, AArch64, unaligned accesses.
		*
		* Parameters:
		* x0 - dest
		* x1 - src
		* x2 - n
		* Returns:
		* x0 - dest
		*/
		/* Load one byte into reg from [ptr], then advance ptr by val (post-indexed). */
		.macro ldrb1 reg, ptr, val
		ldrb \reg, [\ptr], \val
		.endm

		/* Store the low byte of reg to [ptr], then advance ptr by val (post-indexed). */
		.macro strb1 reg, ptr, val
		strb \reg, [\ptr], \val
		.endm

		/* Load one halfword into reg from [ptr], then advance ptr by val (post-indexed). */
		.macro ldrh1 reg, ptr, val
		ldrh \reg, [\ptr], \val
		.endm
		/* L(name) pastes to the assembler-local label .Lname. */
		#define L(label) .L ## label

		/* Store the low halfword of reg to [ptr], then advance ptr by val (post-indexed). */
		.macro strh1 reg, ptr, val
		strh \reg, [\ptr], \val
		.endm
		/* Incoming arguments: x0 = dest, x1 = src, x2 = byte count. */
		#define dstin x0
		#define src x1
		#define count x2
		/* Working destination pointer and one-past-the-end pointers. */
		#define dst x3
		#define srcend x4
		#define dstend x5
		/*
		* Data temporaries used as ldp/stp pairs (_l/_h); the _lw names are the
		* 32-bit views of the corresponding _l register.
		*/
		#define A_l x6
		#define A_lw w6
		#define A_h x7
		#define B_l x8
		#define B_lw w8
		#define B_h x9
		#define C_l x10
		#define C_lw w10
		#define C_h x11
		#define D_l x12
		#define D_h x13
		#define E_l x14
		#define E_h x15
		#define F_l x16
		#define F_h x17
		/*
		* G and H do not get registers of their own: they alias count, dst, src
		* and srcend, so loading into them overwrites those values. They may only
		* be used once the aliased value is no longer needed.
		*/
		#define G_l count
		#define G_h dst
		#define H_l src
		#define H_h srcend
		/* Scratch register; shares x14 with E_l, so the two must not be live together. */
		#define tmp1 x14

		/* Load reg from [ptr], then advance ptr by val (post-indexed). */
		.macro ldr1 reg, ptr, val
		ldr \reg, [\ptr], \val
		.endm
		/* This implementation handles overlaps and supports both memcpy and memmove
		from a single entry point. It uses unaligned accesses and branchless
		sequences to keep the code small, simple and improve performance.

		.macro str1 reg, ptr, val
		str \reg, [\ptr], \val
		.endm
		Copies are split into 3 main cases: small copies of up to 32 bytes, medium
		copies of up to 128 bytes, and large copies. The overhead of the overlap
		check is negligible since it is only required for large copies.

		.macro ldp1 reg1, reg2, ptr, val
		ldp \reg1, \reg2, [\ptr], \val
		.endm

		.macro stp1 reg1, reg2, ptr, val
		stp \reg1, \reg2, [\ptr], \val
		.endm
		Large copies use a software pipelined loop processing 64 bytes per iteration.
		The destination pointer is 16-byte aligned to minimize unaligned accesses.
		The loop tail is handled by always copying 64 bytes from the end.
		*/

		/*
		* Entry point shared by memmove/__memmove and memcpy/__memcpy.
		* In: x0 = dstin, x1 = src, x2 = count.
		* Nothing in this entry/small-copy path writes dstin, so x0 still holds
		* dest at the ret below.
		*/
		SYM_FUNC_START_ALIAS(__memmove)
		SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
		SYM_FUNC_START_ALIAS(__memcpy)
		SYM_FUNC_START_WEAK_PI(memcpy)
		/*
		* NOTE(review): copy_template.S is not visible here. Confirm whether this
		* include is still intended next to the inline implementation that follows.
		*/
		#include "copy_template.S"
		/* One-past-the-end pointers, used to address data relative to the buffer ends. */
		add srcend, src, count
		add dstend, dstin, count
		/* Dispatch on size (unsigned compares): > 128 is long, 33..128 is medium. */
		cmp count, 128
		b.hi L(copy_long)
		cmp count, 32
		b.hi L(copy32_128)

		/* Small copies: 0..32 bytes. */
		cmp count, 16
		b.lo L(copy16)
		/*
		* 16..32 bytes: copy the first 16 and the last 16 bytes. The two windows
		* overlap when count < 32. Both loads are issued before either store, so
		* the result is correct even if the buffers overlap.
		*/
		ldp A_l, A_h, [src]
		ldp D_l, D_h, [srcend, -16]
		stp A_l, A_h, [dstin]
		stp D_l, D_h, [dstend, -16]
		ret

		/*
		* Reached with count < 16. If bit 3 of count is clear, count < 8 and the
		* smaller cases below take over; otherwise copy 8-15 bytes as the first
		* and last 8 bytes (the two accesses overlap when count < 16).
		*/
		L(copy16):
		tbz count, 3, L(copy8)
		ldr A_l, [src]
		ldr A_h, [srcend, -8]
		str A_l, [dstin]
		str A_h, [dstend, -8]
		ret

		.p2align 3
		/*
		* Copy 4-7 bytes as the first and last 4 bytes. Bit 2 of count clear
		* means count < 4.
		*/
		L(copy8):
		tbz count, 2, L(copy4)
		ldr A_lw, [src]
		ldr B_lw, [srcend, -4]
		str A_lw, [dstin]
		str B_lw, [dstend, -4]
		ret

		/*
		* Copy 0..3 bytes using a branchless sequence.
		* tmp1 = count / 2 is 0 for count == 1 and 1 for count == 2 or 3, so the
		* three byte accesses (first, middle, last) cover every byte for any
		* count in 1..3; some of them hit the same byte when count < 3.
		* All three loads precede the stores.
		*/
		L(copy4):
		cbz count, L(copy0)
		lsr tmp1, count, 1
		ldrb A_lw, [src]
		ldrb C_lw, [srcend, -1]
		ldrb B_lw, [src, tmp1]
		strb A_lw, [dstin]
		strb B_lw, [dstin, tmp1]
		strb C_lw, [dstend, -1]
		/* Nothing left to do; also the target for count == 0 and for dst == src in copy_long. */
		L(copy0):
		ret

		.p2align 4
		/*
		* Medium copies: 33..128 bytes.
		* Load the first 32 and the last 32 bytes up front. For count <= 64 these
		* two windows already cover the whole buffer.
		*/
		L(copy32_128):
		ldp A_l, A_h, [src]
		ldp B_l, B_h, [src, 16]
		ldp C_l, C_h, [srcend, -32]
		ldp D_l, D_h, [srcend, -16]
		cmp count, 64
		b.hi L(copy128)
		stp A_l, A_h, [dstin]
		stp B_l, B_h, [dstin, 16]
		stp C_l, C_h, [dstend, -32]
		stp D_l, D_h, [dstend, -16]
		ret

		.p2align 4
		/*
		* Copy 65..128 bytes.
		* Adds source bytes 32..63 (E, F). For count > 96 it also loads the 32
		* bytes at srcend - 64 (G, H). G and H alias count, dst, src and srcend,
		* so those values are destroyed by the loads; only dstin and dstend are
		* used afterwards. Every load in this path is issued before the first
		* store.
		*/
		L(copy128):
		ldp E_l, E_h, [src, 32]
		ldp F_l, F_h, [src, 48]
		cmp count, 96
		b.ls L(copy96)
		ldp G_l, G_h, [srcend, -64]
		ldp H_l, H_h, [srcend, -48]
		stp G_l, G_h, [dstend, -64]
		stp H_l, H_h, [dstend, -48]
		/* Store the first 64 and the last 32 bytes. */
		L(copy96):
		stp A_l, A_h, [dstin]
		stp B_l, B_h, [dstin, 16]
		stp E_l, E_h, [dstin, 32]
		stp F_l, F_h, [dstin, 48]
		stp C_l, C_h, [dstend, -32]
		stp D_l, D_h, [dstend, -16]
		ret

		.p2align 4
		/* Copy more than 128 bytes. */
		L(copy_long):
		/*
		* Use backwards copy if there is an overlap.
		* tmp1 = dstin - src as an unsigned value: zero means the buffers are
		* identical (nothing to do); a value below count means dstin lies inside
		* [src, src + count), so a forward copy would overwrite source bytes
		* before they are read. When dstin is below src the subtraction wraps to
		* a large value and the forward path is taken.
		*/
		sub tmp1, dstin, src
		cbz tmp1, L(copy0)
		cmp tmp1, count
		b.lo L(copy_long_backwards)

		/*
		* Copy 16 bytes and then align dst to 16-byte alignment.
		* dst is dstin rounded down to a 16-byte boundary and src is moved back
		* by the same amount (tmp1), so equal offsets from src and dst still
		* refer to corresponding bytes. The first 16 bytes are stored unaligned
		* at dstin; the stores at dst + 16 and above are 16-byte aligned.
		*/

		ldp D_l, D_h, [src]
		and tmp1, dstin, 15
		bic dst, dstin, 15
		sub src, src, tmp1
		add count, count, tmp1 /* Count is now 16 too large. */
		ldp A_l, A_h, [src, 16]
		stp D_l, D_h, [dstin]
		ldp B_l, B_h, [src, 32]
		ldp C_l, C_h, [src, 48]
		ldp D_l, D_h, [src, 64]! /* Pre-index: src advances by 64. */
		subs count, count, 128 + 16 /* Test and readjust count. */
		b.ls L(copy64_from_end)

		/*
		* Main loop: on entry A..D hold 64 bytes that were loaded but not yet
		* stored. Each iteration stores them and loads the next 64 bytes; the
		* pre-indexed accesses on D advance dst and src by 64.
		*/
		L(loop64):
		stp A_l, A_h, [dst, 16]
		ldp A_l, A_h, [src, 16]
		stp B_l, B_h, [dst, 32]
		ldp B_l, B_h, [src, 32]
		stp C_l, C_h, [dst, 48]
		ldp C_l, C_h, [src, 48]
		stp D_l, D_h, [dst, 64]!
		ldp D_l, D_h, [src, 64]!
		subs count, count, 64
		b.hi L(loop64)

		/*
		* Write the last iteration and copy 64 bytes from the end.
		* The final 64 source bytes are loaded relative to srcend while the
		* pending A..D data is stored, then written relative to dstend. This
		* tail may overlap bytes already written by the loop.
		*/
		L(copy64_from_end):
		ldp E_l, E_h, [srcend, -64]
		stp A_l, A_h, [dst, 16]
		ldp A_l, A_h, [srcend, -48]
		stp B_l, B_h, [dst, 32]
		ldp B_l, B_h, [srcend, -32]
		stp C_l, C_h, [dst, 48]
		ldp C_l, C_h, [srcend, -16]
		stp D_l, D_h, [dst, 64]
		stp E_l, E_h, [dstend, -64]
		stp A_l, A_h, [dstend, -48]
		stp B_l, B_h, [dstend, -32]
		stp C_l, C_h, [dstend, -16]
		ret

		.p2align 4

		/*
		* Large backwards copy for overlapping copies.
		* Copy 16 bytes and then align dst to 16-byte alignment.
		* Mirror image of the forward path: the last 16 bytes are stored
		* unaligned at the original dstend, then srcend and dstend are both
		* moved down by tmp1 = dstend & 15 so that the remaining stores
		* relative to dstend are 16-byte aligned.
		*/
		L(copy_long_backwards):
		ldp D_l, D_h, [srcend, -16]
		and tmp1, dstend, 15
		sub srcend, srcend, tmp1
		sub count, count, tmp1
		ldp A_l, A_h, [srcend, -16]
		stp D_l, D_h, [dstend, -16] /* dstend is still the unadjusted end here. */
		ldp B_l, B_h, [srcend, -32]
		ldp C_l, C_h, [srcend, -48]
		ldp D_l, D_h, [srcend, -64]! /* Pre-index: srcend moves down by 64. */
		sub dstend, dstend, tmp1
		subs count, count, 128
		b.ls L(copy64_from_start)

		/*
		* Main loop, descending addresses: on entry A..D hold 64 bytes that were
		* loaded but not yet stored. Each iteration stores them and loads the
		* next lower 64 bytes; the pre-indexed accesses on D move dstend and
		* srcend down by 64.
		*/
		L(loop64_backwards):
		stp A_l, A_h, [dstend, -16]
		ldp A_l, A_h, [srcend, -16]
		stp B_l, B_h, [dstend, -32]
		ldp B_l, B_h, [srcend, -32]
		stp C_l, C_h, [dstend, -48]
		ldp C_l, C_h, [srcend, -48]
		stp D_l, D_h, [dstend, -64]!
		ldp D_l, D_h, [srcend, -64]!
		subs count, count, 64
		b.hi L(loop64_backwards)

		/*
		* Write the last iteration and copy 64 bytes from the start.
		* src and dstin are unchanged on the backward path, so they still point
		* at the start of the buffers. G aliases count and dst, neither of which
		* is read after this point.
		*/
		L(copy64_from_start):
		ldp G_l, G_h, [src, 48]
		stp A_l, A_h, [dstend, -16]
		ldp A_l, A_h, [src, 32]
		stp B_l, B_h, [dstend, -32]
		ldp B_l, B_h, [src, 16]
		stp C_l, C_h, [dstend, -48]
		ldp C_l, C_h, [src]
		stp D_l, D_h, [dstend, -64]
		stp G_l, G_h, [dstin, 48]
		stp A_l, A_h, [dstin, 32]
		stp B_l, B_h, [dstin, 16]
		stp C_l, C_h, [dstin]
		ret

		/* END/EXPORT macros matching the four START macros at the entry point. */
		SYM_FUNC_END_PI(memcpy)
		EXPORT_SYMBOL(memcpy)
		SYM_FUNC_END_ALIAS(__memcpy)
		EXPORT_SYMBOL(__memcpy)
		SYM_FUNC_END_ALIAS_PI(memmove)
		EXPORT_SYMBOL(memmove)
		SYM_FUNC_END_ALIAS(__memmove)
		EXPORT_SYMBOL(__memmove)
		No newline at end of file

arch/arm64/lib/memmove.S

deleted100644 → 0

+0 −189

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-only */
		/*
		* Copyright (C) 2013 ARM Ltd.
		* Copyright (C) 2013 Linaro.
		*
		* This code is based on glibc cortex strings work originally authored by
		* Linaro; the original code can be found @
		*
		* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		* files/head:/src/aarch64/
		*/

		#include <linux/linkage.h>
		#include <asm/assembler.h>
		#include <asm/cache.h>

		/*
		* Move a buffer from src to dest (alignment handled by the hardware).
		* If dest < src, or dest >= src + n (no overlap), branch to memcpy;
		* otherwise copy in reverse order.
		*
		* Parameters:
		* x0 - dest
		* x1 - src
		* x2 - n
		* Returns:
		* x0 - dest
		*/
		/* Incoming arguments: x0 = dest, x1 = src, x2 = byte count. */
		dstin .req x0
		src .req x1
		count .req x2
		/* Scratch registers; the *w names are the 32-bit views of the same register. */
		tmp1 .req x3
		tmp1w .req w3
		tmp2 .req x4
		tmp2w .req w4
		tmp3 .req x5
		tmp3w .req w5
		/* Working destination pointer (dstin itself is left untouched). */
		dst .req x6

		/* Data temporaries, used as ldp/stp pairs of 16 bytes each. */
		A_l .req x7
		A_h .req x8
		B_l .req x9
		B_h .req x10
		C_l .req x11
		C_h .req x12
		D_l .req x13
		D_h .req x14

		/*
		* memmove entry. In: x0 = dstin, x1 = src, x2 = count.
		* Branches to __memcpy when dstin is below src or at/after src + count
		* (unsigned compares). Otherwise the copy runs backwards, from the end of
		* both buffers towards the start.
		*/
		SYM_FUNC_START_ALIAS(__memmove)
		SYM_FUNC_START_WEAK_PI(memmove)
		cmp dstin, src
		b.lo __memcpy
		add tmp1, src, count
		cmp dstin, tmp1
		b.hs __memcpy /* No overlap. */

		/* From here on dst and src point one past the end of their buffers. */
		add dst, dstin, count
		add src, src, count
		cmp count, #16
		b.lo .Ltail15 /* probably non-alignment accesses. */

		ands tmp2, src, #15 /* Bytes to reach alignment. */
		b.eq .LSrcAligned
		sub count, count, tmp2
		/*
		* First copy the tmp2 (1..15) bytes that lie above the nearest lower
		* 16-byte boundary, so that src becomes 16-byte aligned. The cost of
		* these extra instructions is acceptable, and the accesses that follow
		* then load from an aligned source address. Each bit of tmp2 selects one
		* 1-, 2-, 4- or 8-byte copy; the pre-indexed accesses move src and dst
		* down as they go.
		*/
		tbz tmp2, #0, 1f
		ldrb tmp1w, [src, #-1]!
		strb tmp1w, [dst, #-1]!
		1:
		tbz tmp2, #1, 2f
		ldrh tmp1w, [src, #-2]!
		strh tmp1w, [dst, #-2]!
		2:
		tbz tmp2, #2, 3f
		ldr tmp1w, [src, #-4]!
		str tmp1w, [dst, #-4]!
		3:
		tbz tmp2, #3, .LSrcAligned
		ldr tmp1, [src, #-8]!
		str tmp1, [dst, #-8]!

		/* Fewer than 64 bytes go straight to the tail; signed compare on count. */
		.LSrcAligned:
		cmp count, #64
		b.ge .Lcpy_over64

		/*
		* Deal with small copies quickly by dropping straight into the
		* exit block.
		*/
		.Ltail63:
		/*
		* Copy up to 48 bytes of data. At this point we only need the
		* bottom 6 bits of count to be accurate.
		* tmp1 = count & 0x30 is 0x00, 0x10, 0x20 or 0x30; the branches below
		* enter the chain of 16-byte copies so that exactly that many bytes are
		* copied (0x30 falls through all three, 0x20 enters at 1, 0x10 at 2).
		*/
		ands tmp1, count, #0x30
		b.eq .Ltail15
		cmp tmp1w, #0x20
		b.eq 1f
		b.lt 2f
		ldp A_l, A_h, [src, #-16]!
		stp A_l, A_h, [dst, #-16]!
		1:
		ldp A_l, A_h, [src, #-16]!
		stp A_l, A_h, [dst, #-16]!
		2:
		ldp A_l, A_h, [src, #-16]!
		stp A_l, A_h, [dst, #-16]!

		/*
		* Copy the final 0..15 bytes: bits 3, 2, 1 and 0 of count select an
		* 8-, 4-, 2- and 1-byte copy respectively, each moving src and dst down.
		*/
		.Ltail15:
		tbz count, #3, 1f
		ldr tmp1, [src, #-8]!
		str tmp1, [dst, #-8]!
		1:
		tbz count, #2, 2f
		ldr tmp1w, [src, #-4]!
		str tmp1w, [dst, #-4]!
		2:
		tbz count, #1, 3f
		ldrh tmp1w, [src, #-2]!
		strh tmp1w, [dst, #-2]!
		3:
		tbz count, #0, .Lexitfunc
		ldrb tmp1w, [src, #-1]
		strb tmp1w, [dst, #-1]

		.Lexitfunc:
		ret

		/*
		* At least 64 bytes remain. count is biased by -128 here; because 128 is
		* a multiple of 64 its bottom 6 bits are unchanged, which is all the
		* tail code looks at.
		*/
		.Lcpy_over64:
		subs count, count, #128
		b.ge .Lcpy_body_large
		/*
		* Less than 128 bytes to copy, so handle 64 bytes here and then jump
		* to the tail.
		*/
		ldp A_l, A_h, [src, #-16]
		stp A_l, A_h, [dst, #-16]
		ldp B_l, B_h, [src, #-32]
		ldp C_l, C_h, [src, #-48]
		stp B_l, B_h, [dst, #-32]
		stp C_l, C_h, [dst, #-48]
		/* Pre-indexed accesses move src and dst down by the 64 bytes just copied. */
		ldp D_l, D_h, [src, #-64]!
		stp D_l, D_h, [dst, #-64]!

		/* Any of the bottom 6 bits set means 1..63 bytes are still left. */
		tst count, #0x3f
		b.ne .Ltail63
		ret

		/*
		* Critical loop. Start at a new cache line boundary. Assuming
		* 64 bytes per line this ensures the entire loop is in one line.
		*/
		.p2align L1_CACHE_SHIFT
		.Lcpy_body_large:
		/* pre-load 64 bytes data. */
		ldp A_l, A_h, [src, #-16]
		ldp B_l, B_h, [src, #-32]
		ldp C_l, C_h, [src, #-48]
		ldp D_l, D_h, [src, #-64]!
		1:
		/*
		* interlace the load of next 64 bytes data block with store of the last
		* loaded 64 bytes data.
		* The pre-indexed accesses on D move dst and src down by 64 per iteration.
		*/
		stp A_l, A_h, [dst, #-16]
		ldp A_l, A_h, [src, #-16]
		stp B_l, B_h, [dst, #-32]
		ldp B_l, B_h, [src, #-32]
		stp C_l, C_h, [dst, #-48]
		ldp C_l, C_h, [src, #-48]
		stp D_l, D_h, [dst, #-64]!
		ldp D_l, D_h, [src, #-64]!
		subs count, count, #64
		b.ge 1b
		/* Store the 64 bytes loaded by the last iteration. */
		stp A_l, A_h, [dst, #-16]
		stp B_l, B_h, [dst, #-32]
		stp C_l, C_h, [dst, #-48]
		stp D_l, D_h, [dst, #-64]!

		/* Any of the bottom 6 bits of count set means 1..63 bytes are still left. */
		tst count, #0x3f
		b.ne .Ltail63
		ret
		/* END/EXPORT macros matching the START macros at the memmove entry. */
		SYM_FUNC_END_PI(memmove)
		EXPORT_SYMBOL(memmove)
		SYM_FUNC_END_ALIAS(__memmove)
		EXPORT_SYMBOL(__memmove)