Commit 9e51cafd authored by Robin Murphy's avatar Robin Murphy Committed by Will Deacon
Browse files

arm64: Better optimised memchr()



Although we implement our own assembly version of memchr(), it turns
out to be barely any better than what GCC can generate for the generic
C version (and would go wrong if the size_t argument were ever large
enough to be interpreted as negative). Unfortunately we can't import the
tuned implementation from the Arm optimized-routines library, since that
has some Advanced SIMD parts which are not really viable for general
kernel library code. What we can do, however, is pep things up with some
relatively straightforward word-at-a-time logic for larger calls.

Adding some timing to optimized-routines' memchr() test for a simple
benchmark, overall this version comes in around half as fast as the SIMD
code, but still nearly 4x faster than our existing implementation.

Signed-off-by: default avatarRobin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/58471b42f9287e039dafa9e5e7035077152438fd.1622128527.git.robin.murphy@arm.com


Signed-off-by: default avatarWill Deacon <will@kernel.org>
parent 28513304
Loading
Loading
Loading
Loading
+53 −12
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/lib/memchr.S
 *
 * Copyright (C) 1995-2000 Russell King
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2021 Arm Ltd.
 */

#include <linux/linkage.h>
@@ -19,16 +16,60 @@
 * Returns:
 *	x0 - address of first occurrence of 'c' or 0
 */

#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define wordcnt		x3
#define rep01		x4
#define repchr		x5
#define cur_word	x6
#define cur_byte	w6
#define tmp		x7
#define tmp2		x8

	.p2align 4
	nop
SYM_FUNC_START_WEAK_PI(memchr)
	and	w1, w1, #0xff
1:	subs	x2, x2, #1
	b.mi	2f
	ldrb	w3, [x0], #1
	cmp	w3, w1
	b.ne	1b
	sub	x0, x0, #1
	and	chrin, chrin, #0xff
	lsr	wordcnt, cntin, #3
	cbz	wordcnt, L(byte_loop)
	mov	rep01, #REP8_01
	mul	repchr, x1, rep01
	and	cntin, cntin, #7
L(word_loop):
	ldr	cur_word, [srcin], #8
	sub	wordcnt, wordcnt, #1
	eor	cur_word, cur_word, repchr
	sub	tmp, cur_word, rep01
	orr	tmp2, cur_word, #REP8_7f
	bics	tmp, tmp, tmp2
	b.ne	L(found_word)
	cbnz	wordcnt, L(word_loop)
L(byte_loop):
	cbz	cntin, L(not_found)
	ldrb	cur_byte, [srcin], #1
	sub	cntin, cntin, #1
	cmp	cur_byte, chrin
	b.ne	L(byte_loop)
	sub	srcin, srcin, #1
	ret
L(found_word):
CPU_LE(	rev	tmp, tmp)
	clz	tmp, tmp
	sub	tmp, tmp, #64
	add	result, srcin, tmp, asr #3
	ret
2:	mov	x0, #0
L(not_found):
	mov	result, #0
	ret
SYM_FUNC_END_PI(memchr)
EXPORT_SYMBOL_NOKASAN(memchr)