Commit 75ded18a authored by WANG Xuerui, committed by Huacai Chen
Browse files

LoongArch: Add SIMD-optimized XOR routines



Add LSX and LASX implementations of xor operations, operating on 64
bytes (one L1 cache line) at a time, for a balance between memory
utilization and instruction mix. Huacai confirmed that all future
LoongArch implementations by Loongson (that we care about) will likely
also feature 64-byte cache lines, and experiments show no throughput
improvement with further unrolling.

Performance numbers measured during system boot on a 3A5000 @ 2.5GHz:

> 8regs           : 12702 MB/sec
> 8regs_prefetch  : 10920 MB/sec
> 32regs          : 12686 MB/sec
> 32regs_prefetch : 10918 MB/sec
> lsx             : 17589 MB/sec
> lasx            : 26116 MB/sec

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
parent 2478e4b7
Loading
Loading
Loading
Loading
+68 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */
#ifndef _ASM_LOONGARCH_XOR_H
#define _ASM_LOONGARCH_XOR_H

#include <asm/cpu-features.h>
#include <asm/xor_simd.h>

#ifdef CONFIG_CPU_HAS_LSX
/*
 * XOR template named "lsx", wired to the xor_lsx_N() routines declared in
 * <asm/xor_simd.h>. The .do_N slot holds the routine that takes N buffer
 * pointers (N = 2..5).
 */
static struct xor_block_template xor_block_lsx = {
	.name = "lsx",
	.do_2 = xor_lsx_2,
	.do_3 = xor_lsx_3,
	.do_4 = xor_lsx_4,
	.do_5 = xor_lsx_5,
};

/*
 * Pass the LSX template to xor_speed(), but only when cpu_has_lsx evaluates
 * true (presumably the runtime feature test from <asm/cpu-features.h>).
 * The do { } while (0) wrapper makes the expansion a single statement.
 */
#define XOR_SPEED_LSX()					\
	do {						\
		if (cpu_has_lsx)			\
			xor_speed(&xor_block_lsx);	\
	} while (0)
#else /* CONFIG_CPU_HAS_LSX */
/* LSX support is not configured: the hook expands to nothing. */
#define XOR_SPEED_LSX()
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
/*
 * XOR template named "lasx", wired to the xor_lasx_N() routines declared in
 * <asm/xor_simd.h>. The .do_N slot holds the routine that takes N buffer
 * pointers (N = 2..5).
 */
static struct xor_block_template xor_block_lasx = {
	.name = "lasx",
	.do_2 = xor_lasx_2,
	.do_3 = xor_lasx_3,
	.do_4 = xor_lasx_4,
	.do_5 = xor_lasx_5,
};

/*
 * Pass the LASX template to xor_speed(), but only when cpu_has_lasx
 * evaluates true (presumably the runtime feature test from
 * <asm/cpu-features.h>). The do { } while (0) wrapper makes the expansion a
 * single statement.
 */
#define XOR_SPEED_LASX()					\
	do {							\
		if (cpu_has_lasx)				\
			xor_speed(&xor_block_lasx);		\
	} while (0)
#else /* CONFIG_CPU_HAS_LASX */
/* LASX support is not configured: the hook expands to nothing. */
#define XOR_SPEED_LASX()
#endif /* CONFIG_CPU_HAS_LASX */

/*
 * For grins, also test the generic routines.
 *
 * More importantly: it cannot be ruled out at this point in time, that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, maybe for errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */
#include <asm-generic/xor.h>

/*
 * Drop any earlier XOR_TRY_TEMPLATES definition and install the LoongArch
 * list: the four generic templates (8regs, 8regs_p, 32regs, 32regs_p) are
 * handed to xor_speed() first, followed by the LSX and LASX hooks. Each SIMD
 * hook expands to nothing when its CONFIG_CPU_HAS_* option is off, and
 * otherwise checks the matching cpu_has_* flag before calling xor_speed().
 *
 * NOTE(review): xor_speed() is defined elsewhere; presumably it benchmarks
 * the template so the fastest one can be selected -- confirm against its
 * definition.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	xor_speed(&xor_block_8regs);			\
	xor_speed(&xor_block_8regs_p);			\
	xor_speed(&xor_block_32regs);			\
	xor_speed(&xor_block_32regs_p);			\
	XOR_SPEED_LSX();				\
	XOR_SPEED_LASX();				\
} while (0)

#endif /* _ASM_LOONGARCH_XOR_H */
+34 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */
#ifndef _ASM_LOONGARCH_XOR_SIMD_H
#define _ASM_LOONGARCH_XOR_SIMD_H

#ifdef CONFIG_CPU_HAS_LSX
/*
 * LSX XOR entry points; the numeric suffix is the number of buffer pointers
 * taken. These are the functions installed in the .do_2 ... .do_5 slots of
 * the "lsx" xor_block_template.
 *
 * bytes:   length argument (presumably the byte count of every buffer --
 *          TODO confirm against the implementation).
 * p1:      the only non-const buffer, hence the only one that can be
 *          written; presumably both an input and the destination.
 * p2..p5:  read-only buffers.
 *
 * All pointers are __restrict-qualified, i.e. callers promise that the
 * buffers do not alias one another.
 */
void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2);
void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4);
void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
/*
 * LASX XOR entry points; the numeric suffix is the number of buffer pointers
 * taken. These are the functions installed in the .do_2 ... .do_5 slots of
 * the "lasx" xor_block_template.
 *
 * bytes:   length argument (presumably the byte count of every buffer --
 *          TODO confirm against the implementation).
 * p1:      the only non-const buffer, hence the only one that can be
 *          written; presumably both an input and the destination.
 * p2..p5:  read-only buffers.
 *
 * All pointers are __restrict-qualified, i.e. callers promise that the
 * buffers do not alias one another.
 */
void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
	        const unsigned long * __restrict p2);
void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
	        const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	        const unsigned long * __restrict p4);
void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
	        const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LASX */

#endif /* _ASM_LOONGARCH_XOR_SIMD_H */
+2 −0
Original line number Diff line number Diff line
@@ -6,4 +6,6 @@
lib-y	+= delay.o memset.o memcpy.o memmove.o \
	   clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o

# SIMD XOR routines plus their glue, built whenever LSX support is enabled.
# NOTE(review): xor_simd.c also holds the LASX variants (guarded by
# CONFIG_CPU_HAS_LASX inside the file), so this rule relies on LASX implying
# LSX -- confirm against Kconfig.
obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o

obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+93 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * LoongArch SIMD XOR operations
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */

#include "xor_simd.h"

/*
 * Process one cache line (64 bytes) per loop. This is assuming all future
 * popular LoongArch cores are similar performance-characteristics-wise to the
 * current models.
 *
 * The load/store macro groups in this file hard-code offsets that match this
 * width (0/16/32/48 for LSX, 0/32 for LASX). LINE_WIDTH is not referenced by
 * the code visible in this file; presumably xor_template.c uses it to advance
 * the buffers -- TODO confirm.
 */
#define LINE_WIDTH 64

#ifdef CONFIG_CPU_HAS_LSX

/*
 * LSX flavour of the building blocks consumed by xor_template.c. Every macro
 * expands to string literals holding assembly text, each instruction ending
 * in "\n\t" so adjacent expansions concatenate into one listing.
 */

/*
 * LD/ST: emit "vld"/"vst" on $vr<reg> with address operand %[<base>] and an
 * immediate <offset>. %[<base>] is presumably a named operand of the asm
 * statement in xor_template.c -- TODO confirm.
 */
#define LD(reg, base, offset)	\
	"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
/*
 * XOR: emit "vxor.v" with $vr<dj> as both the first and second operand and
 * $vr<k> as the third, i.e. vr<dj> ^= vr<k>.
 */
#define XOR(dj, k)	"vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"

/* Load one line from <base> into $vr0..$vr3 (offsets 0, 16, 32, 48). */
#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 16)		\
	LD(2, base, 32)		\
	LD(3, base, 48)

/*
 * Load one line from <base> into the scratch registers $vr4..$vr7, then fold
 * it into $vr0..$vr3 pairwise (vr0 ^= vr4, ..., vr3 ^= vr7).
 */
#define LD_AND_XOR_LINE(base)	\
	LD(4, base, 0)		\
	LD(5, base, 16)		\
	LD(6, base, 32)		\
	LD(7, base, 48)		\
	XOR(0, 4)		\
	XOR(1, 5)		\
	XOR(2, 6)		\
	XOR(3, 7)

/* Store $vr0..$vr3 back to <base> at offsets 0, 16, 32, 48. */
#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 16)		\
	ST(2, base, 32)		\
	ST(3, base, 48)

/*
 * Name the functions __xor_lsx_<nr>, then include the template
 * (xor_template.c, not shown here), which presumably uses the macros above
 * to define them.
 */
#define XOR_FUNC_NAME(nr) __xor_lsx_##nr
#include "xor_template.c"

/* Remove the LSX definitions so the same macro names can be redefined. */
#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX

/*
 * LASX flavour of the building blocks consumed by xor_template.c. Every
 * macro expands to string literals holding assembly text, each instruction
 * ending in "\n\t" so adjacent expansions concatenate into one listing.
 */

/*
 * LD/ST: emit "xvld"/"xvst" on $xr<reg> with address operand %[<base>] and
 * an immediate <offset>. %[<base>] is presumably a named operand of the asm
 * statement in xor_template.c -- TODO confirm.
 */
#define LD(reg, base, offset)	\
	"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
/*
 * XOR: emit "xvxor.v" with $xr<dj> as both the first and second operand and
 * $xr<k> as the third, i.e. xr<dj> ^= xr<k>.
 */
#define XOR(dj, k)	"xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"

/* Load one line from <base> into $xr0 and $xr1 (offsets 0 and 32). */
#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 32)

/*
 * Load one line from <base> into the scratch registers $xr2 and $xr3, then
 * fold it into $xr0/$xr1 pairwise (xr0 ^= xr2, xr1 ^= xr3).
 */
#define LD_AND_XOR_LINE(base)	\
	LD(2, base, 0)		\
	LD(3, base, 32)		\
	XOR(0, 2)		\
	XOR(1, 3)

/* Store $xr0 and $xr1 back to <base> at offsets 0 and 32. */
#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 32)

/*
 * Name the functions __xor_lasx_<nr>, then include the template
 * (xor_template.c, not shown here), which presumably uses the macros above
 * to define them.
 */
#define XOR_FUNC_NAME(nr) __xor_lasx_##nr
#include "xor_template.c"

/* Remove the LASX definitions so they do not leak past this section. */
#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LASX */
+38 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Simple interface to link xor_simd.c and xor_simd_glue.c
 *
 * Separating these files ensures that no SIMD instructions are run outside of
 * the kfpu critical section.
 */

#ifndef __LOONGARCH_LIB_XOR_SIMD_H
#define __LOONGARCH_LIB_XOR_SIMD_H

#ifdef CONFIG_CPU_HAS_LSX
/*
 * Raw LSX XOR routines; the numeric suffix is the number of buffer pointers
 * taken. The names match XOR_FUNC_NAME(nr) = __xor_lsx_<nr> as set up in
 * xor_simd.c before it includes xor_template.c.
 *
 * p1 is the only non-const buffer (so the only one that can be written);
 * p2..p5 are read-only. All pointers are __restrict-qualified.
 *
 * NOTE(review): these contain SIMD instructions, so per this header's
 * opening comment they are presumably meant to be called only from the glue
 * code, inside the kfpu critical section -- confirm in xor_simd_glue.c.
 */
void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2);
void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		 const unsigned long * __restrict p4);
void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		 const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
/*
 * Raw LASX XOR routines; the numeric suffix is the number of buffer pointers
 * taken. The names match XOR_FUNC_NAME(nr) = __xor_lasx_<nr> as set up in
 * xor_simd.c before it includes xor_template.c.
 *
 * p1 is the only non-const buffer (so the only one that can be written);
 * p2..p5 are read-only. All pointers are __restrict-qualified.
 *
 * NOTE(review): these contain SIMD instructions, so per this header's
 * opening comment they are presumably meant to be called only from the glue
 * code, inside the kfpu critical section -- confirm in xor_simd_glue.c.
 */
void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2);
void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3);
void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		  const unsigned long * __restrict p4);
void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		  const unsigned long * __restrict p4, const unsigned long * __restrict p5);
#endif /* CONFIG_CPU_HAS_LASX */

#endif /* __LOONGARCH_LIB_XOR_SIMD_H */
Loading