LoongArch: Add SIMD-optimized XOR routines (75ded18a) · Commits · EulixOS / Software / Kernel

arch/loongarch/include/asm/xor.h

0 → 100644

+68 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-or-later */
		/*
		* Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
		*/
		#ifndef _ASM_LOONGARCH_XOR_H
		#define _ASM_LOONGARCH_XOR_H

		#include <asm/cpu-features.h>
		#include <asm/xor_simd.h>

		#ifdef CONFIG_CPU_HAS_LSX
		/*
		 * XOR template wired to the LSX routines declared in <asm/xor_simd.h>,
		 * with one callback per operand count (2 to 5 buffers).
		 */
		static struct xor_block_template xor_block_lsx = {
		.name = "lsx",
		.do_2 = xor_lsx_2,
		.do_3 = xor_lsx_3,
		.do_4 = xor_lsx_4,
		.do_5 = xor_lsx_5,
		};

		/*
		 * Hand the LSX template to xor_speed(), but only when cpu_has_lsx
		 * evaluates to true.
		 */
		#define XOR_SPEED_LSX()					\
			do {						\
				if (cpu_has_lsx) {			\
					xor_speed(&xor_block_lsx);	\
				}					\
			} while (0)
		#else /* CONFIG_CPU_HAS_LSX */
		/* CONFIG_CPU_HAS_LSX is off: the LSX step expands to nothing. */
		#define XOR_SPEED_LSX()
		#endif /* CONFIG_CPU_HAS_LSX */

		#ifdef CONFIG_CPU_HAS_LASX
		/*
		 * XOR template wired to the LASX routines declared in <asm/xor_simd.h>,
		 * with one callback per operand count (2 to 5 buffers).
		 */
		static struct xor_block_template xor_block_lasx = {
		.name = "lasx",
		.do_2 = xor_lasx_2,
		.do_3 = xor_lasx_3,
		.do_4 = xor_lasx_4,
		.do_5 = xor_lasx_5,
		};

		/*
		 * Hand the LASX template to xor_speed(), but only when cpu_has_lasx
		 * evaluates to true.
		 */
		#define XOR_SPEED_LASX()				\
			do {						\
				if (cpu_has_lasx) {			\
					xor_speed(&xor_block_lasx);	\
				}					\
			} while (0)
		#else /* CONFIG_CPU_HAS_LASX */
		/* CONFIG_CPU_HAS_LASX is off: the LASX step expands to nothing. */
		#define XOR_SPEED_LASX()
		#endif /* CONFIG_CPU_HAS_LASX */

		/*
		* For grins, also test the generic routines.
		*
		* More importantly: it cannot be ruled out at this point in time that some
		* future (maybe reduced) models could run the vector algorithms slower than
		* the scalar ones, maybe because of errata or for micro-op reasons. It may be
		* appropriate to revisit this after one or two more uarch generations.
		*/
		#include <asm-generic/xor.h>

		/*
		 * Replace whatever XOR_TRY_TEMPLATES definition is in effect after
		 * including <asm-generic/xor.h>. The new one passes xor_block_8regs,
		 * xor_block_8regs_p, xor_block_32regs and xor_block_32regs_p to xor_speed()
		 * first, then runs the LSX and LASX steps. Each SIMD step expands to nothing
		 * when its CONFIG_CPU_HAS_* option is off.
		 */
		#undef XOR_TRY_TEMPLATES
		#define XOR_TRY_TEMPLATES \
		do { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
		XOR_SPEED_LSX(); \
		XOR_SPEED_LASX(); \
		} while (0)

		#endif /* _ASM_LOONGARCH_XOR_H */

arch/loongarch/include/asm/xor_simd.h

0 → 100644

+34 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-or-later */
		/*
		* Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
		*/
		#ifndef _ASM_LOONGARCH_XOR_SIMD_H
		#define _ASM_LOONGARCH_XOR_SIMD_H

		#ifdef CONFIG_CPU_HAS_LSX
		/*
		 * LSX XOR entry points, referenced by xor_block_lsx in <asm/xor.h>.
		 * xor_lsx_N takes a byte count plus N buffers: p1 is the only non-const
		 * pointer, p2..pN are read-only, and all of them are __restrict-qualified.
		 * NOTE(review): presumably p1 ^= p2 ^ ... ^ pN over "bytes" bytes, with the
		 * definitions living in xor_simd_glue.c - confirm against that file.
		 */
		void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
		void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3);
		void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4);
		void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4, const unsigned long * __restrict p5);
		#endif /* CONFIG_CPU_HAS_LSX */

		#ifdef CONFIG_CPU_HAS_LASX
		/*
		 * LASX XOR entry points, referenced by xor_block_lasx in <asm/xor.h>.
		 * xor_lasx_N takes a byte count plus N buffers: p1 is the only non-const
		 * pointer, p2..pN are read-only, and all of them are __restrict-qualified.
		 * NOTE(review): presumably p1 ^= p2 ^ ... ^ pN over "bytes" bytes, with the
		 * definitions living in xor_simd_glue.c - confirm against that file.
		 */
		void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
		void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3);
		void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4);
		void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4, const unsigned long * __restrict p5);
		#endif /* CONFIG_CPU_HAS_LASX */

		#endif /* _ASM_LOONGARCH_XOR_SIMD_H */

arch/loongarch/lib/Makefile

+2 −0

Original line number	Diff line number	Diff line
		@@ -6,4 +6,6 @@
		lib-y += delay.o memset.o memcpy.o memmove.o \
		clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o

		# Both objects are built whenever LSX is enabled. The LASX routines inside
		# xor_simd.c are additionally guarded by CONFIG_CPU_HAS_LASX in the source.
		# NOTE(review): this presumes CPU_HAS_LASX cannot be set without
		# CPU_HAS_LSX in Kconfig - confirm.
		obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o

		obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o

arch/loongarch/lib/xor_simd.c

0 → 100644

+93 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0-or-later
		/*
		* LoongArch SIMD XOR operations
		*
		* Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
		*/

		#include "xor_simd.h"

		/*
		 * Process one cache line (64 bytes) per loop iteration. This assumes that
		 * future popular LoongArch cores will have performance characteristics
		 * similar to those of the current models.
		 *
		 * The *_LINE macros in this file cover exactly this many bytes per
		 * expansion (4 x 16-byte slots for LSX, 2 x 32-byte slots for LASX).
		 * NOTE(review): LINE_WIDTH itself is not referenced in this file; it is
		 * presumably consumed by xor_template.c - confirm.
		 */
		#define LINE_WIDTH 64

		#ifdef CONFIG_CPU_HAS_LSX

		/*
		 * LSX building blocks. Each macro expands to one line of assembly as a
		 * string literal terminated by "\n\t":
		 *   LD(reg, base, offset):  vld $vr<reg>, %[<base>], <offset>
		 *   ST(reg, base, offset):  vst $vr<reg>, %[<base>], <offset>
		 *   XOR(dj, k):             vxor.v $vr<dj>, $vr<dj>, $vr<k>
		 * "base" is spliced in as a %[name] operand reference, so it has to match a
		 * named operand of the asm statement these strings end up in (presumably in
		 * xor_template.c, which is not visible from this file).
		 */
		#define LD(reg, base, offset) \
		"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
		#define ST(reg, base, offset) \
		"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
		#define XOR(dj, k) "vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"

		#define LD_INOUT_LINE(base) \
		LD(0, base, 0) \
		LD(1, base, 16) \
		LD(2, base, 32) \
		LD(3, base, 48)

		/*
		 * Load from "base" at offsets 0, 16, 32 and 48 into $vr4-$vr7, then XOR
		 * those into $vr0-$vr3 respectively. The results stay in $vr0-$vr3.
		 */
		#define LD_AND_XOR_LINE(base) \
		LD(4, base, 0) \
		LD(5, base, 16) \
		LD(6, base, 32) \
		LD(7, base, 48) \
		XOR(0, 4) \
		XOR(1, 5) \
		XOR(2, 6) \
		XOR(3, 7)

		#define ST_LINE(base) \
		ST(0, base, 0) \
		ST(1, base, 16) \
		ST(2, base, 32) \
		ST(3, base, 48)

		/*
		 * Instantiate the shared template for LSX. With XOR_FUNC_NAME() set like
		 * this, the included xor_template.c is expected to emit __xor_lsx_2 ..
		 * __xor_lsx_5 (the names declared in xor_simd.h) - TODO confirm against the
		 * template, which is not visible from this file.
		 */
		#define XOR_FUNC_NAME(nr) __xor_lsx_##nr
		#include "xor_template.c"

		/* Drop the LSX definitions so the same macro names can be defined again. */
		#undef LD
		#undef ST
		#undef XOR
		#undef LD_INOUT_LINE
		#undef LD_AND_XOR_LINE
		#undef ST_LINE
		#undef XOR_FUNC_NAME

		#endif /* CONFIG_CPU_HAS_LSX */

		#ifdef CONFIG_CPU_HAS_LASX

		/*
		 * LASX building blocks. Each macro expands to one line of assembly as a
		 * string literal terminated by "\n\t":
		 *   LD(reg, base, offset):  xvld $xr<reg>, %[<base>], <offset>
		 *   ST(reg, base, offset):  xvst $xr<reg>, %[<base>], <offset>
		 *   XOR(dj, k):             xvxor.v $xr<dj>, $xr<dj>, $xr<k>
		 * "base" is spliced in as a %[name] operand reference, so it has to match a
		 * named operand of the asm statement these strings end up in (presumably in
		 * xor_template.c, which is not visible from this file).
		 */
		#define LD(reg, base, offset) \
		"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
		#define ST(reg, base, offset) \
		"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
		#define XOR(dj, k) "xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"

		#define LD_INOUT_LINE(base) \
		LD(0, base, 0) \
		LD(1, base, 32)

		/*
		 * Load from "base" at offsets 0 and 32 into $xr2 and $xr3, then XOR those
		 * into $xr0 and $xr1 respectively. The results stay in $xr0 and $xr1.
		 */
		#define LD_AND_XOR_LINE(base) \
		LD(2, base, 0) \
		LD(3, base, 32) \
		XOR(0, 2) \
		XOR(1, 3)

		#define ST_LINE(base) \
		ST(0, base, 0) \
		ST(1, base, 32)

		/*
		 * Instantiate the shared template for LASX. With XOR_FUNC_NAME() set like
		 * this, the included xor_template.c is expected to emit __xor_lasx_2 ..
		 * __xor_lasx_5 (the names declared in xor_simd.h) - TODO confirm against the
		 * template, which is not visible from this file.
		 */
		#define XOR_FUNC_NAME(nr) __xor_lasx_##nr
		#include "xor_template.c"

		/* Drop the LASX definitions so they do not outlive this instantiation. */
		#undef LD
		#undef ST
		#undef XOR
		#undef LD_INOUT_LINE
		#undef LD_AND_XOR_LINE
		#undef ST_LINE
		#undef XOR_FUNC_NAME

		#endif /* CONFIG_CPU_HAS_LASX */

arch/loongarch/lib/xor_simd.h

0 → 100644

+38 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-or-later */
		/*
		* Simple interface to link xor_simd.c and xor_simd_glue.c
		*
		* Separating these files ensures that no SIMD instructions are run outside of
		* the kfpu critical section.
		*/

		#ifndef __LOONGARCH_LIB_XOR_SIMD_H
		#define __LOONGARCH_LIB_XOR_SIMD_H

		#ifdef CONFIG_CPU_HAS_LSX
		/*
		 * Raw LSX routines. xor_simd.c builds these names through XOR_FUNC_NAME()
		 * before including xor_template.c. Parameters mirror the public xor_lsx_N
		 * prototypes: a byte count, one non-const buffer p1 and N-1 const buffers.
		 * NOTE(review): going by this header's introduction, callers are expected
		 * to be inside the kfpu critical section - confirm in xor_simd_glue.c.
		 */
		void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
		void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3);
		void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4);
		void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4, const unsigned long * __restrict p5);
		#endif /* CONFIG_CPU_HAS_LSX */

		#ifdef CONFIG_CPU_HAS_LASX
		/*
		 * Raw LASX routines. xor_simd.c builds these names through XOR_FUNC_NAME()
		 * before including xor_template.c. Parameters mirror the public xor_lasx_N
		 * prototypes: a byte count, one non-const buffer p1 and N-1 const buffers.
		 * NOTE(review): going by this header's introduction, callers are expected
		 * to be inside the kfpu critical section - confirm in xor_simd_glue.c.
		 */
		void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
		void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3);
		void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4);
		void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2, const unsigned long * __restrict p3,
		const unsigned long * __restrict p4, const unsigned long * __restrict p5);
		#endif /* CONFIG_CPU_HAS_LASX */

		#endif /* __LOONGARCH_LIB_XOR_SIMD_H */