Commit 2c54b423 authored by Ard Biesheuvel, committed by Catalin Marinas

arm64/xor: use EOR3 instructions when available



Use the EOR3 instruction to implement xor_blocks() if the instruction is
available, which is the case if the CPU implements the SHA-3 extension.
This is about 20% faster on Apple M1 when using the 5-way version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20211213140252.2856053-1-ardb@kernel.org


Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent d58071a8
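
EOR3 is part of the Armv8.2 SHA-3 extension and XORs three 128-bit vector registers in a single instruction, so each pair of EOR operations in the existing NEON helpers can be folded into one EOR3. As a minimal scalar sketch of the semantics being accelerated (the function names below are illustrative and not part of the commit), the n-way xor_blocks() helpers simply XOR every source buffer into p1:

/*
 * Illustrative scalar reference, not part of the commit: the 3-way and
 * 5-way helpers XOR their source buffers into p1 word by word. With
 * EOR3, each "a ^ b ^ c" maps to one instruction on 128-bit vectors
 * instead of two EORs.
 */
#include <stddef.h>

static void xor_3_ref(size_t bytes, unsigned long *p1,
		      const unsigned long *p2, const unsigned long *p3)
{
	for (size_t i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i] ^ p3[i];
}

static void xor_5_ref(size_t bytes, unsigned long *p1,
		      const unsigned long *p2, const unsigned long *p3,
		      const unsigned long *p4, const unsigned long *p5)
{
	for (size_t i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i];
}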
arch/arm64/Kconfig +6 −0
@@ -1545,6 +1545,12 @@ endmenu

menu "ARMv8.2 architectural features"

config AS_HAS_ARMV8_2
       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)

config AS_HAS_SHA3
       def_bool $(as-instr,.arch armv8.2-a+sha3)

config ARM64_PMEM
	bool "Enable support for persistent memory"
	select ARCH_HAS_PMEM_API
arch/arm64/Makefile +5 −0
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
					include/generated/asm-offsets.h))
endif

ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
# make sure to pass the newest target architecture to -march.
asm-arch := armv8.2-a
endif

# Ensure that if the compiler supports branch protection we default it
# off, this will be overridden if we are using branch protection.
branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
arch/arm64/lib/xor-neon.c +146 −1
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	} while (--lines > 0);
}

-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
};
EXPORT_SYMBOL(xor_block_inner_neon);

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
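
For completeness, here is a minimal user-space sketch (an assumption, not part of the commit) that exercises the same inline-asm wrapper as the patch, minus the kernel-only ARM64_ASM_PREAMBLE, to confirm that EOR3 computes a three-way XOR. It assumes a toolchain and CPU supporting the SHA-3 extension, e.g. compiled with -march=armv8.2-a+sha3:

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

int main(void)
{
	uint64_t a[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
	uint64_t b[2] = { 0xdeadbeefdeadbeefULL, 0x0011223344556677ULL };
	uint64_t c[2] = { 0xcafebabecafebabeULL, 0x8899aabbccddeeffULL };
	uint64_t out[2];

	vst1q_u64(out, eor3(vld1q_u64(a), vld1q_u64(b), vld1q_u64(c)));

	/* EOR3 Vd.16B, Vn.16B, Vm.16B, Va.16B == Vn ^ Vm ^ Va */
	for (int i = 0; i < 2; i++)
		assert(out[i] == (a[i] ^ b[i] ^ c[i]));
	return 0;
}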