Commit ce41fefd authored by Tianjia Zhang, committed by Herbert Xu

crypto: arm64/sm4 - refactor and simplify CE implementation



This patch adds no new features; it only refactors and simplifies the
Crypto Extensions (CE) implementation of the SM4 algorithm:

Extract the SM4 Crypto Extensions macros into a shared header,
sm4-ce-asm.h, so that the subsequent CCM/GCM mode optimizations can
reuse them.
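
For example, the single-block helper later in this patch
(sm4_ce_crypt_block) reduces to two of the shared macros. A condensed
sketch, using this file's x0 (round key), x1 (dst), x2 (src)
register convention:

	SM4_PREPARE(x0)			/* load the 8 round-key vectors into v24-v31 */
	ld1	{v0.16b}, [x2]		/* load one 16-byte block */
	SM4_CRYPT_BLK(v0)		/* 8 sm4e ops = all 32 SM4 rounds, in place */
	st1	{v0.16b}, [x1]		/* store the result */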

Encryption in CBC and CFB modes now processes four blocks at a time
instead of one, allowing a single ld1 instruction to load 64 bytes of
data, which reduces unnecessary memory accesses.
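
As a sketch, the 4x CBC encryption loop introduced below
(.Lcbc_enc_loop_4x, condensed here) spends one load and one store on
four blocks, chaining them through the cipher in between:

	ld1	{v0.16b-v3.16b}, [x2], #64	/* one 64-byte load, 4 blocks */
	eor	v0.16b, v0.16b, RIV.16b		/* CBC chaining */
	SM4_CRYPT_BLK(v0)
	eor	v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor	v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor	v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)
	st1	{v0.16b-v3.16b}, [x1], #64	/* one 64-byte store, 4 blocks */
	mov	RIV.16b, v3.16b			/* last ciphertext block becomes the IV */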

The CBC/CFB/CTR paths now make full use of the free vector registers to
cut redundant memory accesses, and some instructions are rearranged to
improve out-of-order execution.
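
Both points show up in the reworked 4x CBC decryption path
(.Lcbc_dec_4x below, condensed here): the ciphertext blocks stay live
in v0-v3 and double as the chaining values, so nothing is re-read from
memory, and the independent rev32/eor instructions are grouped so the
out-of-order core can issue them in parallel:

	ld1	{v0.16b-v3.16b}, [x2], #64	/* ciphertext kept live in v0-v3 */
	rev32	v8.16b, v0.16b			/* byte-swapped copies go to v8-v11 */
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b
	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
	eor	v8.16b, v8.16b, RIV.16b		/* v0-v2 double as the per-block IVs */
	eor	v9.16b, v9.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b
	st1	{v8.16b-v11.16b}, [x1], #64
	mov	RIV.16b, v3.16b			/* carry the last ciphertext block as IV */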

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 3c383637
sm4-ce-asm.h (new file)  +209 −0
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 helper macros for Crypto Extensions
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#define SM4_PREPARE(ptr)					\
	ld1		{v24.16b-v27.16b}, [ptr], #64;		\
	ld1		{v28.16b-v31.16b}, [ptr];

#define SM4_CRYPT_BLK_BE(b0)					\
	sm4e		b0.4s, v24.4s;				\
	sm4e		b0.4s, v25.4s;				\
	sm4e		b0.4s, v26.4s;				\
	sm4e		b0.4s, v27.4s;				\
	sm4e		b0.4s, v28.4s;				\
	sm4e		b0.4s, v29.4s;				\
	sm4e		b0.4s, v30.4s;				\
	sm4e		b0.4s, v31.4s;				\
	rev64		b0.4s, b0.4s;				\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	rev32		b0.16b, b0.16b;

#define SM4_CRYPT_BLK(b0)					\
	rev32		b0.16b, b0.16b;				\
	SM4_CRYPT_BLK_BE(b0);

#define SM4_CRYPT_BLK2_BE(b0, b1)				\
	sm4e		b0.4s, v24.4s;				\
	sm4e		b1.4s, v24.4s;				\
	sm4e		b0.4s, v25.4s;				\
	sm4e		b1.4s, v25.4s;				\
	sm4e		b0.4s, v26.4s;				\
	sm4e		b1.4s, v26.4s;				\
	sm4e		b0.4s, v27.4s;				\
	sm4e		b1.4s, v27.4s;				\
	sm4e		b0.4s, v28.4s;				\
	sm4e		b1.4s, v28.4s;				\
	sm4e		b0.4s, v29.4s;				\
	sm4e		b1.4s, v29.4s;				\
	sm4e		b0.4s, v30.4s;				\
	sm4e		b1.4s, v30.4s;				\
	sm4e		b0.4s, v31.4s;				\
	sm4e		b1.4s, v31.4s;				\
	rev64		b0.4s, b0.4s;				\
	rev64		b1.4s, b1.4s;				\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	ext		b1.16b, b1.16b, b1.16b, #8;		\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;

#define SM4_CRYPT_BLK2(b0, b1)					\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	SM4_CRYPT_BLK2_BE(b0, b1);

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)			\
	sm4e		b0.4s, v24.4s;				\
	sm4e		b1.4s, v24.4s;				\
	sm4e		b2.4s, v24.4s;				\
	sm4e		b3.4s, v24.4s;				\
	sm4e		b0.4s, v25.4s;				\
	sm4e		b1.4s, v25.4s;				\
	sm4e		b2.4s, v25.4s;				\
	sm4e		b3.4s, v25.4s;				\
	sm4e		b0.4s, v26.4s;				\
	sm4e		b1.4s, v26.4s;				\
	sm4e		b2.4s, v26.4s;				\
	sm4e		b3.4s, v26.4s;				\
	sm4e		b0.4s, v27.4s;				\
	sm4e		b1.4s, v27.4s;				\
	sm4e		b2.4s, v27.4s;				\
	sm4e		b3.4s, v27.4s;				\
	sm4e		b0.4s, v28.4s;				\
	sm4e		b1.4s, v28.4s;				\
	sm4e		b2.4s, v28.4s;				\
	sm4e		b3.4s, v28.4s;				\
	sm4e		b0.4s, v29.4s;				\
	sm4e		b1.4s, v29.4s;				\
	sm4e		b2.4s, v29.4s;				\
	sm4e		b3.4s, v29.4s;				\
	sm4e		b0.4s, v30.4s;				\
	sm4e		b1.4s, v30.4s;				\
	sm4e		b2.4s, v30.4s;				\
	sm4e		b3.4s, v30.4s;				\
	sm4e		b0.4s, v31.4s;				\
	sm4e		b1.4s, v31.4s;				\
	sm4e		b2.4s, v31.4s;				\
	sm4e		b3.4s, v31.4s;				\
	rev64		b0.4s, b0.4s;				\
	rev64		b1.4s, b1.4s;				\
	rev64		b2.4s, b2.4s;				\
	rev64		b3.4s, b3.4s;				\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	ext		b1.16b, b1.16b, b1.16b, #8;		\
	ext		b2.16b, b2.16b, b2.16b, #8;		\
	ext		b3.16b, b3.16b, b3.16b, #8;		\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)				\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7)	\
	sm4e		b0.4s, v24.4s;				\
	sm4e		b1.4s, v24.4s;				\
	sm4e		b2.4s, v24.4s;				\
	sm4e		b3.4s, v24.4s;				\
	sm4e		b4.4s, v24.4s;				\
	sm4e		b5.4s, v24.4s;				\
	sm4e		b6.4s, v24.4s;				\
	sm4e		b7.4s, v24.4s;				\
	sm4e		b0.4s, v25.4s;				\
	sm4e		b1.4s, v25.4s;				\
	sm4e		b2.4s, v25.4s;				\
	sm4e		b3.4s, v25.4s;				\
	sm4e		b4.4s, v25.4s;				\
	sm4e		b5.4s, v25.4s;				\
	sm4e		b6.4s, v25.4s;				\
	sm4e		b7.4s, v25.4s;				\
	sm4e		b0.4s, v26.4s;				\
	sm4e		b1.4s, v26.4s;				\
	sm4e		b2.4s, v26.4s;				\
	sm4e		b3.4s, v26.4s;				\
	sm4e		b4.4s, v26.4s;				\
	sm4e		b5.4s, v26.4s;				\
	sm4e		b6.4s, v26.4s;				\
	sm4e		b7.4s, v26.4s;				\
	sm4e		b0.4s, v27.4s;				\
	sm4e		b1.4s, v27.4s;				\
	sm4e		b2.4s, v27.4s;				\
	sm4e		b3.4s, v27.4s;				\
	sm4e		b4.4s, v27.4s;				\
	sm4e		b5.4s, v27.4s;				\
	sm4e		b6.4s, v27.4s;				\
	sm4e		b7.4s, v27.4s;				\
	sm4e		b0.4s, v28.4s;				\
	sm4e		b1.4s, v28.4s;				\
	sm4e		b2.4s, v28.4s;				\
	sm4e		b3.4s, v28.4s;				\
	sm4e		b4.4s, v28.4s;				\
	sm4e		b5.4s, v28.4s;				\
	sm4e		b6.4s, v28.4s;				\
	sm4e		b7.4s, v28.4s;				\
	sm4e		b0.4s, v29.4s;				\
	sm4e		b1.4s, v29.4s;				\
	sm4e		b2.4s, v29.4s;				\
	sm4e		b3.4s, v29.4s;				\
	sm4e		b4.4s, v29.4s;				\
	sm4e		b5.4s, v29.4s;				\
	sm4e		b6.4s, v29.4s;				\
	sm4e		b7.4s, v29.4s;				\
	sm4e		b0.4s, v30.4s;				\
	sm4e		b1.4s, v30.4s;				\
	sm4e		b2.4s, v30.4s;				\
	sm4e		b3.4s, v30.4s;				\
	sm4e		b4.4s, v30.4s;				\
	sm4e		b5.4s, v30.4s;				\
	sm4e		b6.4s, v30.4s;				\
	sm4e		b7.4s, v30.4s;				\
	sm4e		b0.4s, v31.4s;				\
	sm4e		b1.4s, v31.4s;				\
	sm4e		b2.4s, v31.4s;				\
	sm4e		b3.4s, v31.4s;				\
	sm4e		b4.4s, v31.4s;				\
	sm4e		b5.4s, v31.4s;				\
	sm4e		b6.4s, v31.4s;				\
	sm4e		b7.4s, v31.4s;				\
	rev64		b0.4s, b0.4s;				\
	rev64		b1.4s, b1.4s;				\
	rev64		b2.4s, b2.4s;				\
	rev64		b3.4s, b3.4s;				\
	rev64		b4.4s, b4.4s;				\
	rev64		b5.4s, b5.4s;				\
	rev64		b6.4s, b6.4s;				\
	rev64		b7.4s, b7.4s;				\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	ext		b1.16b, b1.16b, b1.16b, #8;		\
	ext		b2.16b, b2.16b, b2.16b, #8;		\
	ext		b3.16b, b3.16b, b3.16b, #8;		\
	ext		b4.16b, b4.16b, b4.16b, #8;		\
	ext		b5.16b, b5.16b, b5.16b, #8;		\
	ext		b6.16b, b6.16b, b6.16b, #8;		\
	ext		b7.16b, b7.16b, b7.16b, #8;		\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)		\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;				\
	SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);
sm4-ce-core.S  +283 −363
@@ -10,10 +10,12 @@

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

@@ -34,174 +36,6 @@

#define RIV	v20

/* Helper macros. */

#define PREPARE                                       \
	ld1		{v24.16b-v27.16b}, [x0], #64; \
	ld1		{v28.16b-v31.16b}, [x0];

#define SM4_CRYPT_BLK(b0)                           \
	rev32		b0.16b, b0.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	rev32		b0.16b, b0.16b;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)              \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b4.4s, v24.4s;              \
	sm4e		b5.4s, v24.4s;              \
	sm4e		b6.4s, v24.4s;              \
	sm4e		b7.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b4.4s, v25.4s;              \
	sm4e		b5.4s, v25.4s;              \
	sm4e		b6.4s, v25.4s;              \
	sm4e		b7.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b4.4s, v26.4s;              \
	sm4e		b5.4s, v26.4s;              \
	sm4e		b6.4s, v26.4s;              \
	sm4e		b7.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b4.4s, v27.4s;              \
	sm4e		b5.4s, v27.4s;              \
	sm4e		b6.4s, v27.4s;              \
	sm4e		b7.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b4.4s, v28.4s;              \
	sm4e		b5.4s, v28.4s;              \
	sm4e		b6.4s, v28.4s;              \
	sm4e		b7.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b4.4s, v29.4s;              \
	sm4e		b5.4s, v29.4s;              \
	sm4e		b6.4s, v29.4s;              \
	sm4e		b7.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b4.4s, v30.4s;              \
	sm4e		b5.4s, v30.4s;              \
	sm4e		b6.4s, v30.4s;              \
	sm4e		b7.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	sm4e		b4.4s, v31.4s;              \
	sm4e		b5.4s, v31.4s;              \
	sm4e		b6.4s, v31.4s;              \
	sm4e		b7.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	rev64		b4.4s, b4.4s;               \
	rev64		b5.4s, b5.4s;               \
	rev64		b6.4s, b6.4s;               \
	rev64		b7.4s, b7.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	ext		b4.16b, b4.16b, b4.16b, #8; \
	ext		b5.16b, b5.16b, b5.16b, #8; \
	ext		b6.16b, b6.16b, b6.16b, #8; \
	ext		b7.16b, b7.16b, b7.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
@@ -268,7 +102,7 @@ SYM_FUNC_START(sm4_ce_crypt_block)
	 *   x1: dst
	 *   x2: src
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
@@ -285,7 +119,7 @@ SYM_FUNC_START(sm4_ce_crypt)
	 *   x2: src
	 *   w3: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
@@ -337,26 +171,50 @@ SYM_FUNC_START(sm4_ce_cbc_enc)
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{RIV.16b}, [x3];
	ld1		{v0.16b-v3.16b}, [x2], #64

.Lcbc_enc_loop:
	sub		w4, w4, #1;
	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;
	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	SM4_CRYPT_BLK(RIV);
	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

	st1		{RIV.16b}, [x1], #16;
.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	cbnz		w4, .Lcbc_enc_loop;
	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];
	st1		{RIV.16b}, [x3]

	ret;
	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
@@ -368,79 +226,93 @@ SYM_FUNC_START(sm4_ce_cbc_dec)
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3];
	ld1		{RIV.16b}, [x3]

.Lcbc_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcbc_tail8;
.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2];
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	sub		x2, x2, #64;
	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;
	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v4.16b, v4.16b, RTMP3.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v5.16b, v5.16b, RTMP0.16b;
	eor		v6.16b, v6.16b, RTMP1.16b;
	eor		v7.16b, v7.16b, RTMP2.16b;
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	mov		RIV.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;
	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	cbz		w4, .Lcbc_end;
	b		.Lcbc_loop_blk;
	mov		RIV.16b, v7.16b

.Lcbc_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcbc_tail4;
	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

	sub		w4, w4, #4;
.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	ld1		{v0.16b-v3.16b}, [x2];
	sub		w4, w4, #4

	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	mov		RIV.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;
	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	cbz		w4, .Lcbc_end;
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

.Lcbc_tail4:
	sub		w4, w4, #1;
	st1		{v8.16b-v11.16b}, [x1], #64

	ld1		{v0.16b}, [x2];
	mov		RIV.16b, v3.16b

	SM4_CRYPT_BLK(v0);
	cbz		w4, .Lcbc_dec_end

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RIV.16b}, [x2], #16;
	st1		{v0.16b}, [x1], #16;
.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	cbnz		w4, .Lcbc_tail4;
	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

.Lcbc_end:
	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];
	st1		{RIV.16b}, [x3]

	ret;
	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
@@ -452,25 +324,57 @@ SYM_FUNC_START(sm4_ce_cfb_enc)
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcfb_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcfb_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	ld1		{RIV.16b}, [x3];
	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

.Lcfb_enc_loop:
	sub		w4, w4, #1;
	cbz		w4, .Lcfb_enc_end
	b		.Lcfb_enc_loop_4x

	SM4_CRYPT_BLK(RIV);
.Lcfb_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;
	st1		{RIV.16b}, [x1], #16;
	ld1		{v0.16b}, [x2], #16

	cbnz		w4, .Lcfb_enc_loop;
	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];
	st1		{RIV.16b}, [x3]

	ret;
	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
@@ -482,79 +386,91 @@ SYM_FUNC_START(sm4_ce_cfb_dec)
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x3];
	ld1		{RIV.16b}, [x3]

.Lcfb_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcfb_tail8;
.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1		{v4.16b-v7.16b}, [x2];
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	sub		x2, x2, #48;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;
	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;
	mov		RIV.16b, v7.16b

	mov		v0.16b, RTMP3.16b;
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	cbz		w4, .Lcfb_end;
	b		.Lcfb_loop_blk;
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

.Lcfb_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcfb_tail4;
	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

	sub		w4, w4, #4;
.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_loop_1x

	ld1		{v1.16b, v2.16b, v3.16b}, [x2];
	sub		w4, w4, #4

	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	ld1		{v0.16b-v3.16b}, [x2], #64

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;
	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	mov		v0.16b, RTMP3.16b;
	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	cbz		w4, .Lcfb_end;
	mov		RIV.16b, v3.16b

.Lcfb_tail4:
	sub		w4, w4, #1;
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK(v0);
	st1		{v0.16b-v3.16b}, [x1], #64

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;
	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	mov		v0.16b, RTMP0.16b;
	SM4_CRYPT_BLK(RIV)

	cbnz		w4, .Lcfb_tail4;
	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

.Lcfb_end:
	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3];
	st1		{RIV.16b}, [x3]

	ret;
	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
@@ -566,95 +482,99 @@ SYM_FUNC_START(sm4_ce_ctr_enc)
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3];
	rev		x7, x7;
	rev		x8, x8;
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lctr_tail8;
.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;
	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;
	ld1		{v8.16b-v11.16b}, [x2], #64

	cbz		w4, .Lctr_end;
	b		.Lctr_loop_blk;
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

.Lctr_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lctr_tail4;
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	sub		w4, w4, #4;
	st1		{v0.16b-v3.16b}, [x1], #64

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	cbz		w4, .Lctr_end

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w4, .Lctr_end;

.Lctr_tail4:
	sub		w4, w4, #1;
.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0);
	inc_le128(v0)

	SM4_CRYPT_BLK(v0);
	ld1		{v8.16b}, [x2], #16

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;
	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_tail4;
	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7;
	rev		x8, x8;
	stp		x7, x8, [x3];
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret;
	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
One more file changed (+27 −37); its diff is collapsed because the preview size limit was exceeded.