Commit ad00d41b authored by Ard Biesheuvel's avatar Ard Biesheuvel Committed by Herbert Xu
Browse files

crypto: aegis128/neon - optimize tail block handling



Avoid copying the tail block via a stack buffer if the total size
exceeds a single AEGIS block. In this case, we can use overlapping
loads and stores and NEON permutation instructions instead, which
leads to a modest performance improvement on some cores (< 5%),
and is slightly cleaner. Note that we still need to use a stack
buffer if the entire input is smaller than 16 bytes, given that
we cannot use 16 byte NEON loads and stores safely in this case.

Signed-off-by: default avatarArd Biesheuvel <ardb@kernel.org>
Reviewed-by: default avatarOndrej Mosnacek <omosnacek@gmail.com>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent 02685906
Loading
Loading
Loading
Loading
+75 −14
Original line number Diff line number Diff line
@@ -20,7 +20,6 @@
extern int aegis128_have_aes_insn;

void *memcpy(void *dest, const void *src, size_t n);
void *memset(void *s, int c, size_t n);

struct aegis128_state {
	uint8x16_t v[5];
@@ -173,10 +172,46 @@ void crypto_aegis128_update_neon(void *state, const void *msg)
	aegis128_save_state_neon(st, state);
}

#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides 64-bit
 * wide vtbl.8/vtbx.8 instruction, so use those instead.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} __a = { a };

	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
			   vtbl2_u8(__a.pair, vget_high_u8(b)));
}

static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} __a = { a };

	return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
			   vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
}
#endif

static const uint8_t permute[] __aligned(64) = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	const int short_input = size < AEGIS_BLOCK_SIZE;
	uint8x16_t msg;

	preload_sbox();
@@ -186,7 +221,8 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,

		msg = vld1q_u8(src);
		st = aegis128_update_neon(st, msg);
		vst1q_u8(dst, msg ^ s);
		msg ^= s;
		vst1q_u8(dst, msg);

		size -= AEGIS_BLOCK_SIZE;
		src += AEGIS_BLOCK_SIZE;
@@ -195,13 +231,26 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,

	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE] = {};
		uint8_t buf[AEGIS_BLOCK_SIZE];
		const void *in = src;
		void *out = dst;
		uint8x16_t m;

		memcpy(buf, src, size);
		msg = vld1q_u8(buf);
		st = aegis128_update_neon(st, msg);
		vst1q_u8(buf, msg ^ s);
		memcpy(dst, buf, size);
		if (__builtin_expect(short_input, 0))
			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

		m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
			       vld1q_u8(permute + 32 - size));

		st = aegis128_update_neon(st, m);

		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
			 vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));

		if (__builtin_expect(short_input, 0))
			memcpy(dst, out, size);
		else
			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
	}

	aegis128_save_state_neon(st, state);
@@ -211,6 +260,7 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	const int short_input = size < AEGIS_BLOCK_SIZE;
	uint8x16_t msg;

	preload_sbox();
@@ -228,14 +278,25 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE];
		const void *in = src;
		void *out = dst;
		uint8x16_t m;

		vst1q_u8(buf, s);
		memcpy(buf, src, size);
		msg = vld1q_u8(buf) ^ s;
		vst1q_u8(buf, msg);
		memcpy(dst, buf, size);
		if (__builtin_expect(short_input, 0))
			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

		st = aegis128_update_neon(st, msg);
		m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
				   vld1q_u8(permute + 32 - size));

		st = aegis128_update_neon(st, m);

		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
			 vqtbl1q_u8(m, vld1q_u8(permute + size)));

		if (__builtin_expect(short_input, 0))
			memcpy(dst, out, size);
		else
			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
	}

	aegis128_save_state_neon(st, state);