Commit fc754c02, authored by Ard Biesheuvel, committed by Herbert Xu
Browse files

crypto: arm64/crc-t10dif - move NEON yield to C code



Instead of yielding from the bowels of the asm routine if a reschedule
is needed, divide up the input into 4 KB chunks in the C glue. This
simplifies the code substantially, and avoids scheduling out the task
with the asm routine on the call stack, which is undesirable from a
CFI/instrumentation point of view.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f0070f4a
Loading
Loading
Loading
Loading
+11 −32
Original line number Diff line number Diff line
@@ -68,10 +68,10 @@
	.text
	.arch		armv8-a+crypto

	init_crc	.req	w19
	buf		.req	x20
	len		.req	x21
	fold_consts_ptr	.req	x22
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x3

	fold_consts	.req	v10

@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
	.endm

	.macro		crc_t10dif_pmull, p
	frame_push	4, 128

	mov		init_crc, w0
	mov		buf, x1
	mov		len, x2

	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.lt		.Lfold_128_bytes_loop_done_\@

	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_init_\p
	__pmull_pre_\p	fold_consts
	endif_yield_neon

	b		.Lfold_128_bytes_loop_\@

.Lfold_128_bytes_loop_done_\@:
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	frame_pop
	.ifc		\p, p8
	ldp		x29, x30, [sp], #16
	.endif
	ret

.Lless_than_256_bytes_\@:
@@ -489,6 +466,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	// Entry point for the p8 variant: set up a frame record, then expand
	// the shared crc_t10dif_pmull macro with p = p8.
	//
	// The stp/mov pair pushes x29 (FP) and x30 (LR) onto the stack and
	// points x29 at the new record. There is no matching pop here: the
	// macro body itself emits "ldp x29, x30, [sp], #16" before its ret,
	// guarded by ".ifc \p, p8", so the frame is torn down only for this
	// variant.
	// NOTE(review): presumably the frame is needed because the p8 path
	// makes a nested call that overwrites x30 - confirm against the
	// __pmull_*_p8 helper macros, which are not visible here.
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

+24 −6
Original line number Diff line number Diff line
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
	u16 *crc = shash_desc_ctx(desc);

	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		do {
			unsigned int chunk = length;

			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
				chunk = SZ_4K;

			kernel_neon_begin();
		*crc = crc_t10dif_pmull_p8(*crc, data, length);
			*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
			kernel_neon_end();
			data += chunk;
			length -= chunk;
		} while (length);
	} else {
		*crc = crc_t10dif_generic(*crc, data, length);
	}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
	u16 *crc = shash_desc_ctx(desc);

	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		do {
			unsigned int chunk = length;

			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
				chunk = SZ_4K;

			kernel_neon_begin();
		*crc = crc_t10dif_pmull_p64(*crc, data, length);
			*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
			kernel_neon_end();
			data += chunk;
			length -= chunk;
		} while (length);
	} else {
		*crc = crc_t10dif_generic(*crc, data, length);
	}