Commit c4fc6328 authored by Ard Biesheuvel's avatar Ard Biesheuvel Committed by Herbert Xu
Browse files

crypto: arm64/chacha - simplify tail block handling



Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: default avatarArd Biesheuvel <ardb@kernel.org>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent 9c0cef23
Loading
Loading
Loading
Loading
+69 −124
Original line number Diff line number Diff line
@@ -195,7 +195,6 @@ SYM_FUNC_START(chacha_4block_xor_neon)
	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

	mov		x3, #64
	add		x3, x2, x4
	sub		x3, x3, #128		// start of last block

	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge
	csel		x2, x2, x3, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq
	ld1		{v16.16b-v19.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq
	ld1		{v20.16b-v23.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq
	ld1		{v24.16b-v27.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]

	tbnz		x5, #63, .Lt128
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	tbnz		x6, #63, .Lt192

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	st1		{v16.16b-v19.16b}, [x1], #64
	tbnz		x7, #63, .Lt256

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	st1		{v20.16b-v23.16b}, [x1], #64
	tbnz		x8, #63, .Lt320

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b

	st1		{v24.16b-v27.16b}, [x1], #64
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor		v20.16b, v20.16b, v28.16b
	eor		v21.16b, v21.16b, v29.16b
	eor		v22.16b, v22.16b, v30.16b
	eor		v23.16b, v23.16b, v31.16b
	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
1:	st1		{v16.16b-v19.16b}, [x1]
	b		.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	sub		x1, x1, #64
	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
	b		0b

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x6, x6, x1
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
2:	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x7, x7, x1
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
3:	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

@@ -851,7 +796,7 @@ SYM_FUNC_END(chacha_4block_xor_neon)
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		192
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr