Commit b327dfe0 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm

Pull ARM updates from Russell King:

 - Improve Kconfig help text for Cortex A8 and Cortex A9 errata

 - Kconfig spelling and grammar fixes

 - Allow kernel-mode VFP/Neon in softirq context

 - Use Neon in softirq context

 - Implement AES-CTR/GHASH version of GCM

* tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm:
  ARM: 9289/1: Allow pre-ARMv5 builds with ld.lld 16.0.0 and newer
  ARM: 9288/1: Kconfigs: fix spelling & grammar
  ARM: 9286/1: crypto: Implement fused AES-CTR/GHASH version of GCM
  ARM: 9285/1: remove meaningless arch/arm/mach-rda/Makefile
  ARM: 9283/1: permit non-nested kernel mode NEON in softirq context
  ARM: 9282/1: vfp: Manipulate task VFP state with softirqs disabled
  ARM: 9281/1: improve Cortex A8/A9 errata help text
parents eb6d5bbe 5eb6e280
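
The "kernel-mode VFP/Neon in softirq context" items above (ARM: 9282/1, 9283/1) let kernel_neon_begin()/kernel_neon_end() be used from non-nested softirq context as well as task context. A minimal sketch of the usual usage pattern follows; xor_blocks_generic() and xor_blocks_neon() are hypothetical helpers, and this is not code from this merge:

#include <asm/neon.h>			/* kernel_neon_begin()/kernel_neon_end() */
#include <crypto/internal/simd.h>	/* crypto_simd_usable() */
#include <linux/types.h>

static void xor_blocks(u8 *dst, const u8 *src, int blocks)
{
	if (!crypto_simd_usable()) {
		/* e.g. hardirq: NEON is still off limits, fall back to scalar code */
		xor_blocks_generic(dst, src, blocks);	/* hypothetical fallback */
		return;
	}

	kernel_neon_begin();			/* now also valid in softirq context */
	xor_blocks_neon(dst, src, blocks);	/* hypothetical NEON body */
	kernel_neon_end();
}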
+25 −7
@@ -344,14 +344,16 @@ comment "CPU Core family selection"
config ARCH_MULTI_V4
	bool "ARMv4 based platforms (FA526, StrongARM)"
	depends on !ARCH_MULTI_V6_V7
	depends on !LD_IS_LLD
	# https://github.com/llvm/llvm-project/issues/50764
	depends on !LD_IS_LLD || LLD_VERSION >= 160000
	select ARCH_MULTI_V4_V5
	select CPU_FA526 if !(CPU_SA110 || CPU_SA1100)

config ARCH_MULTI_V4T
	bool "ARMv4T based platforms (ARM720T, ARM920T, ...)"
	depends on !ARCH_MULTI_V6_V7
	depends on !LD_IS_LLD
	# https://github.com/llvm/llvm-project/issues/50764
	depends on !LD_IS_LLD || LLD_VERSION >= 160000
	select ARCH_MULTI_V4_V5
	select CPU_ARM920T if !(CPU_ARM7TDMI || CPU_ARM720T || \
		CPU_ARM740T || CPU_ARM9TDMI || CPU_ARM922T || \
@@ -656,7 +658,9 @@ config ARM_ERRATA_458693
	  hazard might then cause a processor deadlock. The workaround enables
	  the L1 caching of the NEON accesses and disables the PLD instruction
	  in the ACTLR register. Note that setting specific bits in the ACTLR
	  register may not be available in non-secure mode.
	  register may not be available in non-secure mode and thus is not
	  available on a multiplatform kernel. This should be applied by the
	  bootloader instead.

config ARM_ERRATA_460075
	bool "ARM errata: Data written to the L2 cache can be overwritten with stale data"
@@ -669,7 +673,9 @@ config ARM_ERRATA_460075
	  and overwritten with stale memory contents from external memory. The
	  workaround disables the write-allocate mode for the L2 cache via the
	  ACTLR register. Note that setting specific bits in the ACTLR register
	  may not be available in non-secure mode.
	  may not be available in non-secure mode and thus is not available on
	  a multiplatform kernel. This should be applied by the bootloader
	  instead.

config ARM_ERRATA_742230
	bool "ARM errata: DMB operation may be faulty"
@@ -682,7 +688,10 @@ config ARM_ERRATA_742230
	  ordering of the two writes. This workaround sets a specific bit in
	  the diagnostic register of the Cortex-A9 which causes the DMB
	  instruction to behave as a DSB, ensuring the correct behaviour of
	  the two writes.
	  the two writes. Note that setting specific bits in the diagnostics
	  register may not be available in non-secure mode and thus is not
	  available on a multiplatform kernel. This should be applied by the
	  bootloader instead.

config ARM_ERRATA_742231
	bool "ARM errata: Incorrect hazard handling in the SCU may lead to data corruption"
@@ -697,7 +706,10 @@ config ARM_ERRATA_742231
	  replaced from one of the CPUs at the same time as another CPU is
	  accessing it. This workaround sets specific bits in the diagnostic
	  register of the Cortex-A9 which reduces the linefill issuing
	  capabilities of the processor.
	  capabilities of the processor. Note that setting specific bits in the
	  diagnostics register may not be available in non-secure mode and thus
	  is not available on a multiplatform kernel. This should be applied by
	  the bootloader instead.

config ARM_ERRATA_643719
	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
@@ -734,7 +746,9 @@ config ARM_ERRATA_743622
	  register of the Cortex-A9 which disables the Store Buffer
	  optimisation, preventing the defect from occurring. This has no
	  visible impact on the overall performance or power consumption of the
	  processor.
	  processor. Note that setting specific bits in the diagnostics register
	  may not be available in non-secure mode and thus is not available on a
	  multiplatform kernel. This should be applied by the bootloader instead.

config ARM_ERRATA_751472
	bool "ARM errata: Interrupted ICIALLUIS may prevent completion of broadcasted operation"
@@ -746,6 +760,10 @@ config ARM_ERRATA_751472
	  completion of a following broadcasted operation if the second
	  operation is received by a CPU before the ICIALLUIS has completed,
	  potentially leading to corrupted entries in the cache or TLB.
	  Note that setting specific bits in the diagnostics register may
	  not be available in non-secure mode and thus is not available on
	  a multiplatform kernel. This should be applied by the bootloader
	  instead.

config ARM_ERRATA_754322
	bool "ARM errata: possible faulty MMU translations following an ASID switch"
+2 −2
@@ -1206,8 +1206,8 @@ choice
		depends on MACH_STM32MP157
		select DEBUG_STM32_UART
		help
		  Say Y here if you want kernel low-level debugging support
		  on STM32MP1 based platforms, wich default UART is wired on
		  Say Y here if you want kernel low-level debugging support on
		  STM32MP1-based platforms, where the default UART is wired to
		  UART4, but another UART instance can be selected by modifying
		  CONFIG_DEBUG_UART_PHYS and CONFIG_DEBUG_UART_VIRT.

+0 −1
@@ -209,7 +209,6 @@ machine-$(CONFIG_ARCH_OMAP2PLUS) += omap2
machine-$(CONFIG_ARCH_ORION5X)		+= orion5x
machine-$(CONFIG_ARCH_PXA)		+= pxa
machine-$(CONFIG_ARCH_QCOM)		+= qcom
machine-$(CONFIG_ARCH_RDA)		+= rda
machine-$(CONFIG_ARCH_REALTEK)		+= realtek
machine-$(CONFIG_ARCH_ROCKCHIP)		+= rockchip
machine-$(CONFIG_ARCH_RPC)		+= rpc
+2 −0
@@ -16,8 +16,10 @@ config CRYPTO_CURVE25519_NEON
config CRYPTO_GHASH_ARM_CE
	tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)"
	depends on KERNEL_MODE_NEON
	select CRYPTO_AEAD
	select CRYPTO_HASH
	select CRYPTO_CRYPTD
	select CRYPTO_LIB_AES
	select CRYPTO_LIB_GF128MUL
	help
	  GCM GHASH function (NIST SP800-38D)
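
With the added selects (CRYPTO_AEAD, CRYPTO_LIB_AES, CRYPTO_LIB_GF128MUL) the module can register a full "gcm(aes)" AEAD. A minimal sketch, not taken from this merge, of how a synchronous in-kernel user might exercise it through the generic AEAD API; gcm_aes_encrypt_example() and its parameters are illustrative:

#include <crypto/aead.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>

/* sg holds assoclen bytes of AAD followed by cryptlen bytes of plaintext,
 * with room for the 16-byte tag appended on encryption. */
static int gcm_aes_encrypt_example(struct scatterlist *sg, unsigned int assoclen,
				   unsigned int cryptlen, u8 iv[12],
				   const u8 *key, unsigned int keylen)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct crypto_aead *tfm;
	struct aead_request *req;
	int err;

	tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen) ?:
	      crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, assoclen);
	aead_request_set_crypt(req, sg, sg, cryptlen, iv);

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}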
+369 −13
@@ -2,7 +2,8 @@
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
@@ -44,7 +45,7 @@
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
@@ -80,7 +81,7 @@

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
@@ -192,9 +193,10 @@
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
@@ -202,13 +204,32 @@
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

@@ -218,8 +239,8 @@
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2
	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
@@ -267,14 +288,22 @@

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1
@@ -293,9 +322,6 @@
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
@@ -316,6 +342,9 @@ ENTRY(pmull_ghash_update_p64)
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
@@ -336,4 +365,331 @@ ENTRY(pmull_ghash_update_p8)
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
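
For reference, the prototype comments above translate into C-side declarations like the following. The wrapper and the opaque struct gcm_key are a sketch under assumptions, not this merge's actual glue code, but any caller does need to bracket the call with kernel_neon_begin()/kernel_neon_end():

#include <asm/neon.h>
#include <linux/linkage.h>
#include <linux/types.h>

struct gcm_key;		/* defined by the glue code; treated as opaque here */

asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
				  struct gcm_key const *k, char *dst,
				  char *iv, int rounds, u32 counter);

/* illustrative wrapper: the NEON/crypto-extension code may only run
 * inside a kernel-mode NEON section */
static void gcm_encrypt_blocks(int blocks, u64 dg[2], const char *src,
			       const struct gcm_key *key, char *dst,
			       char *iv, int rounds, u32 counter)
{
	kernel_neon_begin();
	pmull_gcm_encrypt(blocks, dg, src, key, dst, iv, rounds, counter);
	kernel_neon_end();
}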