lib/crypto: arm64/aes: Migrate optimized code into library
Move the ARM64 optimized AES key expansion and single-block AES
en/decryption code into lib/crypto/, wire it up to the AES library API,
and remove the superseded crypto_cipher algorithms. The result is that
both the AES library and crypto_cipher APIs are now optimized for ARM64,
whereas previously only crypto_cipher was (and the optimizations weren't
enabled by default, which this fixes as well).

Note: to see the diff from arch/arm64/crypto/aes-ce-glue.c to
lib/crypto/arm64/aes.h, view this commit with 'git show -M10'.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260112192035.10427-12-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
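[Editor's note] For orientation, a minimal sketch, not part of the commit, of how a kernel caller uses the AES library API that this change accelerates. It assumes only the long-standing <crypto/aes.h> interface (struct crypto_aes_ctx, aes_expandkey(), aes_encrypt()), not the new key-preparation types this series introduces; the function name is illustrative.

	/*
	 * Hedged sketch: single-block AES via the kernel's AES library
	 * API (CRYPTO_LIB_AES).  With this commit, these calls reach the
	 * ARM64-optimized code on arm64 instead of the generic C tables.
	 */
	#include <crypto/aes.h>
	#include <linux/string.h>

	static int example_aes_one_block(const u8 *key, unsigned int key_len,
					 const u8 in[AES_BLOCK_SIZE],
					 u8 out[AES_BLOCK_SIZE])
	{
		struct crypto_aes_ctx ctx;
		int err;

		err = aes_expandkey(&ctx, key, key_len); /* 16/24/32-byte keys */
		if (err)
			return err;

		aes_encrypt(&ctx, out, in);		/* one 16-byte block */

		memzero_explicit(&ctx, sizeof(ctx));	/* wipe expanded key */
		return 0;
	}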
lib/crypto/Kconfig

@@ -15,6 +15,7 @@ config CRYPTO_LIB_AES_ARCH
 	bool
 	depends on CRYPTO_LIB_AES && !UML && !KMSAN
 	default y if ARM
+	default y if ARM64
 
 config CRYPTO_LIB_AESCFB
 	tristate
lib/crypto/Makefile

@@ -24,6 +24,11 @@ CFLAGS_aes.o += -I$(src)/$(SRCARCH)
 
 libaes-$(CONFIG_ARM) += arm/aes-cipher-core.o
 
+ifeq ($(CONFIG_ARM64),y)
+libaes-y += arm64/aes-cipher-core.o
+libaes-$(CONFIG_KERNEL_MODE_NEON) += arm64/aes-ce-core.o
+endif
+
 endif # CONFIG_CRYPTO_LIB_AES_ARCH
 
 ################################################################################
lib/crypto/arm64/aes-ce-core.S (new file, 84 lines)
@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a+crypto

SYM_FUNC_START(__aes_ce_encrypt)
	sub		w3, w3, #2
	ld1		{v0.16b}, [x2]
	ld1		{v1.4s}, [x0], #16
	cmp		w3, #10
	bmi		0f
	bne		3f
	mov		v3.16b, v1.16b
	b		2f
0:	mov		v2.16b, v1.16b
	ld1		{v3.4s}, [x0], #16
1:	aese		v0.16b, v2.16b
	aesmc		v0.16b, v0.16b
2:	ld1		{v1.4s}, [x0], #16
	aese		v0.16b, v3.16b
	aesmc		v0.16b, v0.16b
3:	ld1		{v2.4s}, [x0], #16
	subs		w3, w3, #3
	aese		v0.16b, v1.16b
	aesmc		v0.16b, v0.16b
	ld1		{v3.4s}, [x0], #16
	bpl		1b
	aese		v0.16b, v2.16b
	eor		v0.16b, v0.16b, v3.16b
	st1		{v0.16b}, [x1]
	ret
SYM_FUNC_END(__aes_ce_encrypt)

SYM_FUNC_START(__aes_ce_decrypt)
	sub		w3, w3, #2
	ld1		{v0.16b}, [x2]
	ld1		{v1.4s}, [x0], #16
	cmp		w3, #10
	bmi		0f
	bne		3f
	mov		v3.16b, v1.16b
	b		2f
0:	mov		v2.16b, v1.16b
	ld1		{v3.4s}, [x0], #16
1:	aesd		v0.16b, v2.16b
	aesimc		v0.16b, v0.16b
2:	ld1		{v1.4s}, [x0], #16
	aesd		v0.16b, v3.16b
	aesimc		v0.16b, v0.16b
3:	ld1		{v2.4s}, [x0], #16
	subs		w3, w3, #3
	aesd		v0.16b, v1.16b
	aesimc		v0.16b, v0.16b
	ld1		{v3.4s}, [x0], #16
	bpl		1b
	aesd		v0.16b, v2.16b
	eor		v0.16b, v0.16b, v3.16b
	st1		{v0.16b}, [x1]
	ret
SYM_FUNC_END(__aes_ce_decrypt)

/*
 * __aes_ce_sub() - use the aese instruction to perform the AES sbox
 *                  substitution on each byte in 'input'
 */
SYM_FUNC_START(__aes_ce_sub)
	dup		v1.4s, w0
	movi		v0.16b, #0
	aese		v0.16b, v1.16b
	umov		w0, v0.s[0]
	ret
SYM_FUNC_END(__aes_ce_sub)

SYM_FUNC_START(__aes_ce_invert)
	ld1		{v0.4s}, [x1]
	aesimc		v1.16b, v0.16b
	st1		{v1.4s}, [x0]
	ret
SYM_FUNC_END(__aes_ce_invert)
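[Editor's note] To make the trick in __aes_ce_sub() concrete: AESE computes SubBytes(ShiftRows(state ^ key)), and ShiftRows is a no-op on a state whose four columns are identical, so duplicating the input word across a zeroed state and running one AESE yields plain SubWord(input). A hedged C model follows; it assumes the crypto_aes_sbox[] table that lib/crypto/aes.c exports via <crypto/aes.h>.

	/*
	 * Hedged C model of __aes_ce_sub(): apply the AES S-box to each
	 * of the four bytes of a 32-bit word (FIPS-197 SubWord()).
	 */
	#include <crypto/aes.h>

	static u32 aes_sub_model(u32 input)
	{
		u32 out = 0;
		int i;

		for (i = 0; i < 4; i++)
			out |= (u32)crypto_aes_sbox[(input >> (8 * i)) & 0xff]
				<< (8 * i);
		return out;
	}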
lib/crypto/arm64/aes-cipher-core.S (new file, 132 lines)
@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Scalar AES core transform
 *
 * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text

	rk		.req	x0
	out		.req	x1
	in		.req	x2
	rounds		.req	x3
	tt		.req	x2

	.macro		__pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
	.ifc		\op\shift, b0
	ubfiz		\reg0, \in0, #2, #8
	ubfiz		\reg1, \in1e, #2, #8
	.else
	ubfx		\reg0, \in0, #\shift, #8
	ubfx		\reg1, \in1e, #\shift, #8
	.endif

	/*
	 * AArch64 cannot do byte size indexed loads from a table containing
	 * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
	 * valid instruction.  So perform the shift explicitly first for the
	 * high bytes (the low byte is shifted implicitly by using ubfiz rather
	 * than ubfx above)
	 */
	.ifnc		\op, b
	ldr		\reg0, [tt, \reg0, uxtw #2]
	ldr		\reg1, [tt, \reg1, uxtw #2]
	.else
	.if		\shift > 0
	lsl		\reg0, \reg0, #2
	lsl		\reg1, \reg1, #2
	.endif
	ldrb		\reg0, [tt, \reg0, uxtw]
	ldrb		\reg1, [tt, \reg1, uxtw]
	.endif
	.endm

	.macro		__pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
	ubfx		\reg0, \in0, #\shift, #8
	ubfx		\reg1, \in1d, #\shift, #8
	ldr\op		\reg0, [tt, \reg0, uxtw #\sz]
	ldr\op		\reg1, [tt, \reg1, uxtw #\sz]
	.endm

	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
	ldp		\out0, \out1, [rk], #8

	__pair\enc	\sz, \op, w12, w13, \in0, \in1, \in3, 0
	__pair\enc	\sz, \op, w14, w15, \in1, \in2, \in0, 8
	__pair\enc	\sz, \op, w16, w17, \in2, \in3, \in1, 16
	__pair\enc	\sz, \op, \t0, \t1, \in3, \in0, \in2, 24

	eor		\out0, \out0, w12
	eor		\out1, \out1, w13
	eor		\out0, \out0, w14, ror #24
	eor		\out1, \out1, w15, ror #24
	eor		\out0, \out0, w16, ror #16
	eor		\out1, \out1, w17, ror #16
	eor		\out0, \out0, \t0, ror #8
	eor		\out1, \out1, \t1, ror #8
	.endm

	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
	.endm

	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
	.endm

	.macro		do_crypt, round, ttab, ltab, bsz
	ldp		w4, w5, [in]
	ldp		w6, w7, [in, #8]
	ldp		w8, w9, [rk], #16
	ldp		w10, w11, [rk, #-8]

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	eor		w4, w4, w8
	eor		w5, w5, w9
	eor		w6, w6, w10
	eor		w7, w7, w11

	adr_l		tt, \ttab

	tbnz		rounds, #1, 1f

0:	\round		w8, w9, w10, w11, w4, w5, w6, w7
	\round		w4, w5, w6, w7, w8, w9, w10, w11

1:	subs		rounds, rounds, #4
	\round		w8, w9, w10, w11, w4, w5, w6, w7
	b.ls		3f
2:	\round		w4, w5, w6, w7, w8, w9, w10, w11
	b		0b
3:	adr_l		tt, \ltab
	\round		w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	stp		w4, w5, [out]
	stp		w6, w7, [out, #8]
	ret
	.endm

SYM_FUNC_START(__aes_arm64_encrypt)
	do_crypt	fround, aes_enc_tab, aes_enc_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)

	.align		5
SYM_FUNC_START(__aes_arm64_decrypt)
	do_crypt	iround, aes_dec_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)
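[Editor's note] A hedged C model of one output column of the __hround macro above, to show what the table lookups and rotations compute. Here ft[] stands in for the 256-entry u32 encryption table (aes_enc_tab in this commit), and the ror32() amounts mirror the ror #24/#16/#8 operands in the assembly.

	/*
	 * Hedged C model of one forward-round output word from __hround:
	 * round key XORed with four rotated table lookups, one per input
	 * byte position.  ror32() is the kernel helper from
	 * <linux/bitops.h>.
	 */
	#include <linux/bitops.h>
	#include <linux/types.h>

	static u32 fround_column_model(const u32 *ft, u32 rk,
				       u32 in0, u32 in1, u32 in2, u32 in3)
	{
		return rk ^ ft[in0 & 0xff]
			  ^ ror32(ft[(in1 >> 8) & 0xff], 24)
			  ^ ror32(ft[(in2 >> 16) & 0xff], 16)
			  ^ ror32(ft[(in3 >> 24) & 0xff], 8);
	}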
lib/crypto/arm64/aes.h (new file, 164 lines)
@@ -0,0 +1,164 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AES block cipher, optimized for ARM64
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 * Copyright 2026 Google LLC
 */

#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/unaligned.h>
#include <linux/cpufeature.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);

struct aes_block {
	u8 b[AES_BLOCK_SIZE];
};

asmlinkage void __aes_arm64_encrypt(const u32 rk[], u8 out[AES_BLOCK_SIZE],
				    const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_arm64_decrypt(const u32 inv_rk[], u8 out[AES_BLOCK_SIZE],
				    const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_ce_encrypt(const u32 rk[], u8 out[AES_BLOCK_SIZE],
				 const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_ce_decrypt(const u32 inv_rk[], u8 out[AES_BLOCK_SIZE],
				 const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage u32 __aes_ce_sub(u32 l);
asmlinkage void __aes_ce_invert(struct aes_block *out,
				const struct aes_block *in);

/*
 * Expand an AES key using the crypto extensions if supported and usable, or
 * generic code otherwise.  The expanded key format is compatible between the
 * two cases.  The outputs are @rndkeys (required) and @inv_rndkeys (optional).
 */
static void aes_expandkey_arm64(u32 rndkeys[], u32 *inv_rndkeys,
				const u8 *in_key, int key_len, int nrounds)
{
	/*
	 * The AES key schedule round constants
	 */
	static u8 const rcon[] = {
		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
	};

	u32 kwords = key_len / sizeof(u32);
	struct aes_block *key_enc, *key_dec;
	int i, j;

	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ||
	    !static_branch_likely(&have_aes) || unlikely(!may_use_simd())) {
		aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
		return;
	}

	for (i = 0; i < kwords; i++)
		rndkeys[i] = get_unaligned_le32(&in_key[i * sizeof(u32)]);

	scoped_ksimd() {
		for (i = 0; i < sizeof(rcon); i++) {
			u32 *rki = &rndkeys[i * kwords];
			u32 *rko = rki + kwords;

			rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
				 rcon[i] ^ rki[0];
			rko[1] = rko[0] ^ rki[1];
			rko[2] = rko[1] ^ rki[2];
			rko[3] = rko[2] ^ rki[3];

			if (key_len == AES_KEYSIZE_192) {
				if (i >= 7)
					break;
				rko[4] = rko[3] ^ rki[4];
				rko[5] = rko[4] ^ rki[5];
			} else if (key_len == AES_KEYSIZE_256) {
				if (i >= 6)
					break;
				rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
				rko[5] = rko[4] ^ rki[5];
				rko[6] = rko[5] ^ rki[6];
				rko[7] = rko[6] ^ rki[7];
			}
		}

		/*
		 * Generate the decryption keys for the Equivalent Inverse
		 * Cipher.  This involves reversing the order of the round
		 * keys, and applying the Inverse Mix Columns transformation on
		 * all but the first and the last one.
		 */
		if (inv_rndkeys) {
			key_enc = (struct aes_block *)rndkeys;
			key_dec = (struct aes_block *)inv_rndkeys;
			j = nrounds;

			key_dec[0] = key_enc[j];
			for (i = 1, j--; j > 0; i++, j--)
				__aes_ce_invert(key_dec + i, key_enc + j);
			key_dec[i] = key_enc[0];
		}
	}
}

static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	aes_expandkey_arm64(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
			    in_key, key_len, nrounds);
}

/*
 * This is here temporarily until the remaining AES mode implementations are
 * migrated from arch/arm64/crypto/ to lib/crypto/arm64/.
 */
int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
		     unsigned int key_len)
{
	if (aes_check_keylen(key_len) != 0)
		return -EINVAL;
	ctx->key_length = key_len;
	aes_expandkey_arm64(ctx->key_enc, ctx->key_dec, in_key, key_len,
			    6 + key_len / 4);
	return 0;
}
EXPORT_SYMBOL(ce_aes_expandkey);

static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
	    static_branch_likely(&have_aes) && likely(may_use_simd())) {
		scoped_ksimd()
			__aes_ce_encrypt(key->k.rndkeys, out, in, key->nrounds);
	} else {
		__aes_arm64_encrypt(key->k.rndkeys, out, in, key->nrounds);
	}
}

static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
	    static_branch_likely(&have_aes) && likely(may_use_simd())) {
		scoped_ksimd()
			__aes_ce_decrypt(key->inv_k.inv_rndkeys, out, in,
					 key->nrounds);
	} else {
		__aes_arm64_decrypt(key->inv_k.inv_rndkeys, out, in,
				    key->nrounds);
	}
}

#ifdef CONFIG_KERNEL_MODE_NEON
#define aes_mod_init_arch aes_mod_init_arch
static void aes_mod_init_arch(void)
{
	if (cpu_have_named_feature(AES))
		static_branch_enable(&have_aes);
}
#endif /* CONFIG_KERNEL_MODE_NEON */
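[Editor's note] For intuition about the scoped_ksimd() loop in aes_expandkey_arm64() above, a hedged scalar model of one AES-128 expansion step. sub_word() is a hypothetical stand-in for __aes_ce_sub() (apply the AES S-box to each byte of a word; see the model after aes-ce-core.S).

	/*
	 * Hedged model of one AES-128 key-expansion step: rki[] is the
	 * current round key, rko[] receives the next one.
	 */
	#include <linux/bitops.h>
	#include <linux/types.h>

	extern u32 sub_word(u32 w);	/* assumed: per-byte S-box lookup */

	static void aes128_expand_step_model(const u32 rki[4], u32 rko[4],
					     u8 rcon_i)
	{
		/* FIPS-197: SubWord(RotWord(w[i-1])) ^ Rcon ^ w[i-4] */
		rko[0] = ror32(sub_word(rki[3]), 8) ^ rcon_i ^ rki[0];
		rko[1] = rko[0] ^ rki[1];
		rko[2] = rko[1] ^ rki[2];
		rko[3] = rko[2] ^ rki[3];
	}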
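[Editor's note] Finally, the dispatch shape shared by aes_encrypt_arch() and aes_decrypt_arch() above, reduced to a hedged standalone sketch. The names op_fast(), op_scalar(), and have_fast_path are illustrative, not from the commit: a static key flipped once at init selects the fast path, and may_use_simd() guards contexts where the NEON register state cannot be touched.

	/* Hedged sketch of the static-key dispatch pattern used above. */
	#include <asm/simd.h>
	#include <linux/cpufeature.h>
	#include <linux/jump_label.h>

	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_fast_path);

	static void op_fast(void)   { /* e.g. __aes_ce_encrypt(), needs NEON */ }
	static void op_scalar(void) { /* e.g. __aes_arm64_encrypt(), always safe */ }

	static void op_dispatch(void)
	{
		/* Branch is patched to a no-op once the key is enabled. */
		if (static_branch_likely(&have_fast_path) &&
		    likely(may_use_simd()))
			op_fast();	/* real code wraps this in scoped_ksimd() */
		else
			op_scalar();
	}

	static void op_init(void)
	{
		if (cpu_have_named_feature(AES))	/* arm64 feature check */
			static_branch_enable(&have_fast_path);
	}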