Merge tag 'libcrypto-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull crypto library updates from Eric Biggers:

 - Add support for verifying ML-DSA signatures.

   ML-DSA (Module-Lattice-Based Digital Signature Algorithm) is a
   recently-standardized post-quantum (quantum-resistant) signature
   algorithm. It was known as Dilithium pre-standardization.

   The first use case in the kernel will be module signing. But there
   are also other users of RSA and ECDSA signatures in the kernel that
   might want to upgrade to ML-DSA eventually.

 - Improve the AES library:

     - Make the AES key expansion and single block encryption and
       decryption functions use the architecture-optimized AES code.
       Enable these optimizations by default.

     - Support preparing an AES key for encryption-only, using about
       half as much memory as a bidirectional key.

     - Replace the existing two generic implementations of AES with a
       single one.

 - Simplify how Adiantum message hashing is implemented. Remove the
   "nhpoly1305" crypto_shash in favor of direct lib/crypto/ support for
   NH hashing, and enable optimizations by default.

* tag 'libcrypto-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux: (53 commits)
  lib/crypto: mldsa: Clarify the documentation for mldsa_verify() slightly
  lib/crypto: aes: Drop 'volatile' from aes_sbox and aes_inv_sbox
  lib/crypto: aes: Remove old AES en/decryption functions
  lib/crypto: aesgcm: Use new AES library API
  lib/crypto: aescfb: Use new AES library API
  crypto: omap - Use new AES library API
  crypto: inside-secure - Use new AES library API
  crypto: drbg - Use new AES library API
  crypto: crypto4xx - Use new AES library API
  crypto: chelsio - Use new AES library API
  crypto: ccp - Use new AES library API
  crypto: x86/aes-gcm - Use new AES library API
  crypto: arm64/ghash - Use new AES library API
  crypto: arm/ghash - Use new AES library API
  staging: rtl8723bs: core: Use new AES library API
  net: phy: mscc: macsec: Use new AES library API
  chelsio: Use new AES library API
  Bluetooth: SMP: Use new AES library API
  crypto: x86/aes - Remove the superseded AES-NI crypto_cipher
  lib/crypto: x86/aes: Add AES-NI optimization
  ...
This commit is contained in:
Linus Torvalds
2026-02-10 08:31:09 -08:00
141 changed files with 6668 additions and 5264 deletions

View File

@@ -11,6 +11,18 @@ config CRYPTO_LIB_UTILS
config CRYPTO_LIB_AES
tristate
config CRYPTO_LIB_AES_ARCH
bool
depends on CRYPTO_LIB_AES && !UML && !KMSAN
default y if ARM
default y if ARM64
default y if PPC && (SPE || (PPC64 && VSX))
default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
default y if X86
config CRYPTO_LIB_AESCFB
tristate
select CRYPTO_LIB_AES
@@ -101,6 +113,26 @@ config CRYPTO_LIB_MD5_ARCH
default y if PPC
default y if SPARC64
config CRYPTO_LIB_MLDSA
tristate
select CRYPTO_LIB_SHA3
help
The ML-DSA library functions. Select this if your module uses any of
the functions from <crypto/mldsa.h>.
config CRYPTO_LIB_NH
tristate
help
Implementation of the NH almost-universal hash function, specifically
the variant of NH used in Adiantum.
config CRYPTO_LIB_NH_ARCH
bool
depends on CRYPTO_LIB_NH && !UML && !KMSAN
default y if ARM && KERNEL_MODE_NEON
default y if ARM64 && KERNEL_MODE_NEON
default y if X86_64
config CRYPTO_LIB_POLY1305
tristate
help

View File

@@ -15,8 +15,47 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o
libcryptoutils-y := memneq.o utils.o
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
################################################################################
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
ifeq ($(CONFIG_CRYPTO_LIB_AES_ARCH),y)
CFLAGS_aes.o += -I$(src)/$(SRCARCH)
libaes-$(CONFIG_ARM) += arm/aes-cipher-core.o
ifeq ($(CONFIG_ARM64),y)
libaes-y += arm64/aes-cipher-core.o
libaes-$(CONFIG_KERNEL_MODE_NEON) += arm64/aes-ce-core.o
endif
ifeq ($(CONFIG_PPC),y)
ifeq ($(CONFIG_SPE),y)
libaes-y += powerpc/aes-spe-core.o \
powerpc/aes-spe-keys.o \
powerpc/aes-spe-modes.o \
powerpc/aes-tab-4k.o
else
libaes-y += powerpc/aesp8-ppc.o
aes-perlasm-flavour-y := linux-ppc64
aes-perlasm-flavour-$(CONFIG_PPC64_ELF_ABI_V2) := linux-ppc64-elfv2
aes-perlasm-flavour-$(CONFIG_CPU_LITTLE_ENDIAN) := linux-ppc64le
quiet_cmd_perlasm_aes = PERLASM $@
cmd_perlasm_aes = $(PERL) $< $(aes-perlasm-flavour-y) $@
# Use if_changed instead of cmd, in case the flavour changed.
$(obj)/powerpc/aesp8-ppc.S: $(src)/powerpc/aesp8-ppc.pl FORCE
$(call if_changed,perlasm_aes)
targets += powerpc/aesp8-ppc.S
OBJECT_FILES_NON_STANDARD_powerpc/aesp8-ppc.o := y
endif # !CONFIG_SPE
endif # CONFIG_PPC
libaes-$(CONFIG_RISCV) += riscv/aes-riscv64-zvkned.o
libaes-$(CONFIG_SPARC) += sparc/aes_asm.o
libaes-$(CONFIG_X86) += x86/aes-aesni.o
endif # CONFIG_CRYPTO_LIB_AES_ARCH
################################################################################
obj-$(CONFIG_CRYPTO_LIB_AESCFB) += libaescfb.o
libaescfb-y := aescfb.o
@@ -126,6 +165,22 @@ endif # CONFIG_CRYPTO_LIB_MD5_ARCH
################################################################################
obj-$(CONFIG_CRYPTO_LIB_MLDSA) += libmldsa.o
libmldsa-y := mldsa.o
################################################################################
obj-$(CONFIG_CRYPTO_LIB_NH) += libnh.o
libnh-y := nh.o
ifeq ($(CONFIG_CRYPTO_LIB_NH_ARCH),y)
CFLAGS_nh.o += -I$(src)/$(SRCARCH)
libnh-$(CONFIG_ARM) += arm/nh-neon-core.o
libnh-$(CONFIG_ARM64) += arm64/nh-neon-core.o
libnh-$(CONFIG_X86) += x86/nh-sse2.o x86/nh-avx2.o
endif
################################################################################
obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
libpoly1305-y := poly1305.o
ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)

View File

@@ -1,19 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017-2019 Linaro Ltd <ard.biesheuvel@linaro.org>
* Copyright 2026 Google LLC
*/
#include <crypto/aes.h>
#include <linux/cache.h>
#include <linux/crypto.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/unaligned.h>
/*
* Emit the sbox as volatile const to prevent the compiler from doing
* constant folding on sbox references involving fixed indexes.
*/
static volatile const u8 ____cacheline_aligned aes_sbox[] = {
static const u8 ____cacheline_aligned aes_sbox[] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
@@ -48,7 +46,7 @@ static volatile const u8 ____cacheline_aligned aes_sbox[] = {
0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
};
static volatile const u8 ____cacheline_aligned aes_inv_sbox[] = {
static const u8 ____cacheline_aligned aes_inv_sbox[] = {
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -89,6 +87,110 @@ extern const u8 crypto_aes_inv_sbox[256] __alias(aes_inv_sbox);
EXPORT_SYMBOL(crypto_aes_sbox);
EXPORT_SYMBOL(crypto_aes_inv_sbox);
/* aes_enc_tab[i] contains MixColumn([SubByte(i), 0, 0, 0]). */
const u32 ____cacheline_aligned aes_enc_tab[256] = {
0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6,
0xb16f6fde, 0x54c5c591, 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56,
0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec, 0x45caca8f, 0x9d82821f,
0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb,
0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453,
0x967272e4, 0x5bc0c09b, 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c,
0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83, 0x5c343468, 0xf4a5a551,
0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a,
0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637,
0x0f05050a, 0xb59a9a2f, 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df,
0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea, 0x1b090912, 0x9e83831d,
0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b,
0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd,
0x712f2f5e, 0x97848413, 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1,
0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6, 0xbe6a6ad4, 0x46cbcb8d,
0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85,
0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a,
0x55333366, 0x94858511, 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe,
0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b, 0xf35151a2, 0xfea3a35d,
0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1,
0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142, 0x30101020, 0x1affffe5,
0x0ef3f3fd, 0x6dd2d2bf, 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3,
0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e, 0x57c4c493, 0xf2a7a755,
0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6,
0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54,
0xab90903b, 0x8388880b, 0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428,
0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad, 0x3be0e0db, 0x56323264,
0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8,
0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531,
0x37e4e4d3, 0x8b7979f2, 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda,
0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949, 0xb46c6cd8, 0xfa5656ac,
0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810,
0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657,
0xc7b4b473, 0x51c6c697, 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e,
0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f, 0x907070e0, 0x423e3e7c,
0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c,
0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199,
0x271d1d3a, 0xb99e9e27, 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122,
0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433, 0xb69b9b2d, 0x221e1e3c,
0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5,
0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7,
0xc6424284, 0xb86868d0, 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e,
0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c,
};
EXPORT_SYMBOL(aes_enc_tab);
/* aes_dec_tab[i] contains InvMixColumn([InvSubByte(i), 0, 0, 0]). */
const u32 ____cacheline_aligned aes_dec_tab[256] = {
0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f,
0xab58faac, 0x9303e34b, 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5,
0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, 0x495ab1de, 0x671bba25,
0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b,
0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458,
0x2969e049, 0x44c8c98e, 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927,
0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d, 0x184adf63, 0x82311ae5,
0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9,
0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72,
0x578f1fe3, 0x2aab5566, 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3,
0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed, 0x2b1ccf8a, 0x92b479a7,
0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4,
0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040,
0x069f715e, 0x51106ebd, 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d,
0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060, 0x24fb9819, 0x97e9bdd6,
0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879,
0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32,
0xac70111e, 0x4e725a6c, 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36,
0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624, 0xb1670a0c, 0x0fe75793,
0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c,
0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2,
0xb9a8b62d, 0xc8a91e14, 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3,
0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b, 0x7629438b, 0xdcc623cb,
0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684,
0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc,
0xec52860d, 0xd0e3c177, 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947,
0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322, 0xc74e4987, 0xc1d138d9,
0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f,
0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890,
0x5ef7392e, 0xf5afc382, 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf,
0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb, 0x097826cd, 0xf418596e,
0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef,
0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a,
0x3094a5c6, 0xc066a235, 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733,
0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117, 0x8dd64d76, 0x4db0ef43,
0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546,
0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92,
0x335610e9, 0x1347d66d, 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb,
0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a, 0x59dfd29c, 0x3f73f255,
0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478,
0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc,
0x8b493c28, 0x41950dff, 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664,
0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0,
};
EXPORT_SYMBOL(aes_dec_tab);
/* Prefetch data into L1 cache. @mem should be cacheline-aligned. */
static __always_inline void aes_prefetch(const void *mem, size_t len)
{
/*
 * Touch one byte per cacheline through a volatile pointer so the
 * compiler cannot elide the loads.  This pulls the whole table into the
 * D-cache before any secret-dependent lookups are done.
 */
for (size_t i = 0; i < len; i += L1_CACHE_BYTES)
*(volatile const u8 *)(mem + i);
/* Compiler barrier: keep the prefetch reads before later table loads. */
barrier();
}
static u32 mul_by_x(u32 w)
{
u32 x = w & 0x7f7f7f7f;
@@ -145,22 +247,6 @@ static u32 inv_mix_columns(u32 x)
return mix_columns(x ^ y ^ ror32(y, 16));
}
static __always_inline u32 subshift(u32 in[], int pos)
{
return (aes_sbox[in[pos] & 0xff]) ^
(aes_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^
(aes_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
}
static __always_inline u32 inv_subshift(u32 in[], int pos)
{
return (aes_inv_sbox[in[pos] & 0xff]) ^
(aes_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^
(aes_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
}
static u32 subw(u32 in)
{
return (aes_sbox[in & 0xff]) ^
@@ -169,38 +255,17 @@ static u32 subw(u32 in)
(aes_sbox[(in >> 24) & 0xff] << 24);
}
/**
* aes_expandkey - Expands the AES key as described in FIPS-197
* @ctx: The location where the computed key will be stored.
* @in_key: The supplied key.
* @key_len: The length of the supplied key.
*
* Returns 0 on success. The function fails only if an invalid key size (or
* pointer) is supplied.
* The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes
* key schedule plus a 16 bytes key which is used before the first round).
* The decryption key is prepared for the "Equivalent Inverse Cipher" as
* described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is
* for the initial combination, the second slot for the first round and so on.
*/
int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
static void aes_expandkey_generic(u32 rndkeys[], u32 *inv_rndkeys,
const u8 *in_key, int key_len)
{
u32 kwords = key_len / sizeof(u32);
u32 rc, i, j;
int err;
err = aes_check_keylen(key_len);
if (err)
return err;
ctx->key_length = key_len;
for (i = 0; i < kwords; i++)
ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
rndkeys[i] = get_unaligned_le32(&in_key[i * sizeof(u32)]);
for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rki = &rndkeys[i * kwords];
u32 *rko = rki + kwords;
rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
@@ -229,129 +294,239 @@ int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
* the Inverse Mix Columns transformation to all but the first and
* the last one.
*/
ctx->key_dec[0] = ctx->key_enc[key_len + 24];
ctx->key_dec[1] = ctx->key_enc[key_len + 25];
ctx->key_dec[2] = ctx->key_enc[key_len + 26];
ctx->key_dec[3] = ctx->key_enc[key_len + 27];
if (inv_rndkeys) {
inv_rndkeys[0] = rndkeys[key_len + 24];
inv_rndkeys[1] = rndkeys[key_len + 25];
inv_rndkeys[2] = rndkeys[key_len + 26];
inv_rndkeys[3] = rndkeys[key_len + 27];
for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]);
ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
inv_rndkeys[i] = inv_mix_columns(rndkeys[j]);
inv_rndkeys[i + 1] = inv_mix_columns(rndkeys[j + 1]);
inv_rndkeys[i + 2] = inv_mix_columns(rndkeys[j + 2]);
inv_rndkeys[i + 3] = inv_mix_columns(rndkeys[j + 3]);
}
inv_rndkeys[i] = rndkeys[0];
inv_rndkeys[i + 1] = rndkeys[1];
inv_rndkeys[i + 2] = rndkeys[2];
inv_rndkeys[i + 3] = rndkeys[3];
}
}
ctx->key_dec[i] = ctx->key_enc[0];
ctx->key_dec[i + 1] = ctx->key_enc[1];
ctx->key_dec[i + 2] = ctx->key_enc[2];
ctx->key_dec[i + 3] = ctx->key_enc[3];
/**
 * aes_expandkey - Expand the AES key as described in FIPS-197
 * @ctx: The location where the computed key schedule will be stored.
 * @in_key: The supplied AES key.
 * @key_len: The length of @in_key in bytes: 16, 24, or 32.
 *
 * Expands both the encryption round keys (ctx->key_enc) and the decryption
 * round keys for the "Equivalent Inverse Cipher" (ctx->key_dec).
 *
 * Return: 0 on success, or -EINVAL if @key_len is not a valid AES key length.
 */
int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
{
if (aes_check_keylen(key_len) != 0)
return -EINVAL;
ctx->key_length = key_len;
aes_expandkey_generic(ctx->key_enc, ctx->key_dec, in_key, key_len);
return 0;
}
EXPORT_SYMBOL(aes_expandkey);
/**
* aes_encrypt - Encrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the ciphertext
* @in: Buffer containing the plaintext
*/
void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
static __always_inline u32 enc_quarterround(const u32 w[4], int i, u32 rk)
{
const u32 *rkp = ctx->key_enc + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
return rk ^ aes_enc_tab[(u8)w[i]] ^
rol32(aes_enc_tab[(u8)(w[(i + 1) % 4] >> 8)], 8) ^
rol32(aes_enc_tab[(u8)(w[(i + 2) % 4] >> 16)], 16) ^
rol32(aes_enc_tab[(u8)(w[(i + 3) % 4] >> 24)], 24);
}
st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
/*
 * Compute output column @i of the *final* encryption round (ShiftRows,
 * SubBytes, AddRoundKey -- no MixColumns).  Since aes_enc_tab[j] is
 * MixColumn([SubByte(j), 0, 0, 0]), bits 8..15 of each entry equal
 * SubByte(j) itself; the 0x0000ff00 masks and shifts recover the plain
 * S-box output without a separate sbox lookup (keeping all accesses in the
 * already-prefetched aes_enc_tab).
 */
static __always_inline u32 enclast_quarterround(const u32 w[4], int i, u32 rk)
{
return rk ^ ((aes_enc_tab[(u8)w[i]] & 0x0000ff00) >> 8) ^
(aes_enc_tab[(u8)(w[(i + 1) % 4] >> 8)] & 0x0000ff00) ^
((aes_enc_tab[(u8)(w[(i + 2) % 4] >> 16)] & 0x0000ff00) << 8) ^
((aes_enc_tab[(u8)(w[(i + 3) % 4] >> 24)] & 0x0000ff00) << 16);
}
static void __maybe_unused aes_encrypt_generic(const u32 rndkeys[], int nrounds,
u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
const u32 *rkp = rndkeys;
int n = nrounds - 1;
u32 w[4];
w[0] = get_unaligned_le32(&in[0]) ^ *rkp++;
w[1] = get_unaligned_le32(&in[4]) ^ *rkp++;
w[2] = get_unaligned_le32(&in[8]) ^ *rkp++;
w[3] = get_unaligned_le32(&in[12]) ^ *rkp++;
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
* Prefetch the table before doing data and key-dependent loads from it.
*
* This is intended only as a basic constant-time hardening measure that
* avoids interfering with performance too much. Its effectiveness is
* not guaranteed. For proper constant-time AES, a CPU that supports
* AES instructions should be used instead.
*/
st0[0] ^= aes_sbox[ 0] ^ aes_sbox[ 64] ^ aes_sbox[134] ^ aes_sbox[195];
st0[1] ^= aes_sbox[16] ^ aes_sbox[ 82] ^ aes_sbox[158] ^ aes_sbox[221];
st0[2] ^= aes_sbox[32] ^ aes_sbox[ 96] ^ aes_sbox[160] ^ aes_sbox[234];
st0[3] ^= aes_sbox[48] ^ aes_sbox[112] ^ aes_sbox[186] ^ aes_sbox[241];
aes_prefetch(aes_enc_tab, sizeof(aes_enc_tab));
for (round = 0;; round += 2, rkp += 8) {
st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
do {
u32 w0 = enc_quarterround(w, 0, *rkp++);
u32 w1 = enc_quarterround(w, 1, *rkp++);
u32 w2 = enc_quarterround(w, 2, *rkp++);
u32 w3 = enc_quarterround(w, 3, *rkp++);
if (round == rounds - 2)
break;
w[0] = w0;
w[1] = w1;
w[2] = w2;
w[3] = w3;
} while (--n);
st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(enclast_quarterround(w, 0, *rkp++), &out[0]);
put_unaligned_le32(enclast_quarterround(w, 1, *rkp++), &out[4]);
put_unaligned_le32(enclast_quarterround(w, 2, *rkp++), &out[8]);
put_unaligned_le32(enclast_quarterround(w, 3, *rkp++), &out[12]);
}
put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
/*
 * Compute output column @i of a regular decryption round using aes_dec_tab
 * (each entry is InvMixColumn([InvSubByte(j), 0, 0, 0])), then XOR in the
 * round key @rk.  Note the (i + 3), (i + 2), (i + 1) source columns: the
 * inverse ShiftRows rotates rows in the opposite direction from the
 * encryption quarterround.
 */
static __always_inline u32 dec_quarterround(const u32 w[4], int i, u32 rk)
{
return rk ^ aes_dec_tab[(u8)w[i]] ^
rol32(aes_dec_tab[(u8)(w[(i + 3) % 4] >> 8)], 8) ^
rol32(aes_dec_tab[(u8)(w[(i + 2) % 4] >> 16)], 16) ^
rol32(aes_dec_tab[(u8)(w[(i + 1) % 4] >> 24)], 24);
}
/*
 * Compute output column @i of the *final* decryption round, which omits
 * InvMixColumns.  Unlike dec_quarterround() this must use the raw inverse
 * S-box (aes_inv_sbox) directly, since aes_dec_tab entries have the
 * InvMixColumn already folded in.
 */
static __always_inline u32 declast_quarterround(const u32 w[4], int i, u32 rk)
{
return rk ^ aes_inv_sbox[(u8)w[i]] ^
((u32)aes_inv_sbox[(u8)(w[(i + 3) % 4] >> 8)] << 8) ^
((u32)aes_inv_sbox[(u8)(w[(i + 2) % 4] >> 16)] << 16) ^
((u32)aes_inv_sbox[(u8)(w[(i + 1) % 4] >> 24)] << 24);
}
/*
 * Decrypt a single 16-byte AES block using the portable table-based C
 * implementation.  @inv_rndkeys is the "Equivalent Inverse Cipher" key
 * schedule produced by aes_expandkey_generic(); @nrounds is 10, 12, or 14.
 * __maybe_unused: when CONFIG_CRYPTO_LIB_AES_ARCH is enabled, the included
 * arch header may provide aes_decrypt_arch() without referencing this.
 */
static void __maybe_unused aes_decrypt_generic(const u32 inv_rndkeys[],
int nrounds,
u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
const u32 *rkp = inv_rndkeys;
int n = nrounds - 1;
u32 w[4];
/* Initial AddRoundKey: load the block little-endian and XOR key words. */
w[0] = get_unaligned_le32(&in[0]) ^ *rkp++;
w[1] = get_unaligned_le32(&in[4]) ^ *rkp++;
w[2] = get_unaligned_le32(&in[8]) ^ *rkp++;
w[3] = get_unaligned_le32(&in[12]) ^ *rkp++;
/*
 * Basic constant-time hardening: pull the table into the D-cache before
 * any data/key-dependent loads from it (see aes_encrypt_generic()).
 */
aes_prefetch(aes_dec_tab, sizeof(aes_dec_tab));
/* nrounds - 1 full rounds; all four columns read the previous state. */
do {
u32 w0 = dec_quarterround(w, 0, *rkp++);
u32 w1 = dec_quarterround(w, 1, *rkp++);
u32 w2 = dec_quarterround(w, 2, *rkp++);
u32 w3 = dec_quarterround(w, 3, *rkp++);
w[0] = w0;
w[1] = w1;
w[2] = w2;
w[3] = w3;
} while (--n);
/* The final round uses aes_inv_sbox, so prefetch that table too. */
aes_prefetch(aes_inv_sbox, sizeof(aes_inv_sbox));
put_unaligned_le32(declast_quarterround(w, 0, *rkp++), &out[0]);
put_unaligned_le32(declast_quarterround(w, 1, *rkp++), &out[4]);
put_unaligned_le32(declast_quarterround(w, 2, *rkp++), &out[8]);
put_unaligned_le32(declast_quarterround(w, 3, *rkp++), &out[12]);
}
/*
* Note: the aes_prepare*key_* names reflect the fact that the implementation
* might not actually expand the key. (The s390 code for example doesn't.)
* Where the key is expanded we use the more specific names aes_expandkey_*.
*
* aes_preparekey_arch() is passed an optional pointer 'inv_k' which points to
* the area to store the prepared decryption key. It will be NULL if the user
* is requesting encryption-only. aes_preparekey_arch() is also passed a valid
* 'key_len' and 'nrounds', corresponding to AES-128, AES-192, or AES-256.
*/
#ifdef CONFIG_CRYPTO_LIB_AES_ARCH
/* An arch-specific implementation of AES is available. Include it. */
#include "aes.h" /* $(SRCARCH)/aes.h */
#else
/* No arch-specific implementation of AES is available. Use generic code. */
/*
 * Generic fallback for aes_preparekey_arch(): expand the key with the
 * portable C key schedule.  @inv_k is NULL for encryption-only keys, in
 * which case no decryption schedule is computed.  @nrounds is unused here;
 * the generic expansion derives everything it needs from @key_len.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
union aes_invkey_arch *inv_k,
const u8 *in_key, int key_len, int nrounds)
{
aes_expandkey_generic(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
in_key, key_len);
}
/* Generic fallback: encrypt one block with the table-based C code. */
static void aes_encrypt_arch(const struct aes_enckey *key,
u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
}
/*
 * Generic fallback: decrypt one block with the table-based C code.  Takes
 * the bidirectional struct aes_key, since decryption needs inv_k.
 */
static void aes_decrypt_arch(const struct aes_key *key,
u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds, out, in);
}
#endif
/*
 * Common implementation of aes_preparekey() and aes_prepareenckey().
 * Validates the key length, records it along with the round count
 * (6 + key_len / 4 = 10/12/14 for AES-128/192/256), then hands off to the
 * arch hook.  @inv_k is NULL when the caller wants an encryption-only key.
 */
static int __aes_preparekey(struct aes_enckey *enc_key,
union aes_invkey_arch *inv_k,
const u8 *in_key, size_t key_len)
{
if (aes_check_keylen(key_len) != 0)
return -EINVAL;
enc_key->len = key_len;
enc_key->nrounds = 6 + key_len / 4;
aes_preparekey_arch(&enc_key->k, inv_k, in_key, key_len,
enc_key->nrounds);
return 0;
}
/*
 * Prepare a bidirectional (encrypt + decrypt) AES key.
 * Returns 0 on success or -EINVAL for an invalid key length.
 *
 * NOTE(review): the cast relies on struct aes_key beginning with the same
 * layout as struct aes_enckey -- confirm against the definitions in
 * <crypto/aes.h>.
 */
int aes_preparekey(struct aes_key *key, const u8 *in_key, size_t key_len)
{
return __aes_preparekey((struct aes_enckey *)key, &key->inv_k,
in_key, key_len);
}
EXPORT_SYMBOL(aes_preparekey);
/*
 * Prepare an encryption-only AES key.  Passing a NULL inverse-key pointer
 * skips the decryption schedule, using about half the memory of a
 * bidirectional struct aes_key.  Returns 0 or -EINVAL.
 */
int aes_prepareenckey(struct aes_enckey *key, const u8 *in_key, size_t key_len)
{
return __aes_preparekey(key, NULL, in_key, key_len);
}
EXPORT_SYMBOL(aes_prepareenckey);
/**
 * aes_encrypt - Encrypt a single AES block
 * @key: The prepared key; aes_encrypt_arg is declared in the header and
 *       presumably accepts both struct aes_key and struct aes_enckey
 *       callers -- TODO confirm against <crypto/aes.h>.
 * @out: Buffer to store the 16-byte ciphertext block.
 * @in: Buffer containing the 16-byte plaintext block.
 */
void aes_encrypt(aes_encrypt_arg key, u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
aes_encrypt_arch(key.enc_key, out, in);
}
EXPORT_SYMBOL(aes_encrypt);
/**
* aes_decrypt - Decrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the plaintext
* @in: Buffer containing the ciphertext
*/
void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
void aes_decrypt(const struct aes_key *key, u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE])
{
const u32 *rkp = ctx->key_dec + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
*/
st0[0] ^= aes_inv_sbox[ 0] ^ aes_inv_sbox[ 64] ^ aes_inv_sbox[129] ^ aes_inv_sbox[200];
st0[1] ^= aes_inv_sbox[16] ^ aes_inv_sbox[ 83] ^ aes_inv_sbox[150] ^ aes_inv_sbox[212];
st0[2] ^= aes_inv_sbox[32] ^ aes_inv_sbox[ 96] ^ aes_inv_sbox[160] ^ aes_inv_sbox[236];
st0[3] ^= aes_inv_sbox[48] ^ aes_inv_sbox[112] ^ aes_inv_sbox[187] ^ aes_inv_sbox[247];
for (round = 0;; round += 2, rkp += 8) {
st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
if (round == rounds - 2)
break;
st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
aes_decrypt_arch(key, out, in);
}
EXPORT_SYMBOL(aes_decrypt);
MODULE_DESCRIPTION("Generic AES library");
#ifdef aes_mod_init_arch
/*
 * If the arch code (included via aes.h) defines aes_mod_init_arch, run it
 * at subsys_initcall time, e.g. to detect CPU features and select the
 * fastest implementation.
 */
static int __init aes_mod_init(void)
{
aes_mod_init_arch();
return 0;
}
subsys_initcall(aes_mod_init);
/* Empty exit function so the module can be unloaded when built as one. */
static void __exit aes_mod_exit(void)
{
}
module_exit(aes_mod_exit);
#endif
MODULE_DESCRIPTION("AES block cipher");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_AUTHOR("Eric Biggers <ebiggers@kernel.org>");
MODULE_LICENSE("GPL v2");

View File

@@ -11,7 +11,7 @@
#include <linux/module.h>
#include <asm/irqflags.h>
static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
static void aescfb_encrypt_block(const struct aes_enckey *key, void *dst,
const void *src)
{
unsigned long flags;
@@ -25,27 +25,27 @@ static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
* interrupts disabled.
*/
local_irq_save(flags);
aes_encrypt(ctx, dst, src);
aes_encrypt(key, dst, src);
local_irq_restore(flags);
}
/**
* aescfb_encrypt - Perform AES-CFB encryption on a block of data
*
* @ctx: The AES-CFB key schedule
* @key: The AES-CFB key schedule
* @dst: Pointer to the ciphertext output buffer
* @src: Pointer the plaintext (may equal @dst for encryption in place)
* @len: The size in bytes of the plaintext and ciphertext.
* @iv: The initialization vector (IV) to use for this block of data
*/
void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
void aescfb_encrypt(const struct aes_enckey *key, u8 *dst, const u8 *src,
int len, const u8 iv[AES_BLOCK_SIZE])
{
u8 ks[AES_BLOCK_SIZE];
const u8 *v = iv;
while (len > 0) {
aescfb_encrypt_block(ctx, ks, v);
aescfb_encrypt_block(key, ks, v);
crypto_xor_cpy(dst, src, ks, min(len, AES_BLOCK_SIZE));
v = dst;
@@ -61,18 +61,18 @@ EXPORT_SYMBOL(aescfb_encrypt);
/**
* aescfb_decrypt - Perform AES-CFB decryption on a block of data
*
* @ctx: The AES-CFB key schedule
* @key: The AES-CFB key schedule
* @dst: Pointer to the plaintext output buffer
* @src: Pointer the ciphertext (may equal @dst for decryption in place)
* @len: The size in bytes of the plaintext and ciphertext.
* @iv: The initialization vector (IV) to use for this block of data
*/
void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
void aescfb_decrypt(const struct aes_enckey *key, u8 *dst, const u8 *src,
int len, const u8 iv[AES_BLOCK_SIZE])
{
u8 ks[2][AES_BLOCK_SIZE];
aescfb_encrypt_block(ctx, ks[0], iv);
aescfb_encrypt_block(key, ks[0], iv);
for (int i = 0; len > 0; i ^= 1) {
if (len > AES_BLOCK_SIZE)
@@ -81,7 +81,7 @@ void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* performing the XOR, as that may update in place and
* overwrite the ciphertext.
*/
aescfb_encrypt_block(ctx, ks[!i], src);
aescfb_encrypt_block(key, ks[!i], src);
crypto_xor_cpy(dst, src, ks[i], min(len, AES_BLOCK_SIZE));
@@ -214,15 +214,15 @@ static struct {
static int __init libaescfb_init(void)
{
for (int i = 0; i < ARRAY_SIZE(aescfb_tv); i++) {
struct crypto_aes_ctx ctx;
struct aes_enckey key;
u8 buf[64];
if (aes_expandkey(&ctx, aescfb_tv[i].key, aescfb_tv[i].klen)) {
pr_err("aes_expandkey() failed on vector %d\n", i);
if (aes_prepareenckey(&key, aescfb_tv[i].key, aescfb_tv[i].klen)) {
pr_err("aes_prepareenckey() failed on vector %d\n", i);
return -ENODEV;
}
aescfb_encrypt(&ctx, buf, aescfb_tv[i].ptext, aescfb_tv[i].len,
aescfb_encrypt(&key, buf, aescfb_tv[i].ptext, aescfb_tv[i].len,
aescfb_tv[i].iv);
if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
pr_err("aescfb_encrypt() #1 failed on vector %d\n", i);
@@ -230,14 +230,14 @@ static int __init libaescfb_init(void)
}
/* decrypt in place */
aescfb_decrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
aescfb_decrypt(&key, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
if (memcmp(buf, aescfb_tv[i].ptext, aescfb_tv[i].len)) {
pr_err("aescfb_decrypt() failed on vector %d\n", i);
return -ENODEV;
}
/* encrypt in place */
aescfb_encrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
aescfb_encrypt(&key, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
pr_err("aescfb_encrypt() #2 failed on vector %d\n", i);

View File

@@ -12,7 +12,7 @@
#include <linux/module.h>
#include <asm/irqflags.h>
static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
static void aesgcm_encrypt_block(const struct aes_enckey *key, void *dst,
const void *src)
{
unsigned long flags;
@@ -26,7 +26,7 @@ static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
* effective when running with interrupts disabled.
*/
local_irq_save(flags);
aes_encrypt(ctx, dst, src);
aes_encrypt(key, dst, src);
local_irq_restore(flags);
}
@@ -49,12 +49,12 @@ int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key,
int ret;
ret = crypto_gcm_check_authsize(authsize) ?:
aes_expandkey(&ctx->aes_ctx, key, keysize);
aes_prepareenckey(&ctx->aes_key, key, keysize);
if (ret)
return ret;
ctx->authsize = authsize;
aesgcm_encrypt_block(&ctx->aes_ctx, &ctx->ghash_key, kin);
aesgcm_encrypt_block(&ctx->aes_key, &ctx->ghash_key, kin);
return 0;
}
@@ -97,7 +97,7 @@ static void aesgcm_mac(const struct aesgcm_ctx *ctx, const u8 *src, int src_len,
aesgcm_ghash(&ghash, &ctx->ghash_key, &tail, sizeof(tail));
ctr[3] = cpu_to_be32(1);
aesgcm_encrypt_block(&ctx->aes_ctx, buf, ctr);
aesgcm_encrypt_block(&ctx->aes_key, buf, ctr);
crypto_xor_cpy(authtag, buf, (u8 *)&ghash, ctx->authsize);
memzero_explicit(&ghash, sizeof(ghash));
@@ -119,7 +119,7 @@ static void aesgcm_crypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src,
* len', this cannot happen, so no explicit test is necessary.
*/
ctr[3] = cpu_to_be32(n++);
aesgcm_encrypt_block(&ctx->aes_ctx, buf, ctr);
aesgcm_encrypt_block(&ctx->aes_key, buf, ctr);
crypto_xor_cpy(dst, src, buf, min(len, AES_BLOCK_SIZE));
dst += AES_BLOCK_SIZE;

View File

@@ -0,0 +1,201 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
.align 5
rk .req r0
rounds .req r1
in .req r2
out .req r3
ttab .req ip
t0 .req lr
t1 .req r2
t2 .req r3
.macro __select, out, in, idx
.if __LINUX_ARM_ARCH__ < 7
and \out, \in, #0xff << (8 * \idx)
.else
ubfx \out, \in, #(8 * \idx), #8
.endif
.endm
.macro __load, out, in, idx, sz, op
.if __LINUX_ARM_ARCH__ < 7 && \idx > 0
ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz]
.else
ldr\op \out, [ttab, \in, lsl #\sz]
.endif
.endm
.macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
__select \out0, \in0, 0
__select t0, \in1, 1
__load \out0, \out0, 0, \sz, \op
__load t0, t0, 1, \sz, \op
.if \enc
__select \out1, \in1, 0
__select t1, \in2, 1
.else
__select \out1, \in3, 0
__select t1, \in0, 1
.endif
__load \out1, \out1, 0, \sz, \op
__select t2, \in2, 2
__load t1, t1, 1, \sz, \op
__load t2, t2, 2, \sz, \op
eor \out0, \out0, t0, ror #24
__select t0, \in3, 3
.if \enc
__select \t3, \in3, 2
__select \t4, \in0, 3
.else
__select \t3, \in1, 2
__select \t4, \in2, 3
.endif
__load \t3, \t3, 2, \sz, \op
__load t0, t0, 3, \sz, \op
__load \t4, \t4, 3, \sz, \op
.ifnb \oldcpsr
/*
* This is the final round and we're done with all data-dependent table
* lookups, so we can safely re-enable interrupts.
*/
restore_irqs \oldcpsr
.endif
eor \out1, \out1, t1, ror #24
eor \out0, \out0, t2, ror #16
ldm rk!, {t1, t2}
eor \out1, \out1, \t3, ror #16
eor \out0, \out0, t0, ror #8
eor \out1, \out1, \t4, ror #8
eor \out0, \out0, t1
eor \out1, \out1, t2
.endm
.macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
__hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
.endm
.macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
.endm
.macro do_crypt, round, ttab, ltab, bsz
push {r3-r11, lr}
// Load keys first, to reduce latency in case they're not cached yet.
ldm rk!, {r8-r11}
ldr r4, [in]
ldr r5, [in, #4]
ldr r6, [in, #8]
ldr r7, [in, #12]
#ifdef CONFIG_CPU_BIG_ENDIAN
rev_l r4, t0
rev_l r5, t0
rev_l r6, t0
rev_l r7, t0
#endif
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
mov_l ttab, \ttab
/*
* Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
* L1 cache, assuming cacheline size >= 32. This is a hardening measure
* intended to make cache-timing attacks more difficult. They may not
* be fully prevented, however; see the paper
* https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
* ("Cache-timing attacks on AES") for a discussion of the many
* difficulties involved in writing truly constant-time AES software.
*/
save_and_disable_irqs t0
.set i, 0
.rept 1024 / 128
ldr r8, [ttab, #i + 0]
ldr r9, [ttab, #i + 32]
ldr r10, [ttab, #i + 64]
ldr r11, [ttab, #i + 96]
.set i, i + 128
.endr
push {t0} // oldcpsr
tst rounds, #2
bne 1f
0: \round r8, r9, r10, r11, r4, r5, r6, r7
\round r4, r5, r6, r7, r8, r9, r10, r11
1: subs rounds, rounds, #4
\round r8, r9, r10, r11, r4, r5, r6, r7
bls 2f
\round r4, r5, r6, r7, r8, r9, r10, r11
b 0b
2: .ifb \ltab
add ttab, ttab, #1
.else
mov_l ttab, \ltab
// Prefetch inverse S-box for final round; see explanation above
.set i, 0
.rept 256 / 64
ldr t0, [ttab, #i + 0]
ldr t1, [ttab, #i + 32]
.set i, i + 64
.endr
.endif
pop {rounds} // oldcpsr
\round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
#ifdef CONFIG_CPU_BIG_ENDIAN
rev_l r4, t0
rev_l r5, t0
rev_l r6, t0
rev_l r7, t0
#endif
ldr out, [sp]
str r4, [out]
str r5, [out, #4]
str r6, [out, #8]
str r7, [out, #12]
pop {r3-r11, pc}
.align 3
.ltorg
.endm
ENTRY(__aes_arm_encrypt)
	// Forward cipher using the 'ft' table.  The ltab argument is left
	// blank, so do_crypt's .ifb path offsets ttab by one byte for the
	// final (S-box only) round; bsz=2 selects 32-bit table loads.
	do_crypt	fround, aes_enc_tab,, 2
ENDPROC(__aes_arm_encrypt)
.align 5
ENTRY(__aes_arm_decrypt)
	// Inverse cipher using the 'it' table; the final round indexes the
	// separate inverse S-box with byte loads (bsz=0).
	do_crypt	iround, aes_dec_tab, crypto_aes_inv_sbox, 0
ENDPROC(__aes_arm_decrypt)

56
lib/crypto/arm/aes.h Normal file
View File

@@ -0,0 +1,56 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* AES block cipher, optimized for ARM
*
* Copyright (C) 2017 Linaro Ltd.
* Copyright 2026 Google LLC
*/
asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds,
const u8 in[AES_BLOCK_SIZE],
u8 out[AES_BLOCK_SIZE]);
asmlinkage void __aes_arm_decrypt(const u32 inv_rk[], int rounds,
const u8 in[AES_BLOCK_SIZE],
u8 out[AES_BLOCK_SIZE]);
/*
 * Expand the AES key schedule for the scalar ARM implementation.
 *
 * Fills in the encryption round keys in @k and, when @inv_k is non-NULL,
 * the decryption round keys as well.  @nrounds is unused on this
 * architecture: the generic expansion takes only @key_len.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	aes_expandkey_generic(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
			      in_key, key_len);
}
/*
 * Encrypt one 16-byte block with the scalar ARM assembly implementation.
 *
 * The assembly uses 32-bit loads and stores, so on CPUs without efficient
 * unaligned access the block is bounced through a 4-byte-aligned stack
 * buffer whenever either pointer is misaligned.
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
	    IS_ALIGNED((uintptr_t)out | (uintptr_t)in, 4)) {
		__aes_arm_encrypt(key->k.rndkeys, key->nrounds, in, out);
	} else {
		u8 tmp[AES_BLOCK_SIZE] __aligned(4);

		memcpy(tmp, in, AES_BLOCK_SIZE);
		__aes_arm_encrypt(key->k.rndkeys, key->nrounds, tmp, tmp);
		memcpy(out, tmp, AES_BLOCK_SIZE);
	}
}
/*
 * Decrypt one 16-byte block with the scalar ARM assembly implementation.
 *
 * Mirrors aes_encrypt_arch(): the assembly uses 32-bit accesses, so data
 * is bounced through a 4-byte-aligned stack buffer when either pointer is
 * misaligned and the CPU lacks efficient unaligned access.
 */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
	    IS_ALIGNED((uintptr_t)out | (uintptr_t)in, 4)) {
		__aes_arm_decrypt(key->inv_k.inv_rndkeys, key->nrounds, in,
				  out);
	} else {
		u8 tmp[AES_BLOCK_SIZE] __aligned(4);

		memcpy(tmp, in, AES_BLOCK_SIZE);
		__aes_arm_decrypt(key->inv_k.inv_rndkeys, key->nrounds, tmp,
				  tmp);
		memcpy(out, tmp, AES_BLOCK_SIZE);
	}
}

View File

@@ -0,0 +1,116 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, NEON accelerated version
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
.fpu neon
KEY .req r0
MESSAGE .req r1
MESSAGE_LEN .req r2
HASH .req r3
PASS0_SUMS .req q0
PASS0_SUM_A .req d0
PASS0_SUM_B .req d1
PASS1_SUMS .req q1
PASS1_SUM_A .req d2
PASS1_SUM_B .req d3
PASS2_SUMS .req q2
PASS2_SUM_A .req d4
PASS2_SUM_B .req d5
PASS3_SUMS .req q3
PASS3_SUM_A .req d6
PASS3_SUM_B .req d7
K0 .req q4
K1 .req q5
K2 .req q6
K3 .req q7
T0 .req q8
T0_L .req d16
T0_H .req d17
T1 .req q9
T1_L .req d18
T1_H .req d19
T2 .req q10
T2_L .req d20
T2_H .req d21
T3 .req q11
T3_L .req d22
T3_H .req d23
.macro _nh_stride k0, k1, k2, k3
// Load next message stride
vld1.8 {T3}, [MESSAGE]!
// Load next key stride
vld1.32 {\k3}, [KEY]!
// Add message words to key words
vadd.u32 T0, T3, \k0
vadd.u32 T1, T3, \k1
vadd.u32 T2, T3, \k2
vadd.u32 T3, T3, \k3
// Multiply 32x32 => 64 and accumulate
vmlal.u32 PASS0_SUMS, T0_L, T0_H
vmlal.u32 PASS1_SUMS, T1_L, T1_H
vmlal.u32 PASS2_SUMS, T2_L, T2_H
vmlal.u32 PASS3_SUMS, T3_L, T3_H
.endm
/*
* void nh_neon(const u32 *key, const u8 *message, size_t message_len,
* __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
ENTRY(nh_neon)
	// Load the first three key strides and zero the pass accumulators.
	// K3 is loaded on demand inside _nh_stride.
	vld1.32		{K0,K1}, [KEY]!
	vmov.u64	PASS0_SUMS, #0
	vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	vmov.u64	PASS2_SUMS, #0
	vmov.u64	PASS3_SUMS, #0

	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Main loop: hash 64 message bytes (four 16-byte strides) per
	// iteration, rotating the key registers between strides.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4
.Lloop4_done:
	// Hash the remaining 0 to 3 strides (message_len % 16 == 0).
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1
.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)

33
lib/crypto/arm/nh.h Normal file
View File

@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* ARM32 accelerated implementation of NH
*
* Copyright 2018 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES]);
/*
 * NEON-accelerated NH.  Returns true if the hash was computed, or false
 * if the caller must fall back to the portable implementation (NEON
 * absent or currently unusable, or the message is shorter than 64 bytes).
 */
static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
		    __le64 hash[NH_NUM_PASSES])
{
	if (!static_branch_likely(&have_neon) || message_len < 64 ||
	    !may_use_simd())
		return false;

	scoped_ksimd()
		nh_neon(key, message, message_len, hash);
	return true;
}
#define nh_mod_init_arch nh_mod_init_arch
/* One-time init: enable the NEON NH path when the CPU advertises NEON. */
static void nh_mod_init_arch(void)
{
	if (elf_hwcap & HWCAP_NEON)
		static_branch_enable(&have_neon);
}

View File

@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.arch armv8-a+crypto
// void __aes_ce_encrypt(const u32 rk[], u8 out[16], const u8 in[16],
//			 int rounds)
// x0 = encryption round keys, x1 = out, x2 = in, w3 = number of rounds
SYM_FUNC_START(__aes_ce_encrypt)
	sub		w3, w3, #2	// first/last rounds handled outside loop
	ld1		{v0.16b}, [x2]	// load input block
	ld1		{v1.4s}, [x0], #16
	cmp		w3, #10
	bmi		0f		// AES-128 (rounds == 10)
	bne		3f		// AES-256 (rounds == 14)
	mov		v3.16b, v1.16b	// AES-192 (rounds == 12) falls through
	b		2f
0:	mov		v2.16b, v1.16b
	ld1		{v3.4s}, [x0], #16
1:	aese		v0.16b, v2.16b	// AddRoundKey + SubBytes + ShiftRows
	aesmc		v0.16b, v0.16b	// MixColumns
2:	ld1		{v1.4s}, [x0], #16
	aese		v0.16b, v3.16b
	aesmc		v0.16b, v0.16b
3:	ld1		{v2.4s}, [x0], #16
	subs		w3, w3, #3	// three rounds consumed per pass
	aese		v0.16b, v1.16b
	aesmc		v0.16b, v0.16b
	ld1		{v3.4s}, [x0], #16
	bpl		1b
	aese		v0.16b, v2.16b	// final round: no MixColumns
	eor		v0.16b, v0.16b, v3.16b	// xor with last round key
	st1		{v0.16b}, [x1]	// store output block
	ret
SYM_FUNC_END(__aes_ce_encrypt)
// void __aes_ce_decrypt(const u32 inv_rk[], u8 out[16], const u8 in[16],
//			 int rounds)
// x0 = decryption round keys, x1 = out, x2 = in, w3 = number of rounds
SYM_FUNC_START(__aes_ce_decrypt)
	sub		w3, w3, #2	// first/last rounds handled outside loop
	ld1		{v0.16b}, [x2]	// load input block
	ld1		{v1.4s}, [x0], #16
	cmp		w3, #10
	bmi		0f		// AES-128 (rounds == 10)
	bne		3f		// AES-256 (rounds == 14)
	mov		v3.16b, v1.16b	// AES-192 (rounds == 12) falls through
	b		2f
0:	mov		v2.16b, v1.16b
	ld1		{v3.4s}, [x0], #16
1:	aesd		v0.16b, v2.16b	// AddRoundKey + InvSubBytes + InvShiftRows
	aesimc		v0.16b, v0.16b	// InvMixColumns
2:	ld1		{v1.4s}, [x0], #16
	aesd		v0.16b, v3.16b
	aesimc		v0.16b, v0.16b
3:	ld1		{v2.4s}, [x0], #16
	subs		w3, w3, #3	// three rounds consumed per pass
	aesd		v0.16b, v1.16b
	aesimc		v0.16b, v0.16b
	ld1		{v3.4s}, [x0], #16
	bpl		1b
	aesd		v0.16b, v2.16b	// final round: no InvMixColumns
	eor		v0.16b, v0.16b, v3.16b	// xor with last round key
	st1		{v0.16b}, [x1]	// store output block
	ret
SYM_FUNC_END(__aes_ce_decrypt)
/*
* __aes_ce_sub() - use the aese instruction to perform the AES sbox
* substitution on each byte in 'input'
*/
// u32 __aes_ce_sub(u32 l) - returns the input word with the AES S-box
// applied to each of its four bytes (used by the key schedule).
SYM_FUNC_START(__aes_ce_sub)
	dup		v1.4s, w0	// broadcast input word to all lanes
	movi		v0.16b, #0	// zero state: aese reduces to
	aese		v0.16b, v1.16b	// SubBytes(ShiftRows(v1)); identical
					// lanes make ShiftRows a per-word no-op
	umov		w0, v0.s[0]	// return the substituted word
	ret
SYM_FUNC_END(__aes_ce_sub)
// void __aes_ce_invert(struct aes_block *out, const struct aes_block *in)
// out[x0] = InvMixColumns(in[x1]); used to derive decryption round keys
// for the Equivalent Inverse Cipher.
SYM_FUNC_START(__aes_ce_invert)
	ld1		{v0.4s}, [x1]
	aesimc		v1.16b, v0.16b
	st1		{v1.4s}, [x0]
	ret
SYM_FUNC_END(__aes_ce_invert)

View File

@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
rk .req x0
out .req x1
in .req x2
rounds .req x3
tt .req x2
.macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
.ifc \op\shift, b0
ubfiz \reg0, \in0, #2, #8
ubfiz \reg1, \in1e, #2, #8
.else
ubfx \reg0, \in0, #\shift, #8
ubfx \reg1, \in1e, #\shift, #8
.endif
/*
* AArch64 cannot do byte size indexed loads from a table containing
* 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
* valid instruction. So perform the shift explicitly first for the
* high bytes (the low byte is shifted implicitly by using ubfiz rather
* than ubfx above)
*/
.ifnc \op, b
ldr \reg0, [tt, \reg0, uxtw #2]
ldr \reg1, [tt, \reg1, uxtw #2]
.else
.if \shift > 0
lsl \reg0, \reg0, #2
lsl \reg1, \reg1, #2
.endif
ldrb \reg0, [tt, \reg0, uxtw]
ldrb \reg1, [tt, \reg1, uxtw]
.endif
.endm
.macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
ubfx \reg0, \in0, #\shift, #8
ubfx \reg1, \in1d, #\shift, #8
ldr\op \reg0, [tt, \reg0, uxtw #\sz]
ldr\op \reg1, [tt, \reg1, uxtw #\sz]
.endm
.macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
ldp \out0, \out1, [rk], #8
__pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0
__pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8
__pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16
__pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24
eor \out0, \out0, w12
eor \out1, \out1, w13
eor \out0, \out0, w14, ror #24
eor \out1, \out1, w15, ror #24
eor \out0, \out0, w16, ror #16
eor \out1, \out1, w17, ror #16
eor \out0, \out0, \t0, ror #8
eor \out1, \out1, \t1, ror #8
.endm
.macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
__hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
.endm
.macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
.endm
.macro do_crypt, round, ttab, ltab, bsz
ldp w4, w5, [in]
ldp w6, w7, [in, #8]
ldp w8, w9, [rk], #16
ldp w10, w11, [rk, #-8]
CPU_BE( rev w4, w4 )
CPU_BE( rev w5, w5 )
CPU_BE( rev w6, w6 )
CPU_BE( rev w7, w7 )
eor w4, w4, w8
eor w5, w5, w9
eor w6, w6, w10
eor w7, w7, w11
adr_l tt, \ttab
tbnz rounds, #1, 1f
0: \round w8, w9, w10, w11, w4, w5, w6, w7
\round w4, w5, w6, w7, w8, w9, w10, w11
1: subs rounds, rounds, #4
\round w8, w9, w10, w11, w4, w5, w6, w7
b.ls 3f
2: \round w4, w5, w6, w7, w8, w9, w10, w11
b 0b
3: adr_l tt, \ltab
\round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
CPU_BE( rev w4, w4 )
CPU_BE( rev w5, w5 )
CPU_BE( rev w6, w6 )
CPU_BE( rev w7, w7 )
stp w4, w5, [out]
stp w6, w7, [out, #8]
ret
.endm
SYM_FUNC_START(__aes_arm64_encrypt)
	// Forward cipher: 'ft' table for inner rounds; the final round reads
	// single S-box bytes at a one-byte offset into the same table.
	do_crypt	fround, aes_enc_tab, aes_enc_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)
.align 5
SYM_FUNC_START(__aes_arm64_decrypt)
	// Inverse cipher: 'it' table for inner rounds; the final round uses
	// the separate inverse S-box with byte loads (bsz=0).
	do_crypt	iround, aes_dec_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)

164
lib/crypto/arm64/aes.h Normal file
View File

@@ -0,0 +1,164 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* AES block cipher, optimized for ARM64
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
* Copyright 2026 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/unaligned.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
asmlinkage void __aes_arm64_encrypt(const u32 rk[], u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_arm64_decrypt(const u32 inv_rk[], u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_ce_encrypt(const u32 rk[], u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage void __aes_ce_decrypt(const u32 inv_rk[], u8 out[AES_BLOCK_SIZE],
const u8 in[AES_BLOCK_SIZE], int rounds);
asmlinkage u32 __aes_ce_sub(u32 l);
asmlinkage void __aes_ce_invert(struct aes_block *out,
const struct aes_block *in);
/*
* Expand an AES key using the crypto extensions if supported and usable or
* generic code otherwise. The expanded key format is compatible between the
* two cases. The outputs are @rndkeys (required) and @inv_rndkeys (optional).
*/
static void aes_expandkey_arm64(u32 rndkeys[], u32 *inv_rndkeys,
				const u8 *in_key, int key_len, int nrounds)
{
	/*
	 * The AES key schedule round constants
	 */
	static u8 const rcon[] = {
		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
	};
	u32 kwords = key_len / sizeof(u32);	/* 4, 6, or 8 key words */
	struct aes_block *key_enc, *key_dec;
	int i, j;

	/* Fall back to the generic C expansion when the Crypto Extensions
	 * are unavailable or NEON cannot be used in this context. */
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ||
	    !static_branch_likely(&have_aes) || unlikely(!may_use_simd())) {
		aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
		return;
	}

	/* The raw key words (little-endian) form the first round key(s). */
	for (i = 0; i < kwords; i++)
		rndkeys[i] = get_unaligned_le32(&in_key[i * sizeof(u32)]);

	scoped_ksimd() {
		for (i = 0; i < sizeof(rcon); i++) {
			u32 *rki = &rndkeys[i * kwords];	/* input stride */
			u32 *rko = rki + kwords;		/* output stride */

			/* g() transform: RotWord + SubWord (via __aes_ce_sub)
			 * + round constant, then chain the xors. */
			rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
				 rcon[i] ^ rki[0];
			rko[1] = rko[0] ^ rki[1];
			rko[2] = rko[1] ^ rki[2];
			rko[3] = rko[2] ^ rki[3];

			if (key_len == AES_KEYSIZE_192) {
				if (i >= 7)
					break;
				rko[4] = rko[3] ^ rki[4];
				rko[5] = rko[4] ^ rki[5];
			} else if (key_len == AES_KEYSIZE_256) {
				if (i >= 6)
					break;
				/* AES-256 applies SubWord mid-stride too. */
				rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
				rko[5] = rko[4] ^ rki[5];
				rko[6] = rko[5] ^ rki[6];
				rko[7] = rko[6] ^ rki[7];
			}
		}

		/*
		 * Generate the decryption keys for the Equivalent Inverse
		 * Cipher.  This involves reversing the order of the round
		 * keys, and applying the Inverse Mix Columns transformation on
		 * all but the first and the last one.
		 */
		if (inv_rndkeys) {
			key_enc = (struct aes_block *)rndkeys;
			key_dec = (struct aes_block *)inv_rndkeys;
			j = nrounds;

			key_dec[0] = key_enc[j];
			for (i = 1, j--; j > 0; i++, j--)
				__aes_ce_invert(key_dec + i, key_enc + j);
			key_dec[i] = key_enc[0];
		}
	}
}
/*
 * Expand the AES key schedule for ARM64.
 *
 * Fills in the encryption round keys in @k and, when @inv_k is non-NULL,
 * the decryption round keys for the Equivalent Inverse Cipher.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	aes_expandkey_arm64(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
			    in_key, key_len, nrounds);
}
/*
* This is here temporarily until the remaining AES mode implementations are
* migrated from arch/arm64/crypto/ to lib/crypto/arm64/.
*/
/*
 * Expand @in_key into the legacy crypto_aes_ctx layout, generating both
 * the encryption and decryption round keys.
 *
 * Return: 0 on success, or -EINVAL if @key_len is not a valid AES key
 * length.
 */
int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
		     unsigned int key_len)
{
	unsigned int nrounds = 6 + key_len / 4;

	if (aes_check_keylen(key_len))
		return -EINVAL;

	ctx->key_length = key_len;
	aes_expandkey_arm64(ctx->key_enc, ctx->key_dec, in_key, key_len,
			    nrounds);
	return 0;
}
EXPORT_SYMBOL(ce_aes_expandkey);
/*
 * Encrypt one 16-byte block.  Uses the ARMv8 Crypto Extensions when the
 * CPU has them and NEON is currently usable; otherwise falls back to the
 * scalar table-based assembly implementation.
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ||
	    !static_branch_likely(&have_aes) || !may_use_simd()) {
		__aes_arm64_encrypt(key->k.rndkeys, out, in, key->nrounds);
		return;
	}

	scoped_ksimd()
		__aes_ce_encrypt(key->k.rndkeys, out, in, key->nrounds);
}
/*
 * Decrypt one 16-byte block.  Uses the ARMv8 Crypto Extensions when the
 * CPU has them and NEON is currently usable; otherwise falls back to the
 * scalar table-based assembly implementation.
 */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ||
	    !static_branch_likely(&have_aes) || !may_use_simd()) {
		__aes_arm64_decrypt(key->inv_k.inv_rndkeys, out, in,
				    key->nrounds);
		return;
	}

	scoped_ksimd()
		__aes_ce_decrypt(key->inv_k.inv_rndkeys, out, in,
				 key->nrounds);
}
#ifdef CONFIG_KERNEL_MODE_NEON
#define aes_mod_init_arch aes_mod_init_arch
/* One-time init: enable the Crypto Extensions paths when the CPU has AES. */
static void aes_mod_init_arch(void)
{
	if (cpu_have_named_feature(AES))
		static_branch_enable(&have_aes);
}
#endif /* CONFIG_KERNEL_MODE_NEON */

View File

@@ -0,0 +1,103 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, ARM64 NEON accelerated version
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
KEY .req x0
MESSAGE .req x1
MESSAGE_LEN .req x2
HASH .req x3
PASS0_SUMS .req v0
PASS1_SUMS .req v1
PASS2_SUMS .req v2
PASS3_SUMS .req v3
K0 .req v4
K1 .req v5
K2 .req v6
K3 .req v7
T0 .req v8
T1 .req v9
T2 .req v10
T3 .req v11
T4 .req v12
T5 .req v13
T6 .req v14
T7 .req v15
.macro _nh_stride k0, k1, k2, k3
// Load next message stride
ld1 {T3.16b}, [MESSAGE], #16
// Load next key stride
ld1 {\k3\().4s}, [KEY], #16
// Add message words to key words
add T0.4s, T3.4s, \k0\().4s
add T1.4s, T3.4s, \k1\().4s
add T2.4s, T3.4s, \k2\().4s
add T3.4s, T3.4s, \k3\().4s
// Multiply 32x32 => 64 and accumulate
mov T4.d[0], T0.d[1]
mov T5.d[0], T1.d[1]
mov T6.d[0], T2.d[1]
mov T7.d[0], T3.d[1]
umlal PASS0_SUMS.2d, T0.2s, T4.2s
umlal PASS1_SUMS.2d, T1.2s, T5.2s
umlal PASS2_SUMS.2d, T2.2s, T6.2s
umlal PASS3_SUMS.2d, T3.2s, T7.2s
.endm
/*
* void nh_neon(const u32 *key, const u8 *message, size_t message_len,
* __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
SYM_FUNC_START(nh_neon)
	// Load the first three key strides and zero the pass accumulators.
	// K3 is loaded on demand inside _nh_stride.
	ld1		{K0.4s,K1.4s}, [KEY], #32
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Main loop: hash 64 message bytes (four 16-byte strides) per
	// iteration, rotating the key registers between strides.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4
.Lloop4_done:
	// Hash the remaining 0 to 3 strides (message_len % 16 == 0).
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1
.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)

34
lib/crypto/arm64/nh.h Normal file
View File

@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* ARM64 accelerated implementation of NH
*
* Copyright 2018 Google LLC
*/
#include <asm/hwcap.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES]);
/*
 * NEON-accelerated NH.  Returns true if the hash was computed, or false
 * if the caller must fall back to the portable implementation (NEON
 * absent or currently unusable, or the message is shorter than 64 bytes).
 */
static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
		    __le64 hash[NH_NUM_PASSES])
{
	if (!static_branch_likely(&have_neon) || message_len < 64 ||
	    !may_use_simd())
		return false;

	scoped_ksimd()
		nh_neon(key, message, message_len, hash);
	return true;
}
#define nh_mod_init_arch nh_mod_init_arch
/* One-time init: enable the NEON NH path when the CPU has Advanced SIMD. */
static void nh_mod_init_arch(void)
{
	if (cpu_have_named_feature(ASIMD))
		static_branch_enable(&have_neon);
}

458
lib/crypto/fips-mldsa.h Normal file
View File

@@ -0,0 +1,458 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/* ML-DSA test vector extracted from leancrypto */
#include <linux/fips.h>
static const u8 fips_test_mldsa65_signature[] __initconst __maybe_unused = {
0xda, 0xcf, 0x8d, 0x67, 0x59, 0x60, 0x6c, 0x39, 0x2d, 0x89, 0xb6, 0xa1,
0xf3, 0x8c, 0x70, 0xcf, 0x25, 0x86, 0x21, 0xa1, 0x9f, 0x20, 0x9e, 0xf5,
0xd2, 0xdd, 0xbd, 0x99, 0xfa, 0xe4, 0xab, 0x77, 0x31, 0x65, 0x18, 0xa1,
0xd1, 0x3f, 0x21, 0x70, 0x36, 0xe1, 0xf9, 0x5c, 0x28, 0xb6, 0x7d, 0x34,
0xae, 0x66, 0xc9, 0x1c, 0x8e, 0xc6, 0xf9, 0x45, 0x8c, 0xa9, 0xb2, 0xfb,
0x0f, 0x5b, 0xb8, 0xf9, 0xf5, 0xe2, 0x37, 0x79, 0x12, 0xda, 0xa7, 0x72,
0x9e, 0x0d, 0xf8, 0x88, 0x5b, 0x34, 0x49, 0x6c, 0xed, 0xa3, 0x7f, 0x86,
0xd3, 0xd9, 0x2f, 0x44, 0x08, 0x0d, 0xb7, 0xdb, 0x4a, 0xce, 0x02, 0x14,
0x02, 0xd6, 0x40, 0x75, 0xe3, 0xc0, 0x97, 0xfc, 0x6c, 0x6a, 0x88, 0x29,
0x0c, 0xe2, 0x3a, 0x2b, 0x28, 0x82, 0x8f, 0x27, 0x09, 0x69, 0x91, 0xc6,
0xc3, 0xb7, 0x07, 0x61, 0x86, 0x8d, 0x89, 0x8a, 0xd5, 0x00, 0x3b, 0x4b,
0xfc, 0x6f, 0xb3, 0x3f, 0x4c, 0x93, 0x31, 0xfc, 0x88, 0x53, 0x26, 0xea,
0xe5, 0x3a, 0xfc, 0xc1, 0x59, 0x16, 0xf0, 0xb7, 0xac, 0xde, 0x1e, 0xd8,
0x74, 0x85, 0x72, 0xd9, 0xbb, 0xbe, 0x76, 0x32, 0x25, 0x9d, 0x21, 0xbc,
0xfd, 0x8d, 0x32, 0xfe, 0xae, 0x24, 0xe5, 0x4a, 0xcc, 0x5d, 0x15, 0x23,
0xd3, 0x57, 0xe7, 0xa9, 0x2c, 0x31, 0xd7, 0xc5, 0x6b, 0x70, 0x6c, 0x22,
0x5a, 0x13, 0x1f, 0x76, 0x13, 0x78, 0x6f, 0xac, 0x42, 0x4c, 0x46, 0x81,
0xa2, 0x20, 0x91, 0x30, 0xed, 0xcb, 0x90, 0xfe, 0x3c, 0xa3, 0xc7, 0xb4,
0x1f, 0x21, 0x1d, 0x98, 0x74, 0x6a, 0x3e, 0xc8, 0xcc, 0xd2, 0x68, 0x87,
0x69, 0xa9, 0xdf, 0x50, 0xd5, 0x0a, 0x8e, 0x10, 0x54, 0xab, 0xea, 0x65,
0x2a, 0x52, 0xd7, 0x22, 0xae, 0x2f, 0x1e, 0xc3, 0x16, 0x58, 0x20, 0x18,
0x6d, 0x35, 0x46, 0x31, 0x43, 0x5d, 0x62, 0xfb, 0xb1, 0x47, 0x32, 0xfa,
0x14, 0xcc, 0x51, 0xa3, 0xcd, 0x99, 0x4f, 0x97, 0x0f, 0xca, 0x24, 0x93,
0x17, 0xea, 0xa3, 0xf3, 0x1f, 0xbe, 0xb5, 0xa3, 0xac, 0x80, 0xcc, 0x20,
0x3b, 0xa6, 0xd3, 0x32, 0x72, 0x4e, 0xd9, 0x25, 0xf9, 0xc2, 0x24, 0x15,
0xbd, 0x1e, 0x1e, 0x41, 0x8c, 0x18, 0x8c, 0x58, 0xe8, 0x75, 0x20, 0xff,
0xa3, 0xf4, 0xd4, 0xab, 0x75, 0x78, 0x4e, 0xbb, 0x7c, 0x94, 0x93, 0x28,
0x5b, 0x07, 0x3a, 0x3c, 0xc9, 0xf1, 0x55, 0x3e, 0x33, 0xed, 0xf8, 0x72,
0x55, 0xab, 0x5a, 0xea, 0xbe, 0x65, 0xfa, 0x81, 0x50, 0xc0, 0x9d, 0x2d,
0xfb, 0x04, 0x25, 0x7c, 0xb9, 0xee, 0xe2, 0xa3, 0x00, 0x44, 0xd3, 0x9d,
0xee, 0x4f, 0x80, 0x77, 0xfb, 0x26, 0x6b, 0x07, 0xd0, 0xff, 0x82, 0x39,
0x0e, 0x2b, 0x47, 0xa3, 0xe7, 0x3e, 0xc5, 0x4e, 0x15, 0x8a, 0x48, 0x28,
0xfb, 0xf7, 0xa4, 0x86, 0xfb, 0x77, 0x60, 0xcd, 0xc5, 0x68, 0x96, 0xd7,
0x4c, 0x3c, 0xf2, 0x51, 0x71, 0x79, 0x2e, 0x2e, 0x57, 0x10, 0xa7, 0xfc,
0xd1, 0xd4, 0x61, 0x71, 0x81, 0x85, 0x74, 0x09, 0x7d, 0x80, 0xd0, 0xc2,
0xe9, 0xff, 0xb7, 0x88, 0x53, 0x74, 0x1e, 0xb0, 0xca, 0x65, 0x48, 0x8e,
0xdb, 0x59, 0x3a, 0xcb, 0x80, 0xeb, 0xfd, 0xd2, 0xc9, 0x38, 0x43, 0xae,
0x76, 0xf2, 0xbb, 0x51, 0xb2, 0xcb, 0xe6, 0x85, 0x31, 0xb5, 0x62, 0xd4,
0x5e, 0x48, 0x08, 0xf1, 0x40, 0x5b, 0x16, 0x83, 0x5e, 0xa5, 0x9c, 0x6b,
0x91, 0x49, 0x44, 0xff, 0x3b, 0xa9, 0x2b, 0xf3, 0x06, 0x33, 0x9e, 0x6e,
0x3c, 0x66, 0x7e, 0x27, 0xa2, 0x59, 0x7b, 0xe3, 0xb6, 0xb4, 0x28, 0xeb,
0x93, 0x35, 0x87, 0xac, 0x0e, 0x0b, 0x7e, 0xbc, 0x35, 0x28, 0x72, 0x1f,
0x26, 0x59, 0xd0, 0x1f, 0x63, 0xe4, 0x86, 0x5d, 0x70, 0xf3, 0xa8, 0xa4,
0xb8, 0xcd, 0xb3, 0xf8, 0x8d, 0xaa, 0x41, 0xd2, 0xcc, 0x0b, 0x15, 0x66,
0x22, 0x83, 0x92, 0xe3, 0x0b, 0xf9, 0xea, 0xa0, 0x33, 0xa1, 0x4e, 0x92,
0xae, 0x81, 0x95, 0xa4, 0x58, 0x3f, 0xa9, 0x15, 0x52, 0xf9, 0xda, 0xb7,
0x10, 0x8d, 0xc6, 0xab, 0x77, 0xe9, 0xbe, 0xad, 0xc9, 0x3a, 0x6a, 0x8d,
0x92, 0x6c, 0x69, 0xff, 0x31, 0x49, 0x25, 0x04, 0xc8, 0x93, 0x6f, 0xc8,
0xe7, 0x60, 0x7a, 0x76, 0xb5, 0xc1, 0x07, 0xef, 0xa3, 0x39, 0xa6, 0xf2,
0x36, 0x04, 0xde, 0x3c, 0x4a, 0x4e, 0x96, 0xbd, 0x64, 0x26, 0x80, 0x01,
0x88, 0x47, 0xd2, 0xa4, 0x46, 0xcd, 0xe1, 0x30, 0x7f, 0xa3, 0x00, 0x11,
0x38, 0x55, 0xfa, 0xeb, 0x10, 0xeb, 0xa0, 0x65, 0x04, 0x09, 0xc8, 0xde,
0x9c, 0x73, 0xba, 0x0c, 0xbd, 0xd3, 0xa5, 0x84, 0x5e, 0xb9, 0x3b, 0xd4,
0x94, 0xbd, 0xa6, 0x53, 0xbe, 0x93, 0x69, 0x3e, 0xaa, 0x32, 0x31, 0x06,
0xc8, 0x1b, 0x4a, 0x48, 0xb5, 0x17, 0x85, 0xbf, 0x72, 0xec, 0xf5, 0x29,
0x8a, 0xd8, 0xeb, 0x99, 0x8b, 0x74, 0x84, 0x57, 0x8c, 0xe1, 0x85, 0x94,
0xa0, 0xbc, 0x7a, 0x14, 0xf0, 0xf4, 0x8b, 0x25, 0x37, 0x43, 0xa1, 0x34,
0x09, 0x71, 0xca, 0x5c, 0x9f, 0x08, 0x38, 0xd9, 0x9c, 0x0c, 0x0e, 0xcb,
0xe4, 0xad, 0x4b, 0x2a, 0x89, 0x67, 0xf8, 0x29, 0x6c, 0x69, 0x0e, 0x5d,
0xca, 0xfa, 0xa6, 0x6b, 0x0e, 0xb5, 0x94, 0x17, 0x71, 0xf0, 0xc9, 0xcd,
0x02, 0x1d, 0xa5, 0xd5, 0xc6, 0xa7, 0xbc, 0x5f, 0x6e, 0x67, 0x43, 0x68,
0xce, 0xac, 0x54, 0x81, 0x2a, 0x25, 0x22, 0x52, 0x35, 0xad, 0x7b, 0xd5,
0x06, 0x8c, 0x00, 0xfb, 0xca, 0xc4, 0x0a, 0x49, 0x1e, 0xc8, 0xeb, 0x77,
0xc1, 0x63, 0x23, 0x96, 0xbd, 0x35, 0xfa, 0x13, 0xae, 0xbf, 0x1d, 0x1e,
0x69, 0x8d, 0xb3, 0xe3, 0x07, 0xde, 0x4e, 0xd0, 0x12, 0xa9, 0xc3, 0x36,
0x30, 0x46, 0xef, 0x92, 0x76, 0x17, 0x8f, 0x10, 0xe7, 0xba, 0x99, 0x4b,
0xdf, 0xad, 0xb8, 0x11, 0x80, 0xdf, 0xe7, 0xfd, 0x80, 0x64, 0xf7, 0x2a,
0xac, 0x60, 0x2a, 0x54, 0x8f, 0x4f, 0xaf, 0xaf, 0x60, 0xf9, 0x67, 0x20,
0x80, 0x53, 0x5c, 0xb6, 0x81, 0xa6, 0x2a, 0x74, 0x2d, 0xc5, 0x74, 0x2a,
0x95, 0x26, 0x13, 0x17, 0x01, 0xdd, 0x31, 0xac, 0x5a, 0x05, 0xda, 0xde,
0xba, 0xf6, 0x37, 0x13, 0x8d, 0xe4, 0xa8, 0x93, 0x46, 0x9e, 0xa9, 0x82,
0x24, 0x7e, 0xc8, 0xda, 0x63, 0x89, 0xcd, 0x33, 0xc9, 0xf7, 0xf9, 0x71,
0x35, 0xe6, 0xa5, 0x5f, 0x6b, 0x3b, 0xbb, 0x0c, 0xe0, 0xa4, 0x0b, 0xe3,
0x29, 0xc0, 0xae, 0x8e, 0xce, 0x03, 0x09, 0x73, 0x0e, 0x1e, 0x9c, 0xe9,
0x59, 0xb6, 0x8b, 0x78, 0x67, 0x32, 0x8b, 0xf1, 0x93, 0xcc, 0x72, 0x1b,
0x6f, 0xa2, 0xf1, 0x04, 0x9c, 0xfa, 0x98, 0x02, 0xca, 0xdf, 0x35, 0x3c,
0x38, 0xac, 0xa8, 0xdb, 0x90, 0xae, 0xaa, 0xf9, 0x70, 0xfb, 0xed, 0xbd,
0xa6, 0x25, 0x14, 0x58, 0x09, 0x8a, 0x36, 0xaf, 0x41, 0x09, 0x19, 0xcb,
0xd3, 0x25, 0x5d, 0x0e, 0xe6, 0x20, 0x14, 0x71, 0x24, 0x79, 0x19, 0x55,
0xaf, 0x51, 0x5b, 0xa4, 0xc0, 0x93, 0x9e, 0xdd, 0x88, 0x31, 0x13, 0x96,
0xbf, 0xca, 0x0a, 0xd7, 0xbc, 0xc4, 0x00, 0xa1, 0x10, 0x2d, 0x92, 0x79,
0xf9, 0x14, 0xdb, 0xd2, 0xba, 0x74, 0xfa, 0xa8, 0xe5, 0x40, 0x14, 0xc2,
0x56, 0x3c, 0x7f, 0x50, 0x07, 0x60, 0x86, 0x93, 0x51, 0x2e, 0xf9, 0x70,
0x61, 0x70, 0x0e, 0xa4, 0x87, 0x75, 0xcc, 0x6c, 0x72, 0xb7, 0x68, 0x23,
0xb7, 0x3d, 0x76, 0xaf, 0x96, 0x9b, 0x4a, 0xe5, 0x12, 0x28, 0x4a, 0x8f,
0x79, 0x34, 0xff, 0xec, 0x92, 0xeb, 0x6b, 0xaf, 0xc9, 0xbd, 0xc1, 0x77,
0x07, 0xd0, 0xfa, 0x55, 0x57, 0x10, 0x0c, 0xad, 0x29, 0x2a, 0x79, 0xd6,
0x09, 0x9e, 0x7d, 0x18, 0xd4, 0xd6, 0xdd, 0x72, 0x1a, 0x8f, 0x24, 0x11,
0x70, 0xd2, 0x52, 0x36, 0x0f, 0x38, 0x79, 0x38, 0x4a, 0x02, 0x4f, 0x73,
0x2a, 0xaa, 0x6a, 0xb5, 0x0c, 0x72, 0x32, 0x85, 0x21, 0x76, 0x1a, 0x8a,
0x7d, 0x51, 0x0e, 0xf1, 0xf9, 0x19, 0xfa, 0x6b, 0x9b, 0x22, 0x71, 0x8c,
0x13, 0xcc, 0xba, 0x7d, 0xee, 0xd8, 0x34, 0xf6, 0x85, 0x60, 0xe1, 0xe4,
0x59, 0x6e, 0x32, 0x60, 0xd9, 0xfa, 0xb7, 0x56, 0x54, 0x25, 0xd1, 0x73,
0x6a, 0xf2, 0xa0, 0xc7, 0xa0, 0x67, 0x10, 0x89, 0x9c, 0x27, 0x5f, 0x7f,
0x2e, 0x5a, 0x29, 0x70, 0x7a, 0x7b, 0xaf, 0x21, 0xd0, 0xf4, 0x06, 0xb9,
0x2d, 0xf1, 0xb8, 0x32, 0xed, 0xc5, 0xc9, 0xac, 0x2f, 0x54, 0x0a, 0xf9,
0x08, 0x39, 0x39, 0x7d, 0x1d, 0xaf, 0xb4, 0x5f, 0x4d, 0x75, 0xc3, 0xe8,
0x52, 0x3a, 0x47, 0x72, 0x2c, 0xa9, 0x2d, 0xcb, 0x74, 0x06, 0xfe, 0x69,
0xd3, 0xf3, 0x1a, 0xb2, 0xd3, 0x01, 0xed, 0x6c, 0xc1, 0xca, 0x4f, 0xaf,
0x11, 0x9b, 0xa2, 0x27, 0x2a, 0x59, 0x56, 0x58, 0xdf, 0x79, 0x8b, 0xc9,
0x87, 0xe9, 0x58, 0x81, 0x48, 0xc6, 0xb6, 0x7d, 0x60, 0x54, 0x87, 0x9c,
0x61, 0xbb, 0x4b, 0xbb, 0x61, 0xac, 0x0a, 0x5a, 0x66, 0x7e, 0x70, 0x8b,
0xfd, 0x92, 0x76, 0x4a, 0xa9, 0xa5, 0xc3, 0xf4, 0xf2, 0x93, 0x48, 0xc4,
0xf3, 0x91, 0x2b, 0x60, 0x04, 0x0e, 0xb0, 0x6b, 0x60, 0x5e, 0xf0, 0xf1,
0x54, 0x41, 0x56, 0xdc, 0x25, 0x57, 0xc3, 0xb6, 0x0b, 0x5e, 0x15, 0xb5,
0x2a, 0x36, 0x4f, 0xe7, 0x1d, 0x70, 0xa8, 0xa7, 0xec, 0xd6, 0x74, 0xba,
0xa4, 0x79, 0x83, 0x7c, 0x9e, 0x1a, 0x5d, 0x32, 0xc8, 0xcb, 0x41, 0xca,
0x04, 0xec, 0x0b, 0x18, 0x54, 0xe1, 0x67, 0xbf, 0xa8, 0x7a, 0xc3, 0x0f,
0x27, 0x2a, 0xaf, 0x2a, 0x41, 0x19, 0x1f, 0xe8, 0xa2, 0xe8, 0xfa, 0xfc,
0x88, 0x41, 0x46, 0xc3, 0x1c, 0x44, 0xe5, 0xee, 0x47, 0xec, 0xfe, 0xbf,
0xb8, 0x29, 0x2e, 0xae, 0x47, 0x0a, 0x42, 0x69, 0x8a, 0x9a, 0x94, 0x97,
0x9e, 0xf5, 0xb6, 0x37, 0x1c, 0x10, 0xc2, 0x99, 0xa8, 0xe9, 0x9e, 0x0e,
0x6e, 0xb5, 0xbe, 0xba, 0x1f, 0x77, 0xa6, 0x35, 0x02, 0x1e, 0x8c, 0xe6,
0x02, 0x53, 0xe2, 0x9a, 0xdd, 0x09, 0x6e, 0x9b, 0x7a, 0x36, 0x4f, 0x38,
0x8d, 0x4c, 0xa4, 0xb4, 0xff, 0x90, 0x76, 0x0d, 0x11, 0x7d, 0xe1, 0xe9,
0x7f, 0x2a, 0x4a, 0x80, 0xe0, 0xd8, 0x3c, 0x23, 0xd2, 0xa5, 0xe5, 0x39,
0x77, 0xbf, 0x3d, 0x71, 0x0d, 0x45, 0xbb, 0x39, 0x66, 0x1a, 0x4d, 0x59,
0xb7, 0xd0, 0x0a, 0xee, 0x87, 0xee, 0x1f, 0xcf, 0x6f, 0xc2, 0x50, 0xb1,
0xa5, 0x4c, 0xee, 0x40, 0x69, 0xd7, 0x36, 0x38, 0x14, 0xcd, 0x6a, 0x9a,
0x90, 0x40, 0xad, 0x76, 0xf1, 0xa6, 0xd4, 0x3c, 0x75, 0x10, 0xba, 0xcb,
0xab, 0x22, 0x28, 0x5f, 0x0c, 0xe0, 0xee, 0xf4, 0xfd, 0x61, 0x52, 0x0a,
0x59, 0xfe, 0x61, 0xc5, 0x40, 0xf9, 0x91, 0x8e, 0x36, 0x29, 0x63, 0x6c,
0x6e, 0x45, 0xa5, 0x42, 0xe3, 0x36, 0x90, 0xe7, 0x90, 0x9f, 0x58, 0xbb,
0xf9, 0x1b, 0xee, 0x2c, 0xbb, 0x3a, 0xfd, 0x3d, 0xbe, 0x3d, 0x45, 0xf0,
0xc2, 0x18, 0xaa, 0x46, 0x10, 0x23, 0xe9, 0x63, 0xba, 0x7f, 0xc2, 0xe1,
0xf4, 0x05, 0xdd, 0x4a, 0x7c, 0xa8, 0xab, 0xa9, 0xbd, 0x6f, 0xdf, 0x48,
0x59, 0x11, 0xd4, 0xba, 0x75, 0xb6, 0x22, 0xd4, 0xd7, 0x35, 0x6f, 0x27,
0x70, 0xc7, 0x3d, 0x90, 0x06, 0x39, 0x2a, 0x16, 0xd0, 0x8b, 0xd7, 0xfb,
0x5e, 0x85, 0x2e, 0xb0, 0xd8, 0xc7, 0xdb, 0xe5, 0x24, 0x3a, 0x6e, 0xc4,
0x5e, 0xd4, 0x22, 0x25, 0x14, 0xee, 0xa5, 0x30, 0x8b, 0xd6, 0x27, 0x61,
0x33, 0x13, 0x46, 0x0b, 0x26, 0x45, 0xa6, 0xb4, 0xfa, 0x8d, 0xa3, 0xf2,
0x27, 0xd2, 0xc5, 0x04, 0xaa, 0x96, 0xa4, 0x55, 0xfa, 0x40, 0xf1, 0xfc,
0x66, 0x33, 0x9e, 0x4b, 0x39, 0x75, 0xae, 0x7f, 0x52, 0x87, 0x7b, 0x8a,
0xf9, 0x7d, 0x5f, 0x8a, 0x7e, 0xf7, 0xfe, 0xc4, 0x7f, 0xf4, 0xf6, 0x9a,
0x86, 0x78, 0x21, 0x02, 0x94, 0x9e, 0x50, 0x2d, 0xdc, 0xd6, 0xa5, 0x53,
0xf1, 0xef, 0x06, 0xe8, 0xb5, 0x46, 0x81, 0xcc, 0x91, 0x4f, 0x37, 0xee,
0x27, 0xcb, 0x91, 0xad, 0xff, 0x1d, 0xd1, 0x00, 0xa8, 0x96, 0x22, 0xaa,
0x63, 0x23, 0x2a, 0x7a, 0x75, 0x6f, 0xe9, 0x2d, 0x26, 0xde, 0x11, 0x97,
0x4b, 0x17, 0x3f, 0xde, 0x51, 0x1a, 0x22, 0xed, 0x38, 0x6f, 0x3e, 0x7a,
0xd0, 0xd6, 0x60, 0x06, 0x7e, 0x3f, 0xa4, 0x29, 0xfa, 0x18, 0x91, 0xda,
0x73, 0x38, 0xe3, 0xe3, 0xb5, 0xc0, 0x5b, 0x4e, 0xe8, 0x94, 0xea, 0x45,
0x6e, 0x5b, 0x50, 0xaa, 0x38, 0xb6, 0x6f, 0xdb, 0x90, 0x1b, 0x3b, 0x82,
0xbb, 0x0d, 0x38, 0xe3, 0xca, 0xd9, 0xf1, 0x2e, 0x27, 0x4c, 0x2c, 0x5a,
0x42, 0xdf, 0x44, 0xc8, 0x07, 0xe4, 0x95, 0xb5, 0xec, 0x91, 0x34, 0x1c,
0x9a, 0x0c, 0x50, 0x1a, 0xce, 0x67, 0xe4, 0x4b, 0x87, 0x61, 0x43, 0x95,
0x95, 0xb8, 0x8a, 0xf4, 0xc9, 0x92, 0x33, 0x33, 0xe3, 0xfe, 0x98, 0x2a,
0xae, 0x8e, 0xf2, 0x6b, 0x13, 0x7c, 0xe4, 0x44, 0x40, 0x66, 0xea, 0x0c,
0xe4, 0xdb, 0x16, 0x65, 0xa8, 0x8b, 0x37, 0x08, 0xec, 0x1e, 0xfc, 0xa6,
0xd0, 0x9b, 0x9e, 0x0a, 0xd2, 0xe3, 0xcf, 0x5d, 0xb2, 0xaf, 0x8e, 0x05,
0x7d, 0x8d, 0x84, 0xbc, 0x9f, 0xb1, 0xe6, 0x6a, 0x2e, 0x4b, 0x6d, 0x64,
0x91, 0x17, 0x9d, 0xb5, 0x35, 0x15, 0x02, 0xe9, 0x1b, 0x85, 0xc1, 0x89,
0xc2, 0x5a, 0x32, 0x3a, 0x80, 0x78, 0x5e, 0xcc, 0x50, 0x26, 0xf5, 0x11,
0x01, 0x79, 0xf3, 0xaf, 0xb6, 0x40, 0x00, 0x73, 0x8f, 0xeb, 0x5a, 0xd1,
0x26, 0x00, 0xe2, 0xa3, 0xcd, 0xfd, 0xaa, 0x15, 0x5b, 0x98, 0x2a, 0x76,
0x41, 0x07, 0xc2, 0xde, 0xb6, 0x71, 0xe7, 0xc3, 0xe9, 0x92, 0xb3, 0xd8,
0xfe, 0xaf, 0x12, 0x61, 0x86, 0x5b, 0x6e, 0x74, 0x45, 0x7b, 0x9b, 0x6f,
0x1a, 0x13, 0x84, 0xf6, 0x31, 0x5f, 0x5b, 0x6c, 0xde, 0x47, 0xb8, 0x73,
0x32, 0xc7, 0x94, 0x92, 0xa5, 0xc3, 0x65, 0xdf, 0x96, 0x6c, 0xfd, 0xb7,
0x80, 0xfb, 0x47, 0xba, 0x6e, 0x43, 0xb3, 0x7e, 0x86, 0xc9, 0x97, 0x45,
0xde, 0x3f, 0x3a, 0xf6, 0xb0, 0x9e, 0x9a, 0xcb, 0xfd, 0xf2, 0x5c, 0xba,
0x6e, 0x3f, 0xed, 0xfa, 0x74, 0x84, 0xe2, 0xb1, 0xae, 0x66, 0x57, 0x0b,
0x96, 0x6c, 0x77, 0xe4, 0x8a, 0x67, 0x97, 0xc7, 0xe0, 0x44, 0xb2, 0x83,
0x2d, 0x3c, 0x2e, 0x01, 0x19, 0x2e, 0x4c, 0x74, 0xe1, 0x35, 0x73, 0xeb,
0x85, 0x63, 0x8c, 0x3a, 0xb8, 0xbc, 0x25, 0x6a, 0x8d, 0xaf, 0xd2, 0xfb,
0xef, 0xd3, 0x12, 0x93, 0x0b, 0x39, 0xfa, 0x66, 0xbe, 0x3b, 0xfd, 0x6c,
0x0b, 0xbb, 0xb2, 0x5a, 0x78, 0xa1, 0xcf, 0x8c, 0x7d, 0x60, 0x55, 0xeb,
0x33, 0x4e, 0x8e, 0xf9, 0x19, 0x4d, 0x42, 0xd4, 0xf8, 0xd8, 0xba, 0xad,
0x0a, 0x6e, 0x62, 0xd4, 0xe1, 0x6a, 0xcc, 0xea, 0x09, 0x91, 0x8e, 0x62,
0xc9, 0x1e, 0x9e, 0x48, 0xaa, 0xde, 0xf7, 0xa2, 0x5a, 0xcb, 0x83, 0x20,
0xe8, 0xf5, 0xd1, 0xfe, 0x9d, 0x18, 0x2f, 0xd6, 0xf8, 0x97, 0x17, 0xce,
0xc2, 0x05, 0x08, 0xef, 0x61, 0x70, 0x9d, 0x95, 0x79, 0x59, 0x4c, 0x06,
0x24, 0x3d, 0x24, 0x69, 0xff, 0x46, 0xda, 0xbc, 0x71, 0x7a, 0x74, 0x93,
0x58, 0xf5, 0xc8, 0x91, 0xfb, 0x66, 0xed, 0x78, 0x8f, 0xf8, 0x28, 0xa8,
0x1d, 0xa5, 0x3a, 0x13, 0x76, 0xc2, 0xcc, 0xba, 0xb9, 0x56, 0x29, 0x74,
0xd6, 0x14, 0x75, 0x58, 0xe6, 0x2e, 0x79, 0x6e, 0x9d, 0x41, 0x94, 0x8a,
0xcf, 0xf1, 0xb1, 0xe0, 0x36, 0xe5, 0x89, 0x9a, 0x95, 0xa1, 0x11, 0xd1,
0xbe, 0x45, 0xe4, 0xb3, 0xb0, 0x62, 0x32, 0x1d, 0xba, 0xe0, 0xde, 0x57,
0x81, 0x0e, 0x01, 0x9b, 0x52, 0x3d, 0xd5, 0xde, 0x3b, 0x3a, 0xdd, 0x8f,
0xe3, 0x2e, 0xce, 0x1e, 0x89, 0x4d, 0x81, 0xf0, 0xf6, 0x20, 0x63, 0x7a,
0x4c, 0xbb, 0x66, 0xe0, 0xbe, 0x2b, 0xee, 0xd0, 0x3b, 0x60, 0x1e, 0x65,
0xd1, 0x2c, 0x7c, 0x5c, 0x6c, 0x16, 0x5b, 0x90, 0xc8, 0x05, 0x10, 0xf2,
0xde, 0x33, 0x90, 0x35, 0x69, 0x24, 0x3f, 0xc1, 0x8f, 0x1e, 0x4a, 0x60,
0xf1, 0x03, 0x65, 0x46, 0x40, 0x76, 0xe9, 0x83, 0x97, 0xda, 0x0b, 0xb8,
0x22, 0xfa, 0x55, 0x99, 0xfd, 0x18, 0x24, 0xd2, 0x66, 0xb0, 0x7b, 0x70,
0x56, 0x93, 0xad, 0x09, 0x95, 0x8e, 0x1f, 0x2f, 0xe8, 0x12, 0x55, 0xd4,
0x1f, 0xde, 0x09, 0x85, 0x05, 0xd1, 0xd5, 0x10, 0x2c, 0x8c, 0x6b, 0x53,
0x28, 0xce, 0x06, 0xc5, 0x52, 0x0f, 0xfa, 0x09, 0x09, 0x23, 0x1b, 0xe3,
0xbf, 0xb1, 0x89, 0x72, 0x26, 0x0d, 0xa6, 0xbb, 0x7d, 0x9e, 0xdc, 0xf8,
0xf5, 0x0b, 0x8c, 0xe0, 0xbc, 0x97, 0x3b, 0x72, 0xdd, 0xf5, 0x9d, 0xc5,
0xb6, 0x37, 0x2c, 0x76, 0x5b, 0x58, 0x67, 0xdb, 0xed, 0x3b, 0x6e, 0xe5,
0xe5, 0x6d, 0x6f, 0x0d, 0x7e, 0xff, 0xa9, 0x57, 0x4a, 0x84, 0x85, 0x82,
0xac, 0x00, 0x50, 0xa3, 0x4f, 0x87, 0xfe, 0x2a, 0x40, 0x52, 0x54, 0x81,
0x69, 0x42, 0x0b, 0x0c, 0xd7, 0x18, 0x98, 0x01, 0x8c, 0x5a, 0xa2, 0xf4,
0xe8, 0x61, 0xd1, 0x38, 0xfd, 0x0f, 0x63, 0x75, 0xd3, 0x4b, 0x1d, 0xdc,
0xdf, 0xb2, 0xeb, 0x94, 0x97, 0x5c, 0x2a, 0xb4, 0x12, 0x5c, 0x49, 0x2b,
0xfc, 0xd0, 0x8d, 0xfb, 0xe7, 0xb3, 0xcb, 0x0f, 0x3c, 0x2e, 0x04, 0x36,
0xa8, 0x03, 0xc9, 0xd7, 0x11, 0x2d, 0x2a, 0x93, 0xff, 0xda, 0x26, 0xb0,
0x54, 0x7e, 0xaf, 0x30, 0x7d, 0xce, 0x46, 0x8a, 0x3d, 0x7c, 0xa4, 0x7a,
0x2c, 0xfa, 0xba, 0xa1, 0xc9, 0x41, 0xd3, 0xb8, 0x84, 0x03, 0x78, 0xdd,
0xe9, 0x57, 0x19, 0x62, 0x62, 0xff, 0x5b, 0x3b, 0x48, 0x62, 0x0e, 0xee,
0x19, 0xb0, 0x32, 0x6e, 0x6a, 0x07, 0xd8, 0x4e, 0x25, 0x76, 0xa7, 0xe3,
0x98, 0xa1, 0x6f, 0xb6, 0x99, 0x32, 0x67, 0x7d, 0x46, 0x42, 0x4a, 0x82,
0xd1, 0x29, 0x1b, 0x87, 0xeb, 0x4b, 0x9e, 0xdf, 0x69, 0x75, 0xbd, 0x4f,
0xd3, 0xde, 0xc9, 0x83, 0xe6, 0xd6, 0xea, 0x03, 0x81, 0x12, 0xf3, 0x5d,
0x99, 0xf1, 0xb1, 0xd9, 0x3e, 0xbe, 0xf3, 0xa8, 0xdc, 0xb6, 0xf8, 0x4b,
0x9e, 0x26, 0x3f, 0xf0, 0x7c, 0xb3, 0xf4, 0xca, 0x00, 0x6c, 0x6c, 0xe5,
0x43, 0xa1, 0xfd, 0x3a, 0xf8, 0x8e, 0xe3, 0x9f, 0x88, 0xc5, 0x44, 0xfd,
0x24, 0x69, 0x76, 0xd5, 0xcb, 0xdc, 0x9d, 0x12, 0xf3, 0x13, 0x7e, 0xe7,
0xc3, 0xa8, 0x6a, 0xb2, 0xe0, 0xb3, 0x1d, 0xab, 0x3b, 0xc9, 0x77, 0x3d,
0x0f, 0xc3, 0xbe, 0x4b, 0x8b, 0x28, 0xbd, 0x7c, 0xe6, 0xb2, 0x06, 0x1f,
0xf9, 0x8f, 0x16, 0x62, 0xbf, 0xc7, 0x55, 0x73, 0xd4, 0xf1, 0x5a, 0x95,
0x80, 0xa3, 0x4e, 0xaa, 0x60, 0x17, 0x3c, 0xc9, 0x5e, 0xd4, 0x0c, 0x56,
0x7a, 0x77, 0x8e, 0x7f, 0x67, 0x08, 0x2f, 0xd9, 0x21, 0x19, 0xfd, 0x86,
0x8c, 0x23, 0x8d, 0xf6, 0x92, 0x1f, 0x36, 0x2c, 0x7c, 0x83, 0xbd, 0x2f,
0x6c, 0x63, 0x7c, 0xb7, 0x93, 0x74, 0x1b, 0xc2, 0x95, 0x34, 0x26, 0x1e,
0x07, 0x87, 0x3a, 0xb6, 0xe2, 0x39, 0x71, 0x9b, 0x20, 0xcd, 0x63, 0xf0,
0xbf, 0x48, 0xb5, 0x0e, 0x49, 0x86, 0x50, 0x80, 0xbd, 0xd6, 0x0e, 0xab,
0xd5, 0x69, 0x1b, 0xa4, 0xb3, 0x63, 0x3c, 0x8f, 0xcb, 0x42, 0xdb, 0xd7,
0x1a, 0xf4, 0xdf, 0x9e, 0x25, 0xfc, 0xd4, 0x00, 0xcb, 0xec, 0x57, 0x69,
0x30, 0x15, 0x4d, 0x7a, 0x69, 0x28, 0x2f, 0x2b, 0x34, 0x26, 0xd1, 0xe7,
0x01, 0x42, 0x5e, 0x02, 0xe2, 0x75, 0xe8, 0x52, 0x8a, 0xb4, 0x71, 0xfa,
0xc3, 0x3d, 0xe6, 0xac, 0xeb, 0xf3, 0x93, 0xe0, 0x37, 0xcd, 0x66, 0x92,
0x66, 0x2c, 0xfe, 0x4b, 0xd6, 0x3c, 0xf1, 0x57, 0xe5, 0xcf, 0xf5, 0xd0,
0xdb, 0x0e, 0x1f, 0x82, 0x65, 0x3b, 0xab, 0x69, 0x42, 0x53, 0x7d, 0xa4,
0x7c, 0xb7, 0x86, 0xeb, 0x23, 0x45, 0xa8, 0x4a, 0x73, 0xfc, 0x38, 0xc6,
0xe5, 0x2c, 0xab, 0x80, 0xfb, 0x23, 0xb2, 0x0c, 0x53, 0x28, 0x21, 0x37,
0x54, 0x9c, 0x72, 0x51, 0x0f, 0x44, 0x50, 0xd3, 0xe1, 0xd5, 0xb2, 0x27,
0x83, 0xb6, 0xe9, 0x4d, 0x64, 0x5c, 0x17, 0x0f, 0xe0, 0x13, 0xe4, 0x26,
0x6b, 0xd0, 0xd8, 0x25, 0xe3, 0x69, 0x6a, 0x95, 0x3f, 0x4a, 0x4e, 0xa0,
0x58, 0xbc, 0x28, 0x47, 0x8b, 0x68, 0xe4, 0x41, 0x90, 0x46, 0x1b, 0x84,
0xa0, 0x7b, 0x46, 0x46, 0x03, 0xee, 0x21, 0x0d, 0x34, 0xed, 0xff, 0x15,
0x57, 0x06, 0xdf, 0x71, 0x09, 0xb2, 0x66, 0x0d, 0x6e, 0xcc, 0xa5, 0x0c,
0xaf, 0x3f, 0x24, 0x8f, 0xd1, 0xc8, 0x44, 0x86, 0xaf, 0xbf, 0xeb, 0x2f,
0xb9, 0xee, 0xa7, 0xcf, 0xe4, 0xe8, 0xec, 0x47, 0x09, 0xd8, 0x95, 0x9e,
0x3c, 0xda, 0x92, 0x41, 0x61, 0xf5, 0xc3, 0xec, 0x00, 0xe4, 0xa3, 0x0d,
0x4a, 0xb3, 0xf6, 0x82, 0x05, 0x38, 0x70, 0x6a, 0xd1, 0x28, 0x2c, 0xb3,
0xc6, 0xbb, 0x38, 0xb3, 0x06, 0x7f, 0xd6, 0x4c, 0xe7, 0xfb, 0xef, 0x0d,
0x52, 0x66, 0xbe, 0xd8, 0xa6, 0x6f, 0xe8, 0xd9, 0x42, 0x4f, 0xad, 0xe8,
0xe8, 0x6c, 0xf9, 0xe9, 0x42, 0xd9, 0x66, 0x6e, 0xec, 0xfe, 0xf5, 0x91,
0xbf, 0x0a, 0x98, 0xd8, 0x7b, 0x23, 0x12, 0xa6, 0x04, 0xa8, 0xb3, 0x61,
0x13, 0x65, 0xc0, 0xe2, 0x82, 0xb9, 0xb2, 0x38, 0x07, 0x06, 0xca, 0x64,
0x6c, 0x23, 0x93, 0x60, 0x1d, 0x4d, 0x38, 0x5e, 0x8e, 0x90, 0x16, 0x4a,
0xfd, 0xb3, 0xcd, 0x84, 0x9c, 0xa5, 0xfa, 0x73, 0x2d, 0xcb, 0x87, 0x31,
0x3d, 0xf8, 0xfc, 0xeb, 0xa7, 0x56, 0x2f, 0x5b, 0x95, 0x9a, 0xc6, 0x82,
0x29, 0x86, 0x47, 0xe2, 0xc2, 0x84, 0x01, 0xaf, 0xc8, 0x0b, 0x2d, 0xfb,
0x34, 0xba, 0x5d, 0x9d, 0xd1, 0x85, 0xd5, 0x1e, 0x63, 0xcb, 0x3c, 0xa8,
0xfa, 0x79, 0xef, 0x12, 0xa6, 0xb5, 0xdb, 0xc5, 0x1d, 0x6a, 0xa7, 0x54,
0x58, 0x0c, 0xbe, 0x61, 0xe5, 0x96, 0x7f, 0x4a, 0x3b, 0x59, 0x32, 0x2d,
0x06, 0x44, 0x83, 0x5c, 0xad, 0xe9, 0xfe, 0x7c, 0xd7, 0x5b, 0x34, 0xa1,
0xa3, 0xad, 0x9a, 0xbf, 0xd5, 0x30, 0xf0, 0x22, 0xfc, 0x94, 0x7f, 0xd4,
0xa4, 0xca, 0x88, 0x31, 0xe7, 0xf2, 0x89, 0x2d, 0xda, 0xe6, 0x91, 0xa6,
0x27, 0x22, 0x74, 0x9f, 0xc6, 0x72, 0x4f, 0xf6, 0xa9, 0xfe, 0x7a, 0xf0,
0xa8, 0x6b, 0x6c, 0x9f, 0xe9, 0x2a, 0x9b, 0x23, 0x9e, 0xb8, 0x2b, 0x29,
0x65, 0xa7, 0x5d, 0xbd, 0x10, 0xe4, 0x56, 0x02, 0x94, 0xdd, 0xd1, 0xab,
0x9b, 0x82, 0x2d, 0x8d, 0xf6, 0xd3, 0x65, 0x63, 0x4a, 0xc4, 0x86, 0x61,
0x37, 0x9f, 0xdb, 0x4b, 0x34, 0x20, 0x0a, 0xca, 0x45, 0x6c, 0x06, 0xc4,
0x9c, 0x74, 0x4d, 0x83, 0x6a, 0x8d, 0xad, 0xc6, 0x61, 0x3a, 0x8d, 0xde,
0x6c, 0xf9, 0x8e, 0x33, 0xa2, 0xee, 0x99, 0xc7, 0xe4, 0x52, 0xb2, 0x44,
0x6f, 0x2f, 0x0f, 0x41, 0xa9, 0x1a, 0xd3, 0x96, 0x42, 0xc6, 0x49, 0x12,
0x6a, 0xf0, 0x29, 0xa9, 0x0c, 0x9c, 0x50, 0x5d, 0x1d, 0xd1, 0x42, 0x7e,
0x6f, 0x36, 0x48, 0x0f, 0x58, 0x14, 0x94, 0xc0, 0x10, 0x1e, 0xe0, 0xb2,
0xdd, 0xba, 0x57, 0x91, 0x4d, 0xd5, 0xdc, 0xa6, 0x4c, 0x68, 0x00, 0x6c,
0xb3, 0x5d, 0x32, 0x13, 0xbe, 0xa8, 0xc3, 0xfb, 0xd4, 0x19, 0x40, 0xf5,
0x6f, 0x63, 0xa1, 0x07, 0xbf, 0xa2, 0x8b, 0xfc, 0xfe, 0xf8, 0xa1, 0x33,
0x70, 0x07, 0x6d, 0xc5, 0x72, 0xa0, 0x39, 0xd6, 0xd7, 0x76, 0x6c, 0xfa,
0x1f, 0x04, 0xd6, 0x23, 0xbf, 0x66, 0x78, 0x92, 0x00, 0x11, 0x8a, 0x75,
0x67, 0x44, 0xa6, 0x7c, 0xd0, 0x14, 0xe6, 0xd0, 0x31, 0x6d, 0xdb, 0xc5,
0xb1, 0xa7, 0x99, 0xc3, 0xaf, 0x18, 0x7a, 0x26, 0x46, 0xad, 0x6d, 0x0c,
0xb6, 0xb5, 0xad, 0xc1, 0xcf, 0x60, 0x99, 0xf5, 0x9f, 0x88, 0xaf, 0x0e,
0x37, 0x15, 0xf9, 0x2b, 0x1a, 0x5f, 0xfb, 0xc9, 0xf8, 0xd4, 0xf0, 0x97,
0xd2, 0x91, 0xf4, 0x94, 0xa2, 0xd3, 0x3b, 0x8b, 0x0c, 0x22, 0xa0, 0xac,
0xb3, 0xb5, 0xdf, 0xf2, 0x27, 0x38, 0x47, 0x53, 0x5b, 0x6e, 0x8f, 0x98,
0x9e, 0xad, 0xb6, 0xf5, 0x0e, 0x17, 0x20, 0x35, 0x54, 0x6b, 0x73, 0xa6,
0x64, 0x65, 0xac, 0xb8, 0xc1, 0xd3, 0xf7, 0x07, 0x82, 0x93, 0x9d, 0xcb,
0xcc, 0xe9, 0x0c, 0x51, 0x52, 0x85, 0x8b, 0x95, 0xa6, 0xb1, 0xce, 0xdc,
0xfa, 0x00, 0x00, 0x08, 0x14, 0x1c, 0x23, 0x2a, 0x35,
};
static const u8 fips_test_mldsa65_public_key[] __initconst __maybe_unused = {
0x9f, 0x55, 0x1e, 0x7f, 0x9c, 0x08, 0xb2, 0x83, 0xfd, 0x5b, 0xa2, 0xac,
0x4f, 0x26, 0xc2, 0xf5, 0x06, 0x05, 0x96, 0x08, 0x24, 0xad, 0xec, 0xe4,
0x99, 0xcc, 0x6c, 0xbd, 0x55, 0x37, 0x15, 0x94, 0xab, 0x31, 0x9e, 0x56,
0xe5, 0xe4, 0x55, 0xec, 0x4d, 0x49, 0x5b, 0x5a, 0x7a, 0xe8, 0xc3, 0x4a,
0x08, 0x44, 0x4a, 0xc2, 0x2d, 0xe4, 0x61, 0x33, 0x90, 0x20, 0x71, 0x45,
0xa5, 0x45, 0xd0, 0x83, 0x2b, 0x32, 0x6c, 0xa7, 0x9e, 0x76, 0xcd, 0xfb,
0x58, 0x15, 0x9e, 0x74, 0x0d, 0x67, 0x57, 0xb1, 0x06, 0x5b, 0x5d, 0xd5,
0x1c, 0xbb, 0x95, 0x40, 0x1c, 0x71, 0x31, 0x03, 0xef, 0xff, 0x04, 0x6b,
0xdd, 0xa2, 0xf0, 0x32, 0x00, 0x72, 0xbc, 0x87, 0xb6, 0x2c, 0x1f, 0x90,
0x7f, 0x92, 0xa0, 0xb2, 0x04, 0xdd, 0xa9, 0xaf, 0x7f, 0x01, 0x28, 0x4c,
0xb2, 0x57, 0x2d, 0x56, 0x93, 0xd0, 0xc7, 0x54, 0x02, 0x90, 0x57, 0x70,
0x23, 0x57, 0xe8, 0xe7, 0x33, 0x32, 0x98, 0xfc, 0x9b, 0x8e, 0x6e, 0x7b,
0xaa, 0x5d, 0xb5, 0x4e, 0xe0, 0x5d, 0x97, 0xa3, 0xea, 0x43, 0x7e, 0xb3,
0xa4, 0x8c, 0xcf, 0xdc, 0xc0, 0x51, 0xa7, 0x99, 0x45, 0x3d, 0x3c, 0xa0,
0xba, 0xc5, 0xff, 0xe1, 0x89, 0xb3, 0x7d, 0xc3, 0xdc, 0xe2, 0x23, 0x81,
0xff, 0xa9, 0xc7, 0x93, 0xc6, 0x67, 0xad, 0x94, 0xcf, 0xeb, 0x91, 0x78,
0x15, 0x25, 0xf7, 0xf5, 0x06, 0x08, 0x2f, 0x0c, 0xee, 0x0b, 0x6a, 0x06,
0x59, 0xe0, 0x1f, 0x2e, 0x5a, 0x12, 0x06, 0xf5, 0xf4, 0x8e, 0x75, 0x57,
0xa9, 0x33, 0x23, 0x0f, 0xc2, 0x6f, 0x02, 0xf8, 0x68, 0x0f, 0x62, 0x02,
0x81, 0xfe, 0x03, 0x7c, 0xaf, 0xd7, 0x42, 0x5b, 0xcc, 0xe7, 0x2b, 0xea,
0x49, 0xab, 0x03, 0x6d, 0x0a, 0x02, 0xae, 0x47, 0x79, 0xce, 0xfd, 0x18,
0x76, 0x07, 0x9e, 0xa6, 0xbf, 0x7e, 0x8d, 0x73, 0xf9, 0x44, 0xeb, 0x8c,
0xc5, 0x59, 0xb7, 0x19, 0xf6, 0x73, 0x53, 0x42, 0x2a, 0x55, 0x7b, 0xb4,
0x56, 0x49, 0x08, 0x9e, 0x9a, 0x65, 0x60, 0x70, 0x1d, 0xbd, 0xc6, 0x85,
0x29, 0xde, 0xfe, 0x44, 0xae, 0xdf, 0x25, 0xfd, 0x5b, 0x74, 0x6c, 0x96,
0xe6, 0x81, 0x37, 0x80, 0xe0, 0x9e, 0xf3, 0x75, 0x63, 0xb4, 0xc9, 0x2f,
0x71, 0xe6, 0xeb, 0xdf, 0xaf, 0x7e, 0xff, 0x9e, 0xe0, 0xbf, 0xca, 0xca,
0x11, 0xed, 0xc6, 0x04, 0xd8, 0x49, 0x13, 0x2c, 0x63, 0xf1, 0xb3, 0x17,
0x74, 0xd9, 0x50, 0x3f, 0xb9, 0x29, 0x0e, 0x48, 0xa7, 0xf0, 0xdc, 0x78,
0x18, 0x0e, 0x9f, 0xb7, 0xde, 0x36, 0x79, 0x67, 0xa4, 0x23, 0x08, 0xe7,
0x62, 0xe8, 0xa4, 0xe5, 0xcf, 0xff, 0x35, 0x55, 0x36, 0x2e, 0x3a, 0xe4,
0x45, 0x6a, 0x80, 0xf2, 0xca, 0xe7, 0x40, 0x79, 0x14, 0xc4, 0x62, 0x38,
0xbb, 0xd0, 0x4e, 0x6c, 0xb5, 0x85, 0x42, 0x3f, 0x35, 0xf7, 0xd7, 0x54,
0xb8, 0x2b, 0x8b, 0xd5, 0x6f, 0x16, 0x61, 0x27, 0x23, 0xac, 0xdb, 0xea,
0x9b, 0x3b, 0x99, 0xcd, 0x79, 0xe6, 0x12, 0x09, 0x99, 0x09, 0xa4, 0xe1,
0x88, 0x25, 0x00, 0x9e, 0x60, 0x16, 0x63, 0xd7, 0x42, 0x9b, 0xcc, 0x36,
0x9a, 0x8d, 0xa3, 0x75, 0x36, 0xa1, 0xa8, 0xfc, 0xa2, 0xfe, 0x29, 0x26,
0x4c, 0x93, 0x21, 0x44, 0x6b, 0x1c, 0xba, 0xbd, 0xef, 0xff, 0x6d, 0x1f,
0x2b, 0x6c, 0x66, 0x81, 0x9a, 0x3a, 0x1d, 0x0b, 0xd7, 0x24, 0xd4, 0xb8,
0x93, 0xb5, 0x22, 0xf9, 0xd2, 0xf4, 0xa5, 0x05, 0x78, 0x38, 0xae, 0x58,
0xf6, 0x50, 0x8f, 0x47, 0x1d, 0xf3, 0xfb, 0x0d, 0x04, 0x14, 0xd1, 0xd6,
0xd8, 0x2e, 0xf2, 0xbd, 0xf5, 0x71, 0x86, 0x4c, 0xdd, 0x61, 0x24, 0x18,
0x5b, 0x54, 0xf5, 0xcd, 0x99, 0x89, 0x01, 0x8e, 0xd1, 0x19, 0x52, 0xbc,
0x45, 0xed, 0x0e, 0xec, 0x72, 0x2f, 0x5a, 0xe7, 0xdf, 0x36, 0x1c, 0x57,
0x9f, 0xb2, 0x8b, 0xf2, 0x78, 0x1b, 0x3e, 0xc5, 0x48, 0x1f, 0x27, 0x04,
0x76, 0x10, 0x44, 0xee, 0x5c, 0x68, 0x8f, 0xca, 0xd7, 0x31, 0xfc, 0x5c,
0x40, 0x03, 0x2e, 0xbd, 0x1d, 0x59, 0x13, 0x57, 0xbc, 0x33, 0xc6, 0xa1,
0xa3, 0xe5, 0x55, 0x79, 0x9b, 0x7e, 0x49, 0xbb, 0x23, 0x96, 0xc3, 0x1c,
0xfe, 0x66, 0xeb, 0x5b, 0x5f, 0xe5, 0x03, 0xc9, 0xa4, 0xac, 0x4d, 0xc4,
0x50, 0xbb, 0xd3, 0xc1, 0x91, 0x48, 0xe0, 0x93, 0x92, 0x2a, 0xdb, 0x41,
0x37, 0x98, 0xbc, 0xa2, 0x7a, 0x09, 0x92, 0x0b, 0x1c, 0xe6, 0x4b, 0x1e,
0x8e, 0x78, 0x81, 0x74, 0x7d, 0x6b, 0x71, 0xd5, 0xe7, 0x0e, 0x7b, 0xc2,
0x74, 0x5d, 0x89, 0xf1, 0xfa, 0x59, 0xaa, 0xf7, 0x86, 0x66, 0x7e, 0xc2,
0x9c, 0xf4, 0xd5, 0x8d, 0xc0, 0xb7, 0xb7, 0xa2, 0xd5, 0xcd, 0x51, 0xc3,
0x7d, 0xa9, 0x5e, 0x46, 0xba, 0x06, 0xa3, 0x4d, 0x60, 0xd6, 0x68, 0xc6,
0xf9, 0x63, 0x88, 0x17, 0x5c, 0x20, 0xe1, 0xc4, 0x0f, 0x3f, 0xc1, 0xa9,
0xa7, 0x3e, 0x39, 0xef, 0x2f, 0xaf, 0xc4, 0x69, 0x29, 0xe3, 0xd4, 0x8d,
0xe0, 0x0e, 0x88, 0xc2, 0x93, 0x43, 0xfb, 0x28, 0xcf, 0x5d, 0x85, 0x50,
0xf7, 0xeb, 0x42, 0xf5, 0x87, 0xde, 0xa5, 0x65, 0xef, 0x43, 0x0c, 0x57,
0x76, 0x09, 0xf4, 0x5f, 0xde, 0x81, 0x0a, 0xd9, 0x59, 0x41, 0xa4, 0x6a,
0xb7, 0x05, 0xc7, 0xa5, 0xfe, 0x49, 0xd5, 0x9b, 0x57, 0x13, 0x14, 0x66,
0xe2, 0xb9, 0xcc, 0x09, 0x35, 0xd4, 0xb0, 0xe0, 0xd1, 0x0d, 0x7e, 0x50,
0x48, 0x45, 0x21, 0x00, 0x67, 0xb2, 0xad, 0xa7, 0x46, 0xe2, 0x6f, 0x70,
0xe5, 0x3c, 0x88, 0x04, 0xaa, 0x21, 0xde, 0x03, 0xb6, 0x6f, 0xfe, 0x43,
0x51, 0xdc, 0x2e, 0x5c, 0x6c, 0x77, 0x8f, 0x8e, 0x9d, 0x1a, 0x5b, 0x35,
0xc5, 0xe4, 0x48, 0x82, 0x17, 0x4b, 0xf0, 0xea, 0xc9, 0x0e, 0xd2, 0x8f,
0xcd, 0xd5, 0x01, 0xbd, 0x7f, 0x0f, 0xf5, 0xae, 0x92, 0x28, 0x1e, 0x2c,
0xf4, 0xe9, 0x03, 0xf7, 0x0a, 0xeb, 0x84, 0x18, 0xa1, 0x37, 0x38, 0x8a,
0x11, 0xa2, 0x5d, 0x8c, 0xf6, 0xe4, 0x3f, 0x5b, 0x87, 0x07, 0x6b, 0xb4,
0x07, 0xe0, 0x8f, 0x30, 0xc4, 0xfa, 0x27, 0xae, 0xfc, 0x02, 0xd1, 0x21,
0x5c, 0xbc, 0x0b, 0x93, 0x6e, 0x7e, 0xf9, 0x6b, 0x80, 0x7a, 0x25, 0x84,
0x20, 0xf1, 0x6a, 0xfa, 0x75, 0xed, 0x57, 0x61, 0x62, 0xa7, 0xf6, 0x5b,
0xe1, 0xb0, 0x38, 0xc8, 0xe9, 0x6d, 0x3f, 0xef, 0x1e, 0x99, 0x0b, 0xb7,
0xc8, 0x9f, 0x76, 0x5c, 0x04, 0x1f, 0x02, 0x92, 0x00, 0xa7, 0x38, 0x3d,
0x00, 0x3b, 0xa7, 0xbc, 0x39, 0x6e, 0xab, 0xf5, 0x10, 0xa8, 0xba, 0xd6,
0x28, 0x6b, 0x0e, 0x00, 0x48, 0xf9, 0x3b, 0x5c, 0xde, 0x59, 0x93, 0x46,
0xd6, 0x61, 0x52, 0x81, 0x71, 0x0f, 0x0e, 0x61, 0xac, 0xc6, 0x7f, 0x15,
0x93, 0xa7, 0xc1, 0x16, 0xb5, 0xef, 0x85, 0xd1, 0xa7, 0x61, 0xc2, 0x85,
0x1d, 0x61, 0xc6, 0xae, 0xb3, 0x9e, 0x8d, 0x23, 0xa3, 0xc8, 0xd5, 0xf2,
0xc7, 0x1b, 0x7e, 0xef, 0xd2, 0xdf, 0x25, 0xaf, 0x4e, 0x81, 0x15, 0x59,
0xe5, 0x36, 0xb1, 0xf1, 0xd5, 0xda, 0x58, 0xd8, 0xd9, 0x0d, 0x6d, 0xc9,
0x25, 0xb5, 0xe8, 0x1d, 0x3b, 0xca, 0x2d, 0xab, 0xf2, 0xe2, 0xe9, 0x55,
0xd7, 0xf4, 0xc7, 0xd0, 0x57, 0x7a, 0x86, 0x15, 0x0a, 0x5a, 0x8b, 0xd7,
0x3f, 0x66, 0x0f, 0x80, 0xb4, 0xe0, 0x5c, 0x33, 0xed, 0xaf, 0x1b, 0x3b,
0x6d, 0x1c, 0xd9, 0x8c, 0xb5, 0x96, 0xa3, 0xfb, 0xcf, 0xcc, 0x97, 0x1c,
0xae, 0x06, 0x19, 0x41, 0x61, 0xf8, 0x97, 0x6b, 0x82, 0x5e, 0x1c, 0xbf,
0x6f, 0x43, 0x3d, 0xe5, 0x00, 0xf5, 0xfe, 0x66, 0x48, 0x26, 0x31, 0xa1,
0x72, 0x67, 0x6e, 0xd4, 0x5b, 0x6f, 0x66, 0xde, 0x70, 0x8b, 0x2b, 0xc3,
0xa2, 0x30, 0xe9, 0x55, 0xc8, 0xff, 0xf8, 0xd0, 0xdd, 0xa9, 0x21, 0x85,
0x6e, 0x6c, 0x82, 0x66, 0xcc, 0x52, 0xf0, 0x9e, 0x1e, 0xb5, 0x3a, 0xff,
0x4c, 0xf3, 0xae, 0x02, 0xc3, 0x4b, 0x76, 0x25, 0xbd, 0xb0, 0x21, 0x54,
0x61, 0xda, 0x16, 0xd3, 0x23, 0x86, 0x41, 0xa1, 0x4c, 0x59, 0x15, 0x95,
0x65, 0x85, 0xb6, 0x8e, 0xa6, 0x37, 0xc0, 0xa2, 0x71, 0x1d, 0x67, 0x44,
0x7b, 0xe5, 0x4c, 0x4f, 0xb6, 0x2c, 0x46, 0xf7, 0x29, 0xa5, 0xf2, 0xd3,
0x51, 0x19, 0x91, 0x4d, 0xa7, 0xb5, 0x05, 0xb9, 0x6e, 0x61, 0x6e, 0xf8,
0xc0, 0x01, 0xe5, 0x41, 0x0a, 0x89, 0x64, 0x77, 0xf2, 0xc8, 0x63, 0x2d,
0x9d, 0x27, 0x7f, 0x47, 0x30, 0x39, 0xdf, 0xb6, 0x6e, 0x4f, 0x00, 0x3f,
0x15, 0xc6, 0xaf, 0x62, 0xdf, 0x3f, 0x47, 0xe8, 0x42, 0x90, 0x77, 0x23,
0x7a, 0xaa, 0x99, 0x53, 0x03, 0x63, 0x60, 0x59, 0x07, 0x52, 0x3c, 0xb5,
0x67, 0x59, 0xfe, 0x08, 0xe6, 0x43, 0x0f, 0x3b, 0x08, 0x7c, 0xc7, 0x07,
0x3c, 0xfa, 0x65, 0xea, 0x69, 0x51, 0x41, 0x31, 0xb3, 0x05, 0x69, 0xba,
0x2c, 0xbf, 0x89, 0x25, 0x9e, 0xfe, 0x07, 0x13, 0x78, 0x0e, 0x16, 0x54,
0xdf, 0x23, 0xdf, 0x10, 0x69, 0x79, 0xd0, 0x33, 0xd7, 0x21, 0x8b, 0xc8,
0x2a, 0xd0, 0x74, 0x0a, 0xfa, 0xb1, 0x6f, 0xa3, 0xcb, 0x1d, 0xca, 0x4f,
0x00, 0x46, 0x6c, 0x42, 0x09, 0xe0, 0x30, 0x89, 0x08, 0x33, 0x9b, 0x7b,
0x7b, 0x0f, 0x69, 0x5c, 0x0d, 0x34, 0x91, 0xfc, 0xfe, 0x22, 0x82, 0x02,
0xcd, 0xfa, 0x97, 0xe8, 0x28, 0x1d, 0xbc, 0x13, 0x0b, 0xfd, 0x47, 0xa1,
0x7e, 0xa2, 0x86, 0x4d, 0x6f, 0x12, 0x51, 0x35, 0x7d, 0x76, 0x8a, 0x58,
0x05, 0xb6, 0x39, 0xa1, 0x2f, 0xd7, 0xda, 0xaf, 0x00, 0xa0, 0x1a, 0x94,
0xd8, 0x23, 0x34, 0x99, 0x5c, 0xaf, 0xcc, 0x15, 0x4b, 0x56, 0xb2, 0xd2,
0x81, 0x07, 0xd3, 0xf3, 0x47, 0xa2, 0x45, 0x93, 0xcb, 0xae, 0xa7, 0x6b,
0x3f, 0xf9, 0xea, 0xfc, 0x0e, 0x64, 0xf2, 0x93, 0x7f, 0x24, 0x22, 0x73,
0x86, 0xc7, 0x2d, 0x75, 0x9b, 0x41, 0x8b, 0xfb, 0x3b, 0x26, 0x2a, 0xe5,
0x0b, 0xd4, 0x00, 0xe3, 0x2c, 0x69, 0x49, 0x62, 0x6c, 0x13, 0x58, 0x6e,
0xac, 0x43, 0xe5, 0x2b, 0x3b, 0x88, 0xdc, 0xd4, 0x41, 0xe8, 0xee, 0x4e,
0xc3, 0x28, 0x91, 0x17, 0x9a, 0x5a, 0xdb, 0x80, 0x8b, 0x4d, 0x64, 0xcc,
0xbe, 0x66, 0xa4, 0x62, 0xfb, 0x13, 0x44, 0x10, 0xd9, 0xe4, 0xd5, 0xa5,
0xae, 0x9e, 0x42, 0x50, 0xfc, 0x78, 0xad, 0xfa, 0xc4, 0xd0, 0x5a, 0x60,
0x9b, 0x45, 0x2b, 0x61, 0x5c, 0x57, 0xb5, 0x92, 0x28, 0xe9, 0xf5, 0x35,
0x67, 0xc1, 0x5e, 0xa8, 0x1c, 0x99, 0x36, 0x38, 0xb8, 0x5c, 0xff, 0x3d,
0xa0, 0xfc, 0xb0, 0xbc, 0x3d, 0x2c, 0xb4, 0x36, 0x17, 0xb4, 0x6d, 0xb5,
0x39, 0x45, 0xa9, 0x2a, 0x6b, 0xa2, 0x24, 0x44, 0x30, 0xab, 0x2c, 0x82,
0x36, 0xdc, 0xd6, 0x36, 0x5d, 0x0a, 0xdc, 0xee, 0x0f, 0x2b, 0x28, 0x99,
0xdc, 0x67, 0x0d, 0xea, 0x6e, 0x42, 0xb9, 0x45, 0x7f, 0xd2, 0x96, 0x1e,
0x60, 0x42, 0xeb, 0x1e, 0x5f, 0x8e, 0xa9, 0xdc, 0xd3, 0x8a, 0xd6, 0xbd,
0x4e, 0x1f, 0x42, 0x75, 0x1d, 0xe2, 0xc6, 0x11, 0xc9, 0x80, 0x1f, 0xfe,
0x99, 0x52, 0x4d, 0x7b, 0x35, 0xf7, 0xb7, 0xc3, 0xee, 0xd6, 0x94, 0xf5,
0x74, 0xa0, 0x69, 0xcd, 0x1f, 0x2b, 0xd0, 0x87, 0xf7, 0x8c, 0x69, 0xc5,
0x96, 0x70, 0x91, 0xe8, 0x3d, 0xd2, 0xcc, 0xf1, 0x4c, 0xcd, 0xe2, 0x14,
0x00, 0x10, 0x4a, 0xd9, 0x6a, 0x5d, 0x65, 0x2c, 0x4b, 0x79, 0x0c, 0xc4,
0x78, 0x5e, 0xc8, 0xc5, 0x37, 0x74, 0x6d, 0x50, 0x5c, 0x34, 0x1f, 0xe0,
0xf4, 0xe3, 0xe1, 0x86, 0x68, 0xb1, 0xea, 0x70, 0xf0, 0xae, 0xe4, 0x59,
0xa1, 0x08, 0x7e, 0x35, 0xa3, 0x16, 0xd2, 0xb0, 0xa3, 0xd4, 0xb0, 0x74,
0x8c, 0x05, 0x79, 0x73, 0xfb, 0xe6, 0x65, 0x96, 0x15, 0x07, 0xd5, 0xaf,
0x88, 0x9e, 0x6b, 0xf0, 0xbb, 0x3f, 0xe6, 0xd1, 0x6a, 0xe7, 0xc9, 0xae,
0xd9, 0xb0, 0x16, 0x1c, 0x40, 0xeb, 0xdb, 0xc1, 0xbf, 0x83, 0xdb, 0x8a,
0x4f, 0x96, 0xca, 0xd7, 0x22, 0x06, 0x87, 0x08, 0x9d, 0x65, 0x2f, 0xd9,
0x8e, 0x95, 0x6c, 0xcc, 0xbf, 0x76, 0x2a, 0xea, 0x5c, 0x8e, 0x5b, 0x17,
0x0f, 0x75, 0x7b, 0xfa, 0xf9, 0xfb, 0xaa, 0x92, 0xc7, 0x7e, 0x63, 0x63,
0x54, 0xa4, 0xff, 0xf6, 0xc0, 0xc0, 0xf5, 0x70, 0xd8, 0xe3, 0xa4, 0x79,
0x16, 0xf0, 0x6f, 0x90, 0x5e, 0xb7, 0xab, 0x6f, 0xab, 0x75, 0x3b, 0xe1,
0x4c, 0xa8, 0x0b, 0x72, 0x5f, 0x5f, 0x11, 0x22, 0x36, 0x71, 0x20, 0xd3,
0x5b, 0x5e, 0x07, 0x06, 0x76, 0x1a, 0xcc, 0x5e, 0x7c, 0x97, 0x7d, 0xb2,
0x6b, 0xf8, 0x39, 0x89, 0x37, 0xb6, 0x6d, 0xea, 0x74, 0x57, 0x28, 0xd7,
0x0e, 0x9b, 0xeb, 0x28, 0x88, 0x90, 0xfd, 0x2d, 0x16, 0x21, 0x74, 0x26,
0xc5, 0xb8, 0x44, 0xad, 0x9f, 0x97, 0xf9, 0x65, 0x36, 0xd8, 0x00, 0x59,
0x17, 0x49, 0xf9, 0xc7, 0xb3, 0x84, 0xb9, 0xe2, 0x95, 0xe0, 0xd1, 0x7f,
0x5f, 0xaa, 0xd7, 0xfd, 0x6a, 0x6a, 0x83, 0x14, 0x46, 0x1d, 0x12, 0x8d,
0x09, 0xc3, 0xa5, 0xca, 0x72, 0xa3, 0x25, 0x65, 0xb6, 0x40, 0x25, 0x04,
0x51, 0xab, 0x22, 0xeb, 0xd7, 0x69, 0xc9, 0x22, 0x9c, 0xa0, 0x19, 0x5c,
0x1a, 0xfd, 0x41, 0x8f, 0x98, 0xc5, 0x71, 0xb8, 0x6f, 0x76, 0xae, 0xfa,
0x9b, 0x03, 0xab, 0x43, 0x81, 0x3b, 0x66, 0xae, 0xf0, 0xd2, 0xb7, 0xee,
0x9a, 0xe3, 0xae, 0x45, 0xc1, 0x86, 0xb0, 0xce, 0x9e, 0x2b, 0xec, 0xb8,
0xcf, 0xca, 0x0e, 0x8c, 0x33, 0xfa, 0xa7, 0xef, 0xf7, 0xfc, 0xa1, 0x41,
0x49, 0xd3, 0x6d, 0xb5, 0x58, 0xe4, 0x0e, 0x24, 0xd2, 0x8a, 0x74, 0xc9,
0x56, 0x2e, 0x53, 0xc7, 0x7a, 0x38, 0x0f, 0x4b, 0xd9, 0xf9, 0x2f, 0xfa,
0x7d, 0xee, 0x14, 0x18, 0xce, 0x75, 0x42, 0x6c, 0x03, 0x34, 0xce, 0x80,
0xec, 0xf2, 0x05, 0xf0, 0xdf, 0xcd, 0xf8, 0xdb, 0x26, 0x7d, 0xb6, 0x3d,
0x28, 0x24, 0x7e, 0x7e, 0x39, 0x9f, 0xa6, 0xc6, 0xeb, 0x2a, 0xc8, 0x17,
0x94, 0xa9, 0x89, 0xf5, 0xdf, 0xcb, 0x77, 0xfd, 0xc9, 0x9e, 0x68, 0x98,
0x7d, 0x04, 0x50, 0x3c, 0x64, 0x1d, 0x66, 0xb0, 0x97, 0x06, 0xb6, 0x08,
0x5b, 0xe4, 0x17, 0x44, 0xd6, 0x94, 0x39, 0x6b, 0x03, 0x2c, 0xcb, 0x5a,
0x8d, 0x86, 0x08, 0x23, 0x4f, 0x95, 0xa8, 0x1a,
};
/*
 * Random message matching the ML-DSA-65 signature and public key test vectors
 * in this file.  Consumed only by the FIPS self-test in mldsa_mod_init(),
 * hence __maybe_unused for !CONFIG_CRYPTO_FIPS builds.
 */
static const u8 fips_test_mldsa65_message[] __initconst __maybe_unused = {
	0x1a, 0x84, 0x21, 0x0d, 0x35, 0x7a, 0x88, 0xc8, 0x6a, 0x11, 0xe3,
	0x15, 0x24, 0xec, 0x0d, 0x2e, 0x76, 0xb9, 0xcf, 0x2b, 0x04, 0x25,
	0x16, 0xae, 0x62, 0x42, 0xa0, 0x20, 0x68, 0x25, 0x3e, 0xb4, 0x75,
	0xa7, 0x1d, 0x64, 0xc3, 0xd1, 0x08, 0x07, 0x67, 0xb6, 0xf7, 0x76,
	0x76, 0xf6, 0xd6, 0x62, 0x66, 0x04, 0x89, 0x0c, 0x8f, 0x07, 0xac,
	0xc8, 0x51, 0x77, 0xd9, 0x47, 0x5e, 0xb5, 0x22, 0x20,
};

View File

@@ -29,7 +29,7 @@ static const struct md5_block_state md5_iv = {
#define F4(x, y, z) (y ^ (x | ~z))
#define MD5STEP(f, w, x, y, z, in, s) \
(w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
(w += f(x, y, z) + in, w = rol32(w, s) + x)
static void md5_block_generic(struct md5_block_state *state,
const u8 data[MD5_BLOCK_SIZE])

682
lib/crypto/mldsa.c Normal file
View File

@@ -0,0 +1,682 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Support for verifying ML-DSA signatures
*
* Copyright 2025 Google LLC
*/
#include <crypto/mldsa.h>
#include <crypto/sha3.h>
#include <kunit/visibility.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include "fips-mldsa.h"
#define Q 8380417 /* The prime q = 2^23 - 2^13 + 1 */
#define QINV_MOD_2_32 58728449 /* Multiplicative inverse of q mod 2^32 */
#define N 256 /* Number of components per ring element */
#define D 13 /* Number of bits dropped from the public key vector t */
#define RHO_LEN 32 /* Length of the public random seed in bytes */
#define MAX_W1_ENCODED_LEN 192 /* Max encoded length of one element of w'_1 */
/*
* The zetas array in Montgomery form, i.e. with extra factor of 2^32.
* Reference: FIPS 204 Section 7.5 "NTT and NTT^-1"
* Generated by the following Python code:
* q=8380417; [a%q - q*(a%q > q//2) for a in [1753**(int(f'{i:08b}'[::-1], 2)) << 32 for i in range(256)]]
*/
static const s32 zetas_times_2_32[N] = {
-4186625, 25847, -2608894, -518909, 237124, -777960, -876248,
466468, 1826347, 2353451, -359251, -2091905, 3119733, -2884855,
3111497, 2680103, 2725464, 1024112, -1079900, 3585928, -549488,
-1119584, 2619752, -2108549, -2118186, -3859737, -1399561, -3277672,
1757237, -19422, 4010497, 280005, 2706023, 95776, 3077325,
3530437, -1661693, -3592148, -2537516, 3915439, -3861115, -3043716,
3574422, -2867647, 3539968, -300467, 2348700, -539299, -1699267,
-1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596,
811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892,
-2797779, -3930395, -1528703, -3677745, -3041255, -1452451, 3475950,
2176455, -1585221, -1257611, 1939314, -4083598, -1000202, -3190144,
-3157330, -3632928, 126922, 3412210, -983419, 2147896, 2715295,
-2967645, -3693493, -411027, -2477047, -671102, -1228525, -22981,
-1308169, -381987, 1349076, 1852771, -1430430, -3343383, 264944,
508951, 3097992, 44288, -1100098, 904516, 3958618, -3724342,
-8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856,
189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589,
1341330, 1285669, -1584928, -812732, -1439742, -3019102, -3881060,
-3628969, 3839961, 2091667, 3407706, 2316500, 3817976, -3342478,
2244091, -2446433, -3562462, 266997, 2434439, -1235728, 3513181,
-3520352, -3759364, -1197226, -3193378, 900702, 1859098, 909542,
819034, 495491, -1613174, -43260, -522500, -655327, -3122442,
2031748, 3207046, -3556995, -525098, -768622, -3595838, 342297,
286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044,
2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353,
1595974, -3767016, 1250494, 2635921, -3548272, -2994039, 1869119,
1903435, -1050970, -1333058, 1237275, -3318210, -1430225, -451100,
1312455, 3306115, -1962642, -1279661, 1917081, -2546312, -1374803,
1500165, 777191, 2235880, 3406031, -542412, -2831860, -1671176,
-1846953, -2584293, -3724270, 594136, -3776993, -2013608, 2432395,
2454455, -164721, 1957272, 3369112, 185531, -1207385, -3183426,
162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107,
-3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735,
472078, -426683, 1723600, -1803090, 1910376, -1667432, -1104333,
-260646, -3833893, -2939036, -2235985, -420899, -2286327, 183443,
-976891, 1612842, -3545687, -554416, 3919660, -48306, -1362209,
3937738, 1400424, -846154, 1976782
};
/* Reference: FIPS 204 Section 4 "Parameter Sets" */
static const struct mldsa_parameter_set {
u8 k; /* num rows in the matrix A */
u8 l; /* num columns in the matrix A */
u8 ctilde_len; /* length of commitment hash ctilde in bytes; lambda/4 */
u8 omega; /* max num of 1's in the hint vector h */
u8 tau; /* num of +-1's in challenge c */
u8 beta; /* tau times eta */
u16 pk_len; /* length of public keys in bytes */
u16 sig_len; /* length of signatures in bytes */
s32 gamma1; /* coefficient range of y */
} mldsa_parameter_sets[] = {
[MLDSA44] = {
.k = 4,
.l = 4,
.ctilde_len = 32,
.omega = 80,
.tau = 39,
.beta = 78,
.pk_len = MLDSA44_PUBLIC_KEY_SIZE,
.sig_len = MLDSA44_SIGNATURE_SIZE,
.gamma1 = 1 << 17,
},
[MLDSA65] = {
.k = 6,
.l = 5,
.ctilde_len = 48,
.omega = 55,
.tau = 49,
.beta = 196,
.pk_len = MLDSA65_PUBLIC_KEY_SIZE,
.sig_len = MLDSA65_SIGNATURE_SIZE,
.gamma1 = 1 << 19,
},
[MLDSA87] = {
.k = 8,
.l = 7,
.ctilde_len = 64,
.omega = 75,
.tau = 60,
.beta = 120,
.pk_len = MLDSA87_PUBLIC_KEY_SIZE,
.sig_len = MLDSA87_SIGNATURE_SIZE,
.gamma1 = 1 << 19,
},
};
/*
* An element of the ring R_q (normal form) or the ring T_q (NTT form). It
* consists of N integers mod q: either the polynomial coefficients of the R_q
* element or the components of the T_q element. In either case, whether they
* are fully reduced to [0, q - 1] varies in the different parts of the code.
*/
struct mldsa_ring_elem {
s32 x[N];
};
struct mldsa_verification_workspace {
/* SHAKE context for computing c, mu, and ctildeprime */
struct shake_ctx shake;
/* The fields in this union are used in their order of declaration. */
union {
/* The hash of the public key */
u8 tr[64];
/* The message representative mu */
u8 mu[64];
/* Temporary space for rej_ntt_poly() */
u8 block[SHAKE128_BLOCK_SIZE + 1];
/* Encoded element of w'_1 */
u8 w1_encoded[MAX_W1_ENCODED_LEN];
/* The commitment hash. Real length is params->ctilde_len */
u8 ctildeprime[64];
};
/* SHAKE context for generating elements of the matrix A */
struct shake_ctx a_shake;
/*
* An element of the matrix A generated from the public seed, or an
* element of the vector t_1 decoded from the public key and pre-scaled
* by 2^d. Both are in NTT form. To reduce memory usage, we generate
* or decode these elements only as needed.
*/
union {
struct mldsa_ring_elem a;
struct mldsa_ring_elem t1_scaled;
};
/* The challenge c, generated from ctilde */
struct mldsa_ring_elem c;
/* A temporary element used during calculations */
struct mldsa_ring_elem tmp;
/* The following fields are variable-length: */
/* The signer's response vector */
struct mldsa_ring_elem z[/* l */];
/* The signer's hint vector */
/* u8 h[k * N]; */
};
/*
* Compute a * b * 2^-32 mod q. a * b must be in the range [-2^31 * q, 2^31 * q
* - 1] before reduction. The return value is in the range [-q + 1, q - 1].
*
* To reduce mod q efficiently, this uses Montgomery reduction with R=2^32.
* That's where the factor of 2^-32 comes from. The caller must include a
* factor of 2^32 at some point to compensate for that.
*
* To keep the input and output ranges very close to symmetric, this
* specifically does a "signed" Montgomery reduction. That is, when computing
* d = c * q^-1 mod 2^32, this chooses a representative in [S32_MIN, S32_MAX]
* rather than [0, U32_MAX], i.e. s32 rather than u32. This matters in the
* wider multiplication d * Q when d keeps its value via sign extension.
*
* Reference: FIPS 204 Appendix A "Montgomery Multiplication". But, it doesn't
* explain it properly: it has an off-by-one error in the upper end of the input
* range, it doesn't clarify that the signed version should be used, and it
* gives an unnecessarily large output range. A better citation is perhaps the
* Dilithium reference code, which functionally matches the below code and
* merely has the (benign) off-by-one error in its documentation.
*/
static inline s32 Zq_mult(s32 a, s32 b)
{
	/*
	 * Signed Montgomery multiplication with R = 2^32: computes
	 * a * b * 2^-32 mod q.  See the comment above this function for the
	 * exact input and output ranges.
	 */
	const s64 prod = (s64)a * b;	/* the unreduced product */
	/*
	 * Signed representative of prod * q^-1 mod 2^32.  The multiply itself
	 * is done on u32, where wraparound is well-defined (signed overflow
	 * would be UB); assigning the result to s32 reinterprets it as the
	 * signed representative in [S32_MIN, S32_MAX].
	 */
	const s32 f = (u32)prod * QINV_MOD_2_32;
	/*
	 * prod - f * q is an exact multiple of 2^32, since modulo 2^32 it
	 * equals prod - prod * (q^-1 * q) = prod - prod = 0.  The arithmetic
	 * right shift by 32 therefore divides it exactly, yielding the
	 * Montgomery-reduced result prod * 2^-32 mod q.
	 */
	return (prod - (s64)f * Q) >> 32;
}
/*
* Convert @w to its number-theoretically-transformed representation in-place.
* Reference: FIPS 204 Algorithm 41, NTT
*
* To prevent intermediate overflows, all input coefficients must have absolute
* value < q. All output components have absolute value < 9*q.
*/
static void ntt(struct mldsa_ring_elem *w)
{
	int m = 0;	/* position in zetas_times_2_32; entry 0 is never read */

	/*
	 * Iterative in-place NTT: the butterfly span halves on each layer,
	 * and the zetas are consumed in increasing index order, matching
	 * FIPS 204 Algorithm 41.  Inputs must satisfy |x| < q; outputs
	 * satisfy |x| < 9*q (see the comment above this function).
	 */
	for (int len = 128; len >= 1; len >>= 1) {
		for (int off = 0; off < 256; off += 2 * len) {
			const s32 zeta = zetas_times_2_32[++m];

			for (int i = off; i < off + len; i++) {
				const s32 prod = Zq_mult(zeta, w->x[i + len]);

				w->x[i + len] = w->x[i] - prod;
				w->x[i] += prod;
			}
		}
	}
}
/*
* Convert @w from its number-theoretically-transformed representation in-place.
* Reference: FIPS 204 Algorithm 42, NTT^-1
*
* This also multiplies the coefficients by 2^32, undoing an extra factor of
* 2^-32 introduced earlier, and reduces the coefficients to [0, q - 1].
*/
static void invntt_and_mul_2_32(struct mldsa_ring_elem *w)
{
	int m = 256;	/* walks backward through zetas_times_2_32 */

	/* Fully reduce all components first to prevent intermediate overflow. */
	for (int i = 0; i < 256; i++)
		w->x[i] %= Q;

	/*
	 * Inverse butterflies: the span doubles on each layer and the
	 * negated zetas are consumed in decreasing index order, matching
	 * FIPS 204 Algorithm 42.
	 */
	for (int len = 1; len < 256; len <<= 1) {
		for (int off = 0; off < 256; off += 2 * len) {
			const s32 zeta = -zetas_times_2_32[--m];

			for (int i = off; i < off + len; i++) {
				const s32 a = w->x[i];

				w->x[i] = a + w->x[i + len];
				w->x[i + len] = Zq_mult(zeta, a - w->x[i + len]);
			}
		}
	}

	/*
	 * Scale by 2^32 * 256^-1: the 2^32 cancels the 2^-32 left over from
	 * earlier Montgomery multiplications, and 256^-1 completes NTT^-1.
	 * Since Zq_mult() itself removes another factor of 2^32, the constant
	 * used is 2^32 * 2^32 * 256^-1 mod q = 41978.  Finally, conditionally
	 * add q to map the result from [-q + 1, q - 1] into [0, q - 1].
	 */
	for (int i = 0; i < 256; i++) {
		w->x[i] = Zq_mult(w->x[i], 41978);
		w->x[i] += (w->x[i] >> 31) & Q;
	}
}
/*
* Decode an element of t_1, i.e. the high d bits of t = A*s_1 + s_2.
* Reference: FIPS 204 Algorithm 23, pkDecode.
* Also multiply it by 2^d and convert it to NTT form.
*/
static const u8 *decode_t1_elem(struct mldsa_ring_elem *out,
				const u8 *t1_encoded)
{
	static_assert(0x3ff << D < Q); /* Decoded coefficients stay < q. */

	/*
	 * Unpack four 10-bit coefficients from each 5-byte group, scaling
	 * each by 2^d as it is decoded.  The u32 read covers the first three
	 * coefficients plus the low 2 bits of the fourth; the fourth's high
	 * 8 bits come from the trailing byte.
	 */
	for (int i = 0; i < N; i += 4, t1_encoded += 5) {
		const u32 bits = get_unaligned_le32(t1_encoded);

		out->x[i + 0] = ((bits >> 0) & 0x3ff) << D;
		out->x[i + 1] = ((bits >> 10) & 0x3ff) << D;
		out->x[i + 2] = ((bits >> 20) & 0x3ff) << D;
		out->x[i + 3] = ((bits >> 30) | (t1_encoded[4] << 2)) << D;
	}
	/* The caller only ever uses t_1 * 2^d in NTT form, so convert now. */
	ntt(out);
	return t1_encoded; /* Points just past the consumed bytes. */
}
/*
* Decode the signer's response vector 'z' from the signature.
* Reference: FIPS 204 Algorithm 27, sigDecode.
*
* This also validates that the coefficients of z are in range, corresponding
* the infinity norm check at the end of Algorithm 8, ML-DSA.Verify_internal.
*
* Finally, this also converts z to NTT form.
*/
static bool decode_z(struct mldsa_ring_elem z[/* l */], int l, s32 gamma1,
		     int beta, const u8 *sig_ptr)
{
	const u8 *sig = *sig_ptr;

	for (int i = 0; i < l; i++) {
		/*
		 * Each coefficient is stored as the unsigned value
		 * gamma1 - z, using bitlen(gamma1) + 1 bits: 18 bits for
		 * ML-DSA-44 (gamma1 = 2^17), 20 bits for the others
		 * (gamma1 = 2^19).
		 */
		if (l == 4) { /* ML-DSA-44? */
			/* 18-bit coefficients: decode 4 from 9 bytes. */
			for (int j = 0; j < N; j += 4, sig += 9) {
				u64 v = get_unaligned_le64(sig);

				z[i].x[j + 0] = (v >> 0) & 0x3ffff;
				z[i].x[j + 1] = (v >> 18) & 0x3ffff;
				z[i].x[j + 2] = (v >> 36) & 0x3ffff;
				/* Top coefficient: 10 bits from v, 8 from sig[8]. */
				z[i].x[j + 3] = (v >> 54) | (sig[8] << 10);
			}
		} else {
			/* 20-bit coefficients: decode 4 from 10 bytes. */
			for (int j = 0; j < N; j += 4, sig += 10) {
				u64 v = get_unaligned_le64(sig);

				z[i].x[j + 0] = (v >> 0) & 0xfffff;
				z[i].x[j + 1] = (v >> 20) & 0xfffff;
				z[i].x[j + 2] = (v >> 40) & 0xfffff;
				/* Top coefficient: 4 bits from v, 16 from sig[8..9]. */
				z[i].x[j + 3] =
					(v >> 60) |
					(get_unaligned_le16(&sig[8]) << 4);
			}
		}
		/*
		 * Map each encoded value back to the signed coefficient
		 * z = gamma1 - encoded, and enforce |z| < gamma1 - beta
		 * (the infinity norm check of ML-DSA.Verify_internal,
		 * done here during decoding).
		 */
		for (int j = 0; j < N; j++) {
			z[i].x[j] = gamma1 - z[i].x[j];
			if (z[i].x[j] <= -(gamma1 - beta) ||
			    z[i].x[j] >= gamma1 - beta)
				return false;
		}
		/* z is only ever used in NTT form, so convert each element. */
		ntt(&z[i]);
	}
	*sig_ptr = sig; /* Return updated pointer. */
	return true;
}
/*
* Decode the signer's hint vector 'h' from the signature.
* Reference: FIPS 204 Algorithm 21, HintBitUnpack
*
* Note that there are several ways in which the hint vector can be malformed.
*/
static bool decode_hint_vector(u8 h[/* k * N */], int k, int omega, const u8 *y)
{
	int pos = 0;	/* number of hint positions consumed from y so far */

	/*
	 * y[0..omega-1] lists the positions of the 1 bits, and
	 * y[omega..omega+k-1] gives the cumulative count of 1 bits through
	 * each of the k vector elements.  Start from an all-zero hint vector
	 * and set the listed positions, rejecting any malformed encoding.
	 */
	memset(h, 0, k * N);
	for (int i = 0; i < k; i++) {
		const int cum = y[omega + i];
		int last = -1;

		/* Cumulative counts must be nondecreasing and at most omega. */
		if (cum < pos || cum > omega)
			return false;
		while (pos < cum) {
			/* Positions within one element must strictly increase. */
			if (y[pos] <= last)
				return false;
			last = y[pos];
			h[i * N + last] = 1;
			pos++;
		}
	}
	/* Unused position slots must be zero padding. */
	return mem_is_zero(&y[pos], omega - pos);
}
/*
* Expand @seed into an element of R_q @c with coefficients in {-1, 0, 1},
* exactly @tau of them nonzero. Reference: FIPS 204 Algorithm 29, SampleInBall
*/
static void sample_in_ball(struct mldsa_ring_elem *c, const u8 *seed,
			   size_t seed_len, int tau, struct shake_ctx *shake)
{
	u64 signs;
	u8 j;

	/* All randomness is derived from SHAKE256(seed). */
	shake256_init(shake);
	shake_update(shake, seed, seed_len);
	/*
	 * The first 8 squeezed bytes provide the signs of the tau nonzero
	 * coefficients, one bit per coefficient, consumed starting from the
	 * least significant bit.
	 */
	shake_squeeze(shake, (u8 *)&signs, sizeof(signs));
	le64_to_cpus(&signs);
	*c = (struct mldsa_ring_elem){};
	/*
	 * Fisher-Yates-style insertion: for each i, rejection-sample a
	 * position j in [0, i], move the value currently at j up to i, and
	 * place a fresh +1 or -1 at j.  After the loop, exactly tau
	 * coefficients are nonzero.
	 */
	for (int i = N - tau; i < N; i++, signs >>= 1) {
		do {
			/* Rejection-sample j uniformly from [0, i]. */
			shake_squeeze(shake, &j, 1);
		} while (j > i);
		c->x[i] = c->x[j];
		c->x[j] = 1 - 2 * (s32)(signs & 1); /* +1 if bit 0, else -1 */
	}
}
/*
* Expand the public seed @rho and @row_and_column into an element of T_q @out.
* Reference: FIPS 204 Algorithm 30, RejNTTPoly
*
* @shake and @block are temporary space used by the expansion. @block has
* space for one SHAKE128 block, plus an extra byte to allow reading a u32 from
* the final 3-byte group without reading out-of-bounds.
*/
static void rej_ntt_poly(struct mldsa_ring_elem *out, const u8 rho[RHO_LEN],
			 __le16 row_and_column, struct shake_ctx *shake,
			 u8 block[SHAKE128_BLOCK_SIZE + 1])
{
	/* The SHAKE128 input is the seed rho followed by the 2-byte index. */
	shake128_init(shake);
	shake_update(shake, rho, RHO_LEN);
	shake_update(shake, (u8 *)&row_and_column, sizeof(row_and_column));
	/* Squeeze whole blocks until all N components have been accepted. */
	for (int i = 0; i < N;) {
		shake_squeeze(shake, block, SHAKE128_BLOCK_SIZE);
		block[SHAKE128_BLOCK_SIZE] = 0; /* for KMSAN */
		static_assert(SHAKE128_BLOCK_SIZE % 3 == 0);
		/*
		 * Each 3-byte group yields one 23-bit candidate.  The u32
		 * read overlaps the following group (or the pad byte at the
		 * very end of the block), hence the mask to 23 bits.
		 */
		for (int j = 0; j < SHAKE128_BLOCK_SIZE && i < N; j += 3) {
			u32 x = get_unaligned_le32(&block[j]) & 0x7fffff;

			if (x < Q) /* Ignore values >= q. */
				out->x[i++] = x;
		}
	}
}
/*
* Return the HighBits of r adjusted according to hint h
* Reference: FIPS 204 Algorithm 40, UseHint
*
* This is needed because of the public key compression in ML-DSA.
*
* h is either 0 or 1, r is in [0, q - 1], and gamma2 is either (q - 1) / 88 or
* (q - 1) / 32. Except when invoked via the unit test interface, gamma2 is a
* compile-time constant, so compilers will optimize the code accordingly.
*/
static __always_inline s32 use_hint(u8 h, s32 r, const s32 gamma2)
{
	const s32 m = (Q - 1) / (2 * gamma2); /* 44 or 16, compile-time const */
	s32 r1;

	/*
	 * Handle the special case where r - (r mod+- (2 * gamma2)) == q - 1,
	 * i.e. r >= q - gamma2.  This is also exactly where the computation of
	 * r1 below would produce 'm' and would need a correction.
	 */
	if (r >= Q - gamma2)
		return h == 0 ? 0 : m - 1;
	/*
	 * Compute the (non-hint-adjusted) HighBits r1 as:
	 *
	 *	r1 = (r - (r mod+- (2 * gamma2))) / (2 * gamma2)
	 *	   = floor((r + gamma2 - 1) / (2 * gamma2))
	 *
	 * The dividend is nonnegative here (r is in [0, q - 1]), so the cast
	 * to u32 is value-preserving; it makes the division unsigned.  Note
	 * that when '2 * gamma2' is a compile-time constant, compilers
	 * optimize the division to a reciprocal multiplication and shift.
	 */
	r1 = (u32)(r + gamma2 - 1) / (2 * gamma2);
	/*
	 * Return the HighBits r1:
	 *	+ 0 if the hint is 0;
	 *	+ 1 (mod m) if the hint is 1 and the LowBits are positive;
	 *	- 1 (mod m) if the hint is 1 and the LowBits are negative or 0.
	 *
	 * r1 is in (and remains in) [0, m - 1].  Note that when 'm' is a
	 * compile-time constant, compilers optimize the '% m' accordingly.
	 */
	if (h == 0)
		return r1;
	/* 'r > r1 * (2 * gamma2)' distinguishes positive LowBits. */
	if (r > r1 * (2 * gamma2))
		return (u32)(r1 + 1) % m;
	return (u32)(r1 + m - 1) % m;
}
/* Apply use_hint() componentwise to one ring element, in place. */
static __always_inline void use_hint_elem(struct mldsa_ring_elem *w,
					  const u8 h[N], const s32 gamma2)
{
	for (int i = 0; i < N; i++)
		w->x[i] = use_hint(h[i], w->x[i], gamma2);
}
#if IS_ENABLED(CONFIG_CRYPTO_LIB_MLDSA_KUNIT_TEST)
/*
 * Allow the __always_inline function use_hint() to be unit-tested.  This
 * out-of-line wrapper is compiled only when the KUnit test is enabled and is
 * not part of the public ML-DSA API.
 */
s32 mldsa_use_hint(u8 h, s32 r, s32 gamma2)
{
	return use_hint(h, r, gamma2);
}
EXPORT_SYMBOL_IF_KUNIT(mldsa_use_hint);
#endif
/*
* Encode one element of the commitment vector w'_1 into a byte string.
* Reference: FIPS 204 Algorithm 28, w1Encode.
* Return the number of bytes used: 192 for ML-DSA-44 and 128 for the others.
*/
static size_t encode_w1(u8 out[MAX_W1_ENCODED_LEN],
			const struct mldsa_ring_elem *w1, int k)
{
	u8 *p = out;	/* write cursor into the output buffer */

	static_assert(N * 6 / 8 == MAX_W1_ENCODED_LEN);
	if (k == 4) { /* ML-DSA-44? */
		/* 6 bits per coefficient: pack 4 coefficients into 3 bytes. */
		for (int i = 0; i < N; i += 4) {
			const u32 packed = (w1->x[i + 0] << 0) |
					   (w1->x[i + 1] << 6) |
					   (w1->x[i + 2] << 12) |
					   (w1->x[i + 3] << 18);

			*p++ = packed;
			*p++ = packed >> 8;
			*p++ = packed >> 16;
		}
	} else {
		/* 4 bits per coefficient: pack 2 coefficients per byte. */
		for (int i = 0; i < N; i += 2)
			*p++ = w1->x[i] | (w1->x[i + 1] << 4);
	}
	return p - out;
}
/*
 * Verify the ML-DSA signature @sig over message @msg with public key @pk.
 * This does "pure" ML-DSA with an empty context string.
 *
 * Return: 0 if the signature is valid; -EBADMSG if a length or an encoding
 * in the signature is invalid; -EKEYREJECTED if everything is well-formed
 * but the signature does not verify; -ENOMEM on allocation failure.
 */
int mldsa_verify(enum mldsa_alg alg, const u8 *sig, size_t sig_len,
		 const u8 *msg, size_t msg_len, const u8 *pk, size_t pk_len)
{
	const struct mldsa_parameter_set *params = &mldsa_parameter_sets[alg];
	const int k = params->k, l = params->l;
	/* For now this just does pure ML-DSA with an empty context string. */
	static const u8 msg_prefix[2] = { /* dom_sep= */ 0, /* ctx_len= */ 0 };
	const u8 *ctilde; /* The signer's commitment hash */
	const u8 *t1_encoded = &pk[RHO_LEN]; /* Next encoded element of t_1 */
	u8 *h; /* The signer's hint vector, length k * N */
	size_t w1_enc_len;

	/* Validate the public key and signature lengths. */
	if (pk_len != params->pk_len || sig_len != params->sig_len)
		return -EBADMSG;

	/*
	 * Allocate the workspace, including variable-length fields.  Its size
	 * depends only on the ML-DSA parameter set, not the other inputs.
	 *
	 * For freeing it, use kfree_sensitive() rather than kfree().  This is
	 * mainly to comply with FIPS 204 Section 3.6.3 "Intermediate Values".
	 * In reality it's a bit gratuitous, as this is a public key operation.
	 */
	struct mldsa_verification_workspace *ws __free(kfree_sensitive) =
		kmalloc(sizeof(*ws) + (l * sizeof(ws->z[0])) + (k * N),
			GFP_KERNEL);
	if (!ws)
		return -ENOMEM;
	/* The hint vector h lives directly after the l elements of z. */
	h = (u8 *)&ws->z[l];

	/* Decode the signature.  Reference: FIPS 204 Algorithm 27, sigDecode */
	ctilde = sig;
	sig += params->ctilde_len;
	if (!decode_z(ws->z, l, params->gamma1, params->beta, &sig))
		return -EBADMSG;
	if (!decode_hint_vector(h, k, params->omega, sig))
		return -EBADMSG;

	/* Recreate the challenge c from the signer's commitment hash. */
	sample_in_ball(&ws->c, ctilde, params->ctilde_len, params->tau,
		       &ws->shake);
	ntt(&ws->c);

	/* Compute the message representative mu = H(H(pk) || M'). */
	shake256(pk, pk_len, ws->tr, sizeof(ws->tr));
	shake256_init(&ws->shake);
	shake_update(&ws->shake, ws->tr, sizeof(ws->tr));
	shake_update(&ws->shake, msg_prefix, sizeof(msg_prefix));
	shake_update(&ws->shake, msg, msg_len);
	shake_squeeze(&ws->shake, ws->mu, sizeof(ws->mu));

	/* Start computing ctildeprime = H(mu || w1Encode(w'_1)). */
	shake256_init(&ws->shake);
	shake_update(&ws->shake, ws->mu, sizeof(ws->mu));

	/*
	 * Compute the commitment w'_1 from A, z, c, t_1, and h.
	 *
	 * The computation is the same for each of the k rows.  Just do each
	 * row before moving on to the next, resulting in only one loop over k.
	 */
	for (int i = 0; i < k; i++) {
		/*
		 * tmp = NTT(A) * NTT(z) * 2^-32
		 * To reduce memory use, generate each element of NTT(A)
		 * on-demand.  Note that each element is used only once.
		 */
		ws->tmp = (struct mldsa_ring_elem){};
		for (int j = 0; j < l; j++) {
			rej_ntt_poly(&ws->a, pk /* rho is first field of pk */,
				     cpu_to_le16((i << 8) | j), &ws->a_shake,
				     ws->block);
			for (int n = 0; n < N; n++)
				ws->tmp.x[n] +=
					Zq_mult(ws->a.x[n], ws->z[j].x[n]);
		}
		/* All components of tmp now have abs value < l*q. */

		/* Decode the next element of t_1 (pre-scaled, NTT form). */
		t1_encoded = decode_t1_elem(&ws->t1_scaled, t1_encoded);

		/*
		 * tmp -= NTT(c) * NTT(t_1 * 2^d) * 2^-32
		 *
		 * Taking a conservative bound for the output of ntt(), the
		 * multiplicands can have absolute value up to 9*q.  That
		 * corresponds to a product with absolute value 81*q^2.  That
		 * is within the limits of Zq_mult() which needs < ~256*q^2.
		 */
		for (int j = 0; j < N; j++)
			ws->tmp.x[j] -= Zq_mult(ws->c.x[j], ws->t1_scaled.x[j]);
		/* All components of tmp now have abs value < (l+1)*q. */

		/* tmp = w'_Approx = NTT^-1(tmp) * 2^32 */
		invntt_and_mul_2_32(&ws->tmp);
		/* All coefficients of tmp are now in [0, q - 1]. */

		/*
		 * tmp = w'_1 = UseHint(h, w'_Approx)
		 * For efficiency, set gamma2 to a compile-time constant.
		 */
		if (k == 4)
			use_hint_elem(&ws->tmp, &h[i * N], (Q - 1) / 88);
		else
			use_hint_elem(&ws->tmp, &h[i * N], (Q - 1) / 32);

		/* Encode and hash the next element of w'_1. */
		w1_enc_len = encode_w1(ws->w1_encoded, &ws->tmp, k);
		shake_update(&ws->shake, ws->w1_encoded, w1_enc_len);
	}

	/* Finish computing ctildeprime. */
	shake_squeeze(&ws->shake, ws->ctildeprime, params->ctilde_len);

	/*
	 * Verify that ctilde == ctildeprime.  A non-constant-time comparison
	 * is acceptable here since all the compared values are public.
	 */
	if (memcmp(ws->ctildeprime, ctilde, params->ctilde_len) != 0)
		return -EKEYREJECTED;
	/* ||z||_infinity < gamma1 - beta was already checked in decode_z(). */
	return 0;
}
EXPORT_SYMBOL_GPL(mldsa_verify);
#ifdef CONFIG_CRYPTO_FIPS
static int __init mldsa_mod_init(void)
{
	/* The self-test runs (and can panic) only when fips_enabled is set. */
	if (fips_enabled) {
		/*
		 * FIPS cryptographic algorithm self-test.  As per the FIPS
		 * Implementation Guidance, testing any ML-DSA parameter set
		 * satisfies the test requirement for all of them, and only a
		 * positive test is required.
		 */
		int err = mldsa_verify(MLDSA65, fips_test_mldsa65_signature,
				       sizeof(fips_test_mldsa65_signature),
				       fips_test_mldsa65_message,
				       sizeof(fips_test_mldsa65_message),
				       fips_test_mldsa65_public_key,
				       sizeof(fips_test_mldsa65_public_key));

		if (err)
			panic("mldsa: FIPS self-test failed; err=%pe\n",
			      ERR_PTR(err));
	}
	return 0;
}
subsys_initcall(mldsa_mod_init);

/* Empty; exists so the module (if built as one) can be unloaded. */
static void __exit mldsa_mod_exit(void)
{
}
module_exit(mldsa_mod_exit);
#endif /* CONFIG_CRYPTO_FIPS */
MODULE_DESCRIPTION("ML-DSA signature verification");
MODULE_LICENSE("GPL");

82
lib/crypto/nh.c Normal file
View File

@@ -0,0 +1,82 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2018 Google LLC
*/
/*
* Implementation of the NH almost-universal hash function, specifically the
* variant of NH used in Adiantum. This is *not* a cryptographic hash function.
*
* Reference: section 6.3 of "Adiantum: length-preserving encryption for
* entry-level processors" (https://eprint.iacr.org/2018/720.pdf).
*/
#include <crypto/nh.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/unaligned.h>
#ifdef CONFIG_CRYPTO_LIB_NH_ARCH
#include "nh.h" /* $(SRCARCH)/nh.h */
#else
static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES])
{
return false;
}
#endif
/*
 * nh() - Compute the 4-pass NH hash of @message under @key.
 *
 * Dispatches to the architecture-optimized implementation when one is
 * available and claims the input; otherwise runs the generic code below.
 * The four 64-bit pass results are stored little-endian into @hash.
 * NOTE(review): the generic loop consumes NH_MESSAGE_UNIT bytes per step,
 * so @message_len is presumably a multiple of NH_MESSAGE_UNIT — callers
 * must guarantee that.
 */
void nh(const u32 *key, const u8 *message, size_t message_len,
	__le64 hash[NH_NUM_PASSES])
{
	u64 sums[4] = {};

	if (nh_arch(key, message, message_len, hash))
		return;

	static_assert(NH_PAIR_STRIDE == 2);
	static_assert(NH_NUM_PASSES == 4);
	for (; message_len; message_len -= NH_MESSAGE_UNIT) {
		const u32 m0 = get_unaligned_le32(message + 0);
		const u32 m1 = get_unaligned_le32(message + 4);
		const u32 m2 = get_unaligned_le32(message + 8);
		const u32 m3 = get_unaligned_le32(message + 12);

		/* Each pass p uses the key words key[4p .. 4p+3]. */
		for (int p = 0; p < NH_NUM_PASSES; p++) {
			const u32 *kp = &key[4 * p];

			sums[p] += (u64)(u32)(m0 + kp[0]) * (u32)(m2 + kp[2]);
			sums[p] += (u64)(u32)(m1 + kp[1]) * (u32)(m3 + kp[3]);
		}
		key += NH_MESSAGE_UNIT / sizeof(key[0]);
		message += NH_MESSAGE_UNIT;
	}
	for (int p = 0; p < NH_NUM_PASSES; p++)
		hash[p] = cpu_to_le64(sums[p]);
}
EXPORT_SYMBOL_GPL(nh);
#ifdef nh_mod_init_arch
static int __init nh_mod_init(void)
{
nh_mod_init_arch();
return 0;
}
subsys_initcall(nh_mod_init);
static void __exit nh_mod_exit(void)
{
}
module_exit(nh_mod_exit);
#endif
MODULE_DESCRIPTION("NH almost-universal hash function");
MODULE_LICENSE("GPL");

2
lib/crypto/powerpc/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
aesp8-ppc.S

View File

@@ -0,0 +1,346 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Fast AES implementation for SPE instruction set (PPC)
*
* This code makes use of the SPE SIMD instruction set as defined in
* http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
* Implementation is based on optimization guide notes from
* http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"
#define EAD(in, bpos) \
rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
#define DAD(in, bpos) \
rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
#define LWH(out, off) \
evlwwsplat out,off(rT0); /* load word high */
#define LWL(out, off) \
lwz out,off(rT0); /* load word low */
#define LBZ(out, tab, off) \
lbz out,off(tab); /* load byte */
#define LAH(out, in, bpos, off) \
EAD(in, bpos) /* calc addr + load word high */ \
LWH(out, off)
#define LAL(out, in, bpos, off) \
EAD(in, bpos) /* calc addr + load word low */ \
LWL(out, off)
#define LAE(out, in, bpos) \
EAD(in, bpos) /* calc addr + load enc byte */ \
LBZ(out, rT0, 8)
#define LBE(out) \
LBZ(out, rT0, 8) /* load enc byte */
#define LAD(out, in, bpos) \
DAD(in, bpos) /* calc addr + load dec byte */ \
LBZ(out, rT1, 0)
#define LBD(out) \
LBZ(out, rT1, 0)
/*
* ppc_encrypt_block: The central encryption function for a single 16 bytes
* block. It does no stack handling or register saving to support fast calls
* via bl/blr. It expects that caller has pre-xored input data with first
* 4 words of encryption key into rD0-rD3. Pointer/counter registers must
* have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
* and rW0-rW3 and caller must execute a final xor on the output registers.
* All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
*
*/
_GLOBAL(ppc_encrypt_block)
	/*
	 * Issue the first table lookups ahead of entering the loop; the code
	 * interleaves address generation (EAD) with loads to hide latency.
	 */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/*
	 * Two unrolled rounds per iteration; the iteration count is in CTR
	 * (set up by the caller) and consumed by bdnz below.
	 * First round: table lookups for all state bytes, then xor with the
	 * round keys loaded from 16(rKP) and 24(rKP).
	 */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw rD1,16(rKP)
	EAD(rD3, 3)
	evxor rW2,rW2,rW4
	LWL(rW7, 0)
	evxor rW2,rW2,rW6
	EAD(rD2, 0)
	evxor rD1,rD1,rW2
	LWL(rW1, 12)
	evxor rD1,rD1,rW0
	evldw rD3,24(rKP)
	evmergehi rD0,rD0,rD1
	EAD(rD1, 2)
	evxor rW3,rW3,rW5
	LWH(rW4, 4)
	evxor rW3,rW3,rW7
	EAD(rD0, 3)
	evxor rD3,rD3,rW3
	LWH(rW6, 0)
	evxor rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi rD2,rD2,rD3
	LWH(rW3, 8)
	/* Second round: same pattern, round keys at 32(rKP) and 40(rKP). */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw rD1,32(rKP)
	EAD(rD3, 3)
	evxor rW2,rW2,rW4
	LWL(rW7, 0)
	evxor rW2,rW2,rW6
	EAD(rD2, 0)
	evxor rD1,rD1,rW2
	LWL(rW1, 12)
	evxor rD1,rD1,rW0
	evldw rD3,40(rKP)
	evmergehi rD0,rD0,rD1
	EAD(rD1, 2)
	evxor rW3,rW3,rW5
	LWH(rW4, 4)
	evxor rW3,rW3,rW7
	EAD(rD0, 3)
	evxor rD3,rD3,rW3
	LWH(rW6, 0)
	evxor rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi rD2,rD2,rD3
	LWH(rW3, 8)
	addi rKP,rKP,32		/* advance key pointer past 2 round keys */
	bdnz ppc_encrypt_block_loop
	/* Penultimate round, round keys at 16(rKP) and 24(rKP). */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw rD1,16(rKP)
	EAD(rD3, 3)
	evxor rW2,rW2,rW4
	LWL(rW7, 0)
	evxor rW2,rW2,rW6
	EAD(rD2, 0)
	evxor rD1,rD1,rW2
	LWL(rW1, 12)
	evxor rD1,rD1,rW0
	evldw rD3,24(rKP)
	evmergehi rD0,rD0,rD1
	EAD(rD1, 0)
	evxor rW3,rW3,rW5
	LBE(rW2)
	evxor rW3,rW3,rW7
	EAD(rD0, 1)
	evxor rD3,rD3,rW3
	LBE(rW6)
	evxor rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi rD2,rD2,rD3
	LBE(rW1)
	/*
	 * Final round: single-byte S-box lookups (LAE/LBE) instead of full
	 * table words, with the bytes recombined into words via rlwimi.
	 */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi rW0,rW4,8,16,23
	rlwimi rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi rW2,rW6,8,16,23
	rlwimi rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi rW0,rW4,16,8,15
	rlwimi rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi rW2,rW6,16,8,15
	/* Load the last round key words for the caller's final xor. */
	lwz rD0,32(rKP)
	rlwimi rW3,rW7,16,8,15
	lwz rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi rW0,rW4,24,0,7
	lwz rD2,40(rKP)
	rlwimi rW1,rW5,24,0,7
	lwz rD3,44(rKP)
	rlwimi rW2,rW6,24,0,7
	rlwimi rW3,rW7,24,0,7
	blr
/*
* ppc_decrypt_block: The central decryption function for a single 16 bytes
* block. It does no stack handling or register saving to support fast calls
* via bl/blr. It expects that caller has pre-xored input data with first
* 4 words of encryption key into rD0-rD3. Pointer/counter registers must
* have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
* and rW0-rW3 and caller must execute a final xor on the output registers.
* All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
*
*/
_GLOBAL(ppc_decrypt_block)
	/*
	 * Mirror image of ppc_encrypt_block: same pre-xored input contract
	 * and two-rounds-per-iteration structure, but using the decryption
	 * table address macros (DAD/LAD/LBD index via rT1) at the end.
	 */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/*
	 * First of two unrolled rounds; loop count is in CTR (set by the
	 * caller) and consumed by bdnz below.  Round keys at 16/24(rKP).
	 */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw rD1,16(rKP)
	EAD(rD0, 0)
	evxor rW4,rW4,rW6
	LWL(rW1, 12)
	evxor rW0,rW0,rW4
	EAD(rD2, 2)
	evxor rW0,rW0,rW2
	LWL(rW5, 4)
	evxor rD1,rD1,rW0
	evldw rD3,24(rKP)
	evmergehi rD0,rD0,rD1
	EAD(rD1, 0)
	evxor rW3,rW3,rW7
	LWH(rW0, 12)
	evxor rW3,rW3,rW1
	EAD(rD0, 3)
	evxor rD3,rD3,rW3
	LWH(rW6, 0)
	evxor rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi rD2,rD2,rD3
	LWH(rW3, 8)
	/* Second round: same pattern, round keys at 32(rKP) and 40(rKP). */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw rD1,32(rKP)
	EAD(rD0, 0)
	evxor rW4,rW4,rW6
	LWL(rW1, 12)
	evxor rW0,rW0,rW4
	EAD(rD2, 2)
	evxor rW0,rW0,rW2
	LWL(rW5, 4)
	evxor rD1,rD1,rW0
	evldw rD3,40(rKP)
	evmergehi rD0,rD0,rD1
	EAD(rD1, 0)
	evxor rW3,rW3,rW7
	LWH(rW0, 12)
	evxor rW3,rW3,rW1
	EAD(rD0, 3)
	evxor rD3,rD3,rW3
	LWH(rW6, 0)
	evxor rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi rD2,rD2,rD3
	LWH(rW3, 8)
	addi rKP,rKP,32		/* advance key pointer past 2 round keys */
	bdnz ppc_decrypt_block_loop
	/* Penultimate round, round keys at 16(rKP) and 24(rKP). */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw rD1,16(rKP)
	EAD(rD0, 0)
	evxor rW4,rW4,rW6
	LWL(rW1, 12)
	evxor rW0,rW0,rW4
	EAD(rD2, 2)
	evxor rW0,rW0,rW2
	LWL(rW5, 4)
	evxor rD1,rD1,rW0
	evldw rD3,24(rKP)
	evmergehi rD0,rD0,rD1
	DAD(rD1, 0)
	evxor rW3,rW3,rW7
	LBD(rW0)
	evxor rW3,rW3,rW1
	DAD(rD0, 1)
	evxor rD3,rD3,rW3
	LBD(rW6)
	evxor rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi rD2,rD2,rD3
	LBD(rW3)
	/*
	 * Final round: single-byte lookups through the decryption table
	 * (LAD/LBD), recombining the bytes into words via rlwimi.
	 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi rW0,rW4,8,16,23
	rlwimi rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi rW2,rW6,8,16,23
	rlwimi rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi rW0,rW4,16,8,15
	rlwimi rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi rW2,rW6,16,8,15
	/* Load the last round key words for the caller's final xor. */
	lwz rD0,32(rKP)
	rlwimi rW3,rW7,16,8,15
	lwz rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi rW0,rW4,24,0,7
	lwz rD2,40(rKP)
	rlwimi rW1,rW5,24,0,7
	lwz rD3,44(rKP)
	rlwimi rW2,rW6,24,0,7
	rlwimi rW3,rW7,24,0,7
	blr

View File

@@ -0,0 +1,278 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Key handling functions for PPC AES implementation
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
#include <asm/ppc_asm.h>
/*
 * LOAD_KEY: load one 32 bit word of key material in big-endian byte
 * order. On little-endian builds the bytes are reversed with lwbrx;
 * r0 carries the offset because lwbrx is an indexed-only load.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_KEY(d, s, off) \
lwz d,off(s);
#else
#define LOAD_KEY(d, s, off) \
li r0,off; \
lwbrx d,s,r0;
#endif
/* Create a small stack frame and save the non-volatile GPRs we use. */
#define INITIALIZE_KEY \
stwu r1,-32(r1); /* create stack frame */ \
stw r14,8(r1); /* save registers */ \
stw r15,12(r1); \
stw r16,16(r1);
/*
 * Restore the saved registers, scrub every GPR that held key material
 * and tear the stack frame down again.
 */
#define FINALIZE_KEY \
lwz r14,8(r1); /* restore registers */ \
lwz r15,12(r1); \
lwz r16,16(r1); \
xor r5,r5,r5; /* clear sensitive data */ \
xor r6,r6,r6; \
xor r7,r7,r7; \
xor r8,r8,r8; \
xor r9,r9,r9; \
xor r10,r10,r10; \
xor r11,r11,r11; \
xor r12,r12,r12; \
addi r1,r1,32; /* cleanup stack */
/*
 * LS_BOX: apply the AES S-box to each of the four bytes of r in place.
 * Single-byte S-box values are read at offset 8 within each row of
 * PPC_AES_4K_ENCTAB (loaded into t2); t1 and t2 are clobbered.
 */
#define LS_BOX(r, t1, t2) \
lis t2,PPC_AES_4K_ENCTAB@h; \
ori t2,t2,PPC_AES_4K_ENCTAB@l; \
rlwimi t2,r,4,20,27; \
lbz t1,8(t2); \
rlwimi r,t1,0,24,31; \
rlwimi t2,r,28,20,27; \
lbz t1,8(t2); \
rlwimi r,t1,8,16,23; \
rlwimi t2,r,20,20,27; \
lbz t1,8(t2); \
rlwimi r,t1,16,8,15; \
rlwimi t2,r,12,20,27; \
lbz t1,8(t2); \
rlwimi r,t1,24,0,7;
/*
 * GF8_MUL: multiply all four bytes of 'in' by 2 in GF(2^8) (the AES
 * "xtime" step) in parallel: bytes with the top bit set get the 0x1b
 * reduction polynomial xored in after the shift. t1/t2 are scratch.
 */
#define GF8_MUL(out, in, t1, t2) \
lis t1,0x8080; /* multiplication in GF8 */ \
ori t1,t1,0x8080; \
and t1,t1,in; \
srwi t1,t1,7; \
mulli t1,t1,0x1b; \
lis t2,0x7f7f; \
ori t2,t2,0x7f7f; \
and t2,t2,in; \
slwi t2,t2,1; \
xor out,t1,t2;
/*
 * ppc_expand_key_128(u32 *key_enc, const u8 *key)
 *
 * Expand 128 bit key into 176 bytes encryption key. It consists of
 * key itself plus 10 rounds with 16 bytes each
 *
 * r3 = key_enc (destination schedule), r4 = key (raw key bytes)
 */
_GLOBAL(ppc_expand_key_128)
INITIALIZE_KEY
LOAD_KEY(r5,r4,0)
LOAD_KEY(r6,r4,4)
LOAD_KEY(r7,r4,8)
LOAD_KEY(r8,r4,12)
stw r5,0(r3) /* key[0..3] = input data */
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
li r16,10 /* 10 expansion rounds */
lis r0,0x0100 /* RCO(1) */
/* per round: rotate + S-box the last word, add RCO, then chain xors */
ppc_expand_128_loop:
addi r3,r3,16
mr r14,r8 /* apply LS_BOX to 4th temp */
rotlwi r14,r14,8
LS_BOX(r14, r15, r4)
xor r14,r14,r0
xor r5,r5,r14 /* xor next 4 keys */
xor r6,r6,r5
xor r7,r7,r6
xor r8,r8,r7
stw r5,0(r3) /* store next 4 keys */
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
GF8_MUL(r0, r0, r4, r14) /* multiply RCO by 2 in GF */
subi r16,r16,1
cmpwi r16,0
bt eq,ppc_expand_128_end
b ppc_expand_128_loop
ppc_expand_128_end:
FINALIZE_KEY
blr
/*
 * ppc_expand_key_192(u32 *key_enc, const u8 *key)
 *
 * Expand 192 bit key into 208 bytes encryption key. It consists of key
 * itself plus 12 rounds with 16 bytes each
 *
 * r3 = key_enc (destination schedule), r4 = key (raw key bytes)
 */
_GLOBAL(ppc_expand_key_192)
INITIALIZE_KEY
LOAD_KEY(r5,r4,0)
LOAD_KEY(r6,r4,4)
LOAD_KEY(r7,r4,8)
LOAD_KEY(r8,r4,12)
LOAD_KEY(r9,r4,16)
LOAD_KEY(r10,r4,20)
stw r5,0(r3)
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
stw r9,16(r3)
stw r10,20(r3)
li r16,8 /* 8 expansion rounds */
lis r0,0x0100 /* RCO(1) */
/* per round: rotate + S-box the 6th word, add RCO, then chain xors */
ppc_expand_192_loop:
addi r3,r3,24
mr r14,r10 /* apply LS_BOX to 6th temp */
rotlwi r14,r14,8
LS_BOX(r14, r15, r4)
xor r14,r14,r0
xor r5,r5,r14 /* xor next 6 keys */
xor r6,r6,r5
xor r7,r7,r6
xor r8,r8,r7
xor r9,r9,r8
xor r10,r10,r9
stw r5,0(r3)
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
subi r16,r16,1
cmpwi r16,0 /* last round early kick out */
bt eq,ppc_expand_192_end
stw r9,16(r3)
stw r10,20(r3)
GF8_MUL(r0, r0, r4, r14) /* multiply RCO GF8 */
b ppc_expand_192_loop
ppc_expand_192_end:
FINALIZE_KEY
blr
/*
 * ppc_expand_key_256(u32 *key_enc, const u8 *key)
 *
 * Expand 256 bit key into 240 bytes encryption key. It consists of key
 * itself plus 14 rounds with 16 bytes each
 *
 * r3 = key_enc (destination schedule), r4 = key (raw key bytes)
 */
_GLOBAL(ppc_expand_key_256)
INITIALIZE_KEY
LOAD_KEY(r5,r4,0)
LOAD_KEY(r6,r4,4)
LOAD_KEY(r7,r4,8)
LOAD_KEY(r8,r4,12)
LOAD_KEY(r9,r4,16)
LOAD_KEY(r10,r4,20)
LOAD_KEY(r11,r4,24)
LOAD_KEY(r12,r4,28)
stw r5,0(r3)
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
stw r9,16(r3)
stw r10,20(r3)
stw r11,24(r3)
stw r12,28(r3)
li r16,7 /* 7 expansion rounds */
lis r0,0x0100 /* RCO(1) */
/*
 * AES-256 needs two S-box applications per round: a rotated one on the
 * 8th word for the first half, a plain one on the 4th for the second.
 */
ppc_expand_256_loop:
addi r3,r3,32
mr r14,r12 /* apply LS_BOX to 8th temp */
rotlwi r14,r14,8
LS_BOX(r14, r15, r4)
xor r14,r14,r0
xor r5,r5,r14 /* xor 4 keys */
xor r6,r6,r5
xor r7,r7,r6
xor r8,r8,r7
mr r14,r8
LS_BOX(r14, r15, r4) /* apply LS_BOX to 4th temp */
xor r9,r9,r14 /* xor 4 keys */
xor r10,r10,r9
xor r11,r11,r10
xor r12,r12,r11
stw r5,0(r3)
stw r6,4(r3)
stw r7,8(r3)
stw r8,12(r3)
subi r16,r16,1
cmpwi r16,0 /* last round early kick out */
bt eq,ppc_expand_256_end
stw r9,16(r3)
stw r10,20(r3)
stw r11,24(r3)
stw r12,28(r3)
GF8_MUL(r0, r0, r4, r14)
b ppc_expand_256_loop
ppc_expand_256_end:
FINALIZE_KEY
blr
/*
 * ppc_generate_decrypt_key: derive decryption key from encryption key
 * number of bytes to handle are calculated from length of key (16/24/32)
 *
 * r3 = key_dec (destination), r4 = key_enc (source schedule),
 * r5 = key length in bytes. First and last round keys are copied
 * swapped; the middle round keys get InvMixColumns applied (AES
 * "equivalent inverse cipher" key schedule).
 */
_GLOBAL(ppc_generate_decrypt_key)
addi r6,r5,24
slwi r6,r6,2 /* r6 = byte offset of the last round key */
lwzx r7,r4,r6 /* first/last 4 words are same */
stw r7,0(r3)
lwz r7,0(r4)
stwx r7,r3,r6
addi r6,r6,4
lwzx r7,r4,r6
stw r7,4(r3)
lwz r7,4(r4)
stwx r7,r3,r6
addi r6,r6,4
lwzx r7,r4,r6
stw r7,8(r3)
lwz r7,8(r4)
stwx r7,r3,r6
addi r6,r6,4
lwzx r7,r4,r6
stw r7,12(r3)
lwz r7,12(r4)
stwx r7,r3,r6
addi r3,r3,16
add r4,r4,r6
subi r4,r4,28 /* point r4 at last-but-one round key */
addi r5,r5,20
srwi r5,r5,2 /* r5 = number of middle round keys */
ppc_generate_decrypt_block:
li r6,4
mtctr r6 /* 4 words per round key */
ppc_generate_decrypt_word:
lwz r6,0(r4)
GF8_MUL(r7, r6, r0, r7) /* r7 = 2x, r8 = 4x, r9 = 8x */
GF8_MUL(r8, r7, r0, r8)
GF8_MUL(r9, r8, r0, r9)
/* combine xtime multiples + rotations into the InvMixColumns word */
xor r10,r9,r6
xor r11,r7,r8
xor r11,r11,r9
xor r12,r7,r10
rotrwi r12,r12,24
xor r11,r11,r12
xor r12,r8,r10
rotrwi r12,r12,16
xor r11,r11,r12
rotrwi r12,r10,8
xor r11,r11,r12
stw r11,0(r3)
addi r3,r3,4
addi r4,r4,4
bdnz ppc_generate_decrypt_word
subi r4,r4,32 /* step source back one round key (reverse order) */
subi r5,r5,1
cmpwi r5,0
bt gt,ppc_generate_decrypt_block
blr

View File

@@ -0,0 +1,625 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* AES modes (ECB/CBC/CTR/XTS) for PPC AES implementation
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"
#ifdef __BIG_ENDIAN__ /* Macros for big endian builds */
#define LOAD_DATA(reg, off) \
lwz reg,off(rSP); /* load with offset */
#define SAVE_DATA(reg, off) \
stw reg,off(rDP); /* save with offset */
#define NEXT_BLOCK \
addi rSP,rSP,16; /* increment pointers per bloc */ \
addi rDP,rDP,16;
#define LOAD_IV(reg, off) \
lwz reg,off(rIP); /* IV loading with offset */
#define SAVE_IV(reg, off) \
stw reg,off(rIP); /* IV saving with offset */
#define START_IV /* nothing to reset */
#define CBC_DEC 16 /* CBC decrement per block */
#define CTR_DEC 1 /* CTR decrement one byte */
#else /* Macros for little endian */
/*
 * LE variants byte-reverse on the fly with st/lwbrx and advance the
 * pointer themselves, so NEXT_BLOCK is a no-op and the CBC/CTR
 * decrements are larger to undo the implicit increments.
 */
#define LOAD_DATA(reg, off) \
lwbrx reg,0,rSP; /* load reversed */ \
addi rSP,rSP,4; /* and increment pointer */
#define SAVE_DATA(reg, off) \
stwbrx reg,0,rDP; /* save reversed */ \
addi rDP,rDP,4; /* and increment pointer */
#define NEXT_BLOCK /* nothing to do */
#define LOAD_IV(reg, off) \
lwbrx reg,0,rIP; /* load reversed */ \
addi rIP,rIP,4; /* and increment pointer */
#define SAVE_IV(reg, off) \
stwbrx reg,0,rIP; /* save reversed */ \
addi rIP,rIP,4; /* and increment pointer */
#define START_IV \
subi rIP,rIP,16; /* must reset pointer */
#define CBC_DEC 32 /* 2 blocks because of incs */
#define CTR_DEC 17 /* 1 block because of incs */
#endif
/* SAVE/LOAD_n_REGS: spill/restore n extra 32 bit GPRs (IV/tweak use) */
#define SAVE_0_REGS
#define LOAD_0_REGS
#define SAVE_4_REGS \
stw rI0,96(r1); /* save 32 bit registers */ \
stw rI1,100(r1); \
stw rI2,104(r1); \
stw rI3,108(r1);
#define LOAD_4_REGS \
lwz rI0,96(r1); /* restore 32 bit registers */ \
lwz rI1,100(r1); \
lwz rI2,104(r1); \
lwz rI3,108(r1);
#define SAVE_8_REGS \
SAVE_4_REGS \
stw rG0,112(r1); /* save 32 bit registers */ \
stw rG1,116(r1); \
stw rG2,120(r1); \
stw rG3,124(r1);
#define LOAD_8_REGS \
LOAD_4_REGS \
lwz rG0,112(r1); /* restore 32 bit registers */ \
lwz rG1,116(r1); \
lwz rG2,120(r1); \
lwz rG3,124(r1);
/*
 * INITIALIZE_CRYPT: build the stack frame, point rT0 at the given
 * lookup table, keep a copy of the key pointer in rKS and save the
 * non-volatile registers (full 64 bit SPE halves via evstdw).
 */
#define INITIALIZE_CRYPT(tab,nr32bitregs) \
mflr r0; \
stwu r1,-160(r1); /* create stack frame */ \
lis rT0,tab@h; /* en-/decryption table pointer */ \
stw r0,8(r1); /* save link register */ \
ori rT0,rT0,tab@l; \
evstdw r14,16(r1); \
mr rKS,rKP; \
evstdw r15,24(r1); /* We must save non volatile */ \
evstdw r16,32(r1); /* registers. Take the chance */ \
evstdw r17,40(r1); /* and save the SPE part too */ \
evstdw r18,48(r1); \
evstdw r19,56(r1); \
evstdw r20,64(r1); \
evstdw r21,72(r1); \
evstdw r22,80(r1); \
evstdw r23,88(r1); \
SAVE_##nr32bitregs##_REGS
/*
 * FINALIZE_CRYPT: restore the saved registers and zero the register
 * save area so no key/data material lingers on the stack.
 */
#define FINALIZE_CRYPT(nr32bitregs) \
lwz r0,8(r1); \
evldw r14,16(r1); /* restore SPE registers */ \
evldw r15,24(r1); \
evldw r16,32(r1); \
evldw r17,40(r1); \
evldw r18,48(r1); \
evldw r19,56(r1); \
evldw r20,64(r1); \
evldw r21,72(r1); \
evldw r22,80(r1); \
evldw r23,88(r1); \
LOAD_##nr32bitregs##_REGS \
mtlr r0; /* restore link register */ \
xor r0,r0,r0; \
stw r0,16(r1); /* delete sensitive data */ \
stw r0,24(r1); /* that we might have pushed */ \
stw r0,32(r1); /* from other context that runs */ \
stw r0,40(r1); /* the same code */ \
stw r0,48(r1); \
stw r0,56(r1); \
stw r0,64(r1); \
stw r0,72(r1); \
stw r0,80(r1); \
stw r0,88(r1); \
addi r1,r1,160; /* cleanup stack frame */
#define ENDIAN_SWAP(t0, t1, s0, s1) \
rotrwi t0,s0,8; /* swap endianness for 2 GPRs */ \
rotrwi t1,s1,8; \
rlwimi t0,s0,8,8,15; \
rlwimi t1,s1,8,8,15; \
rlwimi t0,s0,8,24,31; \
rlwimi t1,s1,8,24,31;
/*
 * GF128_MUL: multiply the 128 bit XTS tweak (d0..d3) by x. The shift
 * runs across all four words; the carry out of the top bit (tested via
 * the signed compare of d3 against -1) selects the 0x87 feedback
 * polynomial that is xored into the low word.
 */
#define GF128_MUL(d0, d1, d2, d3, t0) \
li t0,0x87; /* multiplication in GF128 */ \
cmpwi d3,-1; \
iselgt t0,0,t0; \
rlwimi d3,d2,0,0,0; /* propagate "carry" bits */ \
rotlwi d3,d3,1; \
rlwimi d2,d1,0,0,0; \
rotlwi d2,d2,1; \
rlwimi d1,d0,0,0,0; \
slwi d0,d0,1; /* shift left 128 bit */ \
rotlwi d1,d1,1; \
xor d0,d0,t0;
/*
 * START_KEY: load the first round key, xor it into the data block
 * (initial AddRoundKey) and prime CTR with the round count in rRR.
 */
#define START_KEY(d0, d1, d2, d3) \
lwz rW0,0(rKP); \
mtctr rRR; \
lwz rW1,4(rKP); \
lwz rW2,8(rKP); \
lwz rW3,12(rKP); \
xor rD0,d0,rW0; \
xor rD1,d1,rW1; \
xor rD2,d2,rW2; \
xor rD3,d3,rW3;
/*
 * ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc,
 * u32 rounds)
 *
 * called from glue layer to encrypt a single 16 byte block
 * round values are AES128 = 4, AES192 = 5, AES256 = 6
 *
 */
_GLOBAL(ppc_encrypt_aes)
INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
LOAD_DATA(rD0, 0)
LOAD_DATA(rD1, 4)
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_encrypt_block
/* xor with the final round key material left in rW0-rW3 */
xor rD0,rD0,rW0
SAVE_DATA(rD0, 0)
xor rD1,rD1,rW1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rW2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rW3
SAVE_DATA(rD3, 12)
FINALIZE_CRYPT(0)
blr
/*
 * ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec,
 * u32 rounds)
 *
 * called from glue layer to decrypt a single 16 byte block
 * round values are AES128 = 4, AES192 = 5, AES256 = 6
 *
 */
_GLOBAL(ppc_decrypt_aes)
INITIALIZE_CRYPT(PPC_AES_4K_DECTAB,0)
LOAD_DATA(rD0, 0)
addi rT1,rT0,4096 /* rT1 -> PPC_AES_4K_DECTAB2 (4K after DECTAB) */
LOAD_DATA(rD1, 4)
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_decrypt_block
/* xor with the final round key material left in rW0-rW3 */
xor rD0,rD0,rW0
SAVE_DATA(rD0, 0)
xor rD1,rD1,rW1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rW2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rW3
SAVE_DATA(rD3, 12)
FINALIZE_CRYPT(0)
blr
/*
 * ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc,
 * u32 rounds, u32 bytes);
 *
 * called from glue layer to encrypt multiple blocks via ECB
 * Bytes must be larger or equal 16 and only whole blocks are
 * processed. round values are AES128 = 4, AES192 = 5 and
 * AES256 = 6
 *
 */
_GLOBAL(ppc_encrypt_ecb)
INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
ppc_encrypt_ecb_loop:
LOAD_DATA(rD0, 0)
mr rKP,rKS /* rewind key pointer for each block */
LOAD_DATA(rD1, 4)
subi rLN,rLN,16
LOAD_DATA(rD2, 8)
cmpwi rLN,15 /* CR set here, tested at loop bottom */
LOAD_DATA(rD3, 12)
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_encrypt_block
xor rD0,rD0,rW0
SAVE_DATA(rD0, 0)
xor rD1,rD1,rW1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rW2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rW3
SAVE_DATA(rD3, 12)
NEXT_BLOCK
bt gt,ppc_encrypt_ecb_loop /* loop while > 15 bytes remain */
FINALIZE_CRYPT(0)
blr
/*
 * ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec,
 * u32 rounds, u32 bytes);
 *
 * called from glue layer to decrypt multiple blocks via ECB
 * Bytes must be larger or equal 16 and only whole blocks are
 * processed. round values are AES128 = 4, AES192 = 5 and
 * AES256 = 6
 *
 */
_GLOBAL(ppc_decrypt_ecb)
INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 0)
addi rT1,rT0,4096 /* rT1 -> PPC_AES_4K_DECTAB2 (4K after DECTAB) */
ppc_decrypt_ecb_loop:
LOAD_DATA(rD0, 0)
mr rKP,rKS /* rewind key pointer for each block */
LOAD_DATA(rD1, 4)
subi rLN,rLN,16
LOAD_DATA(rD2, 8)
cmpwi rLN,15 /* CR set here, tested at loop bottom */
LOAD_DATA(rD3, 12)
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_decrypt_block
xor rD0,rD0,rW0
SAVE_DATA(rD0, 0)
xor rD1,rD1,rW1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rW2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rW3
SAVE_DATA(rD3, 12)
NEXT_BLOCK
bt gt,ppc_decrypt_ecb_loop /* loop while > 15 bytes remain */
FINALIZE_CRYPT(0)
blr
/*
 * ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc,
 * u32 rounds, u32 bytes, u8 *iv);
 *
 * called from glue layer to encrypt multiple blocks via CBC
 * Bytes must be larger or equal 16 and only whole blocks are
 * processed. round values are AES128 = 4, AES192 = 5 and
 * AES256 = 6
 *
 */
_GLOBAL(ppc_encrypt_cbc)
INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
LOAD_IV(rI0, 0)
LOAD_IV(rI1, 4)
LOAD_IV(rI2, 8)
LOAD_IV(rI3, 12)
ppc_encrypt_cbc_loop:
LOAD_DATA(rD0, 0)
mr rKP,rKS /* rewind key pointer for each block */
LOAD_DATA(rD1, 4)
subi rLN,rLN,16
LOAD_DATA(rD2, 8)
cmpwi rLN,15
LOAD_DATA(rD3, 12)
xor rD0,rD0,rI0 /* CBC: chain previous ciphertext/IV in */
xor rD1,rD1,rI1
xor rD2,rD2,rI2
xor rD3,rD3,rI3
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_encrypt_block
xor rI0,rD0,rW0 /* ciphertext doubles as next IV */
SAVE_DATA(rI0, 0)
xor rI1,rD1,rW1
SAVE_DATA(rI1, 4)
xor rI2,rD2,rW2
SAVE_DATA(rI2, 8)
xor rI3,rD3,rW3
SAVE_DATA(rI3, 12)
NEXT_BLOCK
bt gt,ppc_encrypt_cbc_loop
START_IV
SAVE_IV(rI0, 0) /* hand the final IV back to the caller */
SAVE_IV(rI1, 4)
SAVE_IV(rI2, 8)
SAVE_IV(rI3, 12)
FINALIZE_CRYPT(4)
blr
/*
 * ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec,
 * u32 rounds, u32 bytes, u8 *iv);
 *
 * called from glue layer to decrypt multiple blocks via CBC
 * round values are AES128 = 4, AES192 = 5, AES256 = 6
 *
 * The buffer is processed back to front so in-place decryption works:
 * the last ciphertext block is saved as the caller's next IV first,
 * and each plaintext block is chained with the preceding ciphertext.
 */
_GLOBAL(ppc_decrypt_cbc)
INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 4)
li rT1,15
LOAD_IV(rI0, 0)
andc rLN,rLN,rT1 /* round length down to whole blocks */
LOAD_IV(rI1, 4)
subi rLN,rLN,16
LOAD_IV(rI2, 8)
add rSP,rSP,rLN /* reverse processing */
LOAD_IV(rI3, 12)
add rDP,rDP,rLN
LOAD_DATA(rD0, 0)
addi rT1,rT0,4096 /* rT1 -> PPC_AES_4K_DECTAB2 */
LOAD_DATA(rD1, 4)
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
START_IV
SAVE_IV(rD0, 0) /* last ciphertext block becomes next IV */
SAVE_IV(rD1, 4)
SAVE_IV(rD2, 8)
cmpwi rLN,16
SAVE_IV(rD3, 12)
bt lt,ppc_decrypt_cbc_end /* only one block: skip the loop */
ppc_decrypt_cbc_loop:
mr rKP,rKS
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_decrypt_block
subi rLN,rLN,16
subi rSP,rSP,CBC_DEC
xor rW0,rD0,rW0
LOAD_DATA(rD0, 0) /* fetch preceding ciphertext block */
xor rW1,rD1,rW1
LOAD_DATA(rD1, 4)
xor rW2,rD2,rW2
LOAD_DATA(rD2, 8)
xor rW3,rD3,rW3
LOAD_DATA(rD3, 12)
xor rW0,rW0,rD0 /* chain with preceding ciphertext */
SAVE_DATA(rW0, 0)
xor rW1,rW1,rD1
SAVE_DATA(rW1, 4)
xor rW2,rW2,rD2
SAVE_DATA(rW2, 8)
xor rW3,rW3,rD3
SAVE_DATA(rW3, 12)
cmpwi rLN,15
subi rDP,rDP,CBC_DEC
bt gt,ppc_decrypt_cbc_loop
ppc_decrypt_cbc_end:
mr rKP,rKS
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_decrypt_block
xor rW0,rW0,rD0
xor rW1,rW1,rD1
xor rW2,rW2,rD2
xor rW3,rW3,rD3
xor rW0,rW0,rI0 /* decrypt with initial IV */
SAVE_DATA(rW0, 0)
xor rW1,rW1,rI1
SAVE_DATA(rW1, 4)
xor rW2,rW2,rI2
SAVE_DATA(rW2, 8)
xor rW3,rW3,rI3
SAVE_DATA(rW3, 12)
FINALIZE_CRYPT(4)
blr
/*
 * ppc_crypt_ctr(u8 *out, const u8 *in, u32 *key_enc,
 * u32 rounds, u32 bytes, u8 *iv);
 *
 * called from glue layer to encrypt/decrypt multiple blocks
 * via CTR. Number of bytes does not need to be a multiple of
 * 16. Round values are AES128 = 4, AES192 = 5, AES256 = 6
 *
 */
_GLOBAL(ppc_crypt_ctr)
INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
LOAD_IV(rI0, 0)
LOAD_IV(rI1, 4)
LOAD_IV(rI2, 8)
cmpwi rLN,16
LOAD_IV(rI3, 12)
START_IV
bt lt,ppc_crypt_ctr_partial /* less than one full block */
ppc_crypt_ctr_loop:
mr rKP,rKS
START_KEY(rI0, rI1, rI2, rI3) /* encrypt the counter block */
bl ppc_encrypt_block
xor rW0,rD0,rW0
xor rW1,rD1,rW1
xor rW2,rD2,rW2
xor rW3,rD3,rW3
LOAD_DATA(rD0, 0)
subi rLN,rLN,16
LOAD_DATA(rD1, 4)
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
xor rD0,rD0,rW0 /* xor keystream into the data */
SAVE_DATA(rD0, 0)
xor rD1,rD1,rW1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rW2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rW3
SAVE_DATA(rD3, 12)
addic rI3,rI3,1 /* increase counter */
addze rI2,rI2 /* 128 bit carry chain */
addze rI1,rI1
addze rI0,rI0
NEXT_BLOCK
cmpwi rLN,15
bt gt,ppc_crypt_ctr_loop
ppc_crypt_ctr_partial:
cmpwi rLN,0
bt eq,ppc_crypt_ctr_end
mr rKP,rKS
START_KEY(rI0, rI1, rI2, rI3)
bl ppc_encrypt_block
/* stash the keystream block in the IV buffer, then xor bytewise */
xor rW0,rD0,rW0
SAVE_IV(rW0, 0)
xor rW1,rD1,rW1
SAVE_IV(rW1, 4)
xor rW2,rD2,rW2
SAVE_IV(rW2, 8)
xor rW3,rD3,rW3
SAVE_IV(rW3, 12)
mtctr rLN
subi rIP,rIP,CTR_DEC
subi rSP,rSP,1
subi rDP,rDP,1
ppc_crypt_ctr_xorbyte:
lbzu rW4,1(rIP) /* bytewise xor for partial block */
lbzu rW5,1(rSP)
xor rW4,rW4,rW5
stbu rW4,1(rDP)
bdnz ppc_crypt_ctr_xorbyte
subf rIP,rLN,rIP /* rewind IV pointer */
addi rIP,rIP,1
addic rI3,rI3,1
addze rI2,rI2
addze rI1,rI1
addze rI0,rI0
ppc_crypt_ctr_end:
SAVE_IV(rI0, 0) /* hand the updated counter back */
SAVE_IV(rI1, 4)
SAVE_IV(rI2, 8)
SAVE_IV(rI3, 12)
FINALIZE_CRYPT(4)
blr
/*
 * ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc,
 * u32 rounds, u32 bytes, u8 *iv, u32 *key_twk);
 *
 * called from glue layer to encrypt multiple blocks via XTS
 * If key_twk is given, the initial IV encryption will be
 * processed too. Round values are AES128 = 4, AES192 = 5,
 * AES256 = 6
 *
 */
_GLOBAL(ppc_encrypt_xts)
INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 8)
LOAD_IV(rI0, 0)
LOAD_IV(rI1, 4)
LOAD_IV(rI2, 8)
cmpwi rKT,0
LOAD_IV(rI3, 12)
bt eq,ppc_encrypt_xts_notweak
mr rKP,rKT /* encrypt IV with the tweak key */
START_KEY(rI0, rI1, rI2, rI3)
bl ppc_encrypt_block
xor rI0,rD0,rW0
xor rI1,rD1,rW1
xor rI2,rD2,rW2
xor rI3,rD3,rW3
ppc_encrypt_xts_notweak:
ENDIAN_SWAP(rG0, rG1, rI0, rI1) /* keep tweak in GF128 byte order */
ENDIAN_SWAP(rG2, rG3, rI2, rI3)
ppc_encrypt_xts_loop:
LOAD_DATA(rD0, 0)
mr rKP,rKS
LOAD_DATA(rD1, 4)
subi rLN,rLN,16
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
xor rD0,rD0,rI0 /* pre-whiten with the tweak */
xor rD1,rD1,rI1
xor rD2,rD2,rI2
xor rD3,rD3,rI3
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_encrypt_block
xor rD0,rD0,rW0
xor rD1,rD1,rW1
xor rD2,rD2,rW2
xor rD3,rD3,rW3
xor rD0,rD0,rI0 /* post-whiten with the tweak */
SAVE_DATA(rD0, 0)
xor rD1,rD1,rI1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rI2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rI3
SAVE_DATA(rD3, 12)
GF128_MUL(rG0, rG1, rG2, rG3, rW0) /* advance tweak: multiply by x */
ENDIAN_SWAP(rI0, rI1, rG0, rG1)
ENDIAN_SWAP(rI2, rI3, rG2, rG3)
cmpwi rLN,0
NEXT_BLOCK
bt gt,ppc_encrypt_xts_loop
START_IV
SAVE_IV(rI0, 0) /* hand the next tweak back to the caller */
SAVE_IV(rI1, 4)
SAVE_IV(rI2, 8)
SAVE_IV(rI3, 12)
FINALIZE_CRYPT(8)
blr
/*
 * ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec,
 * u32 rounds, u32 blocks, u8 *iv, u32 *key_twk);
 *
 * called from glue layer to decrypt multiple blocks via XTS
 * If key_twk is given, the initial IV encryption will be
 * processed too. Round values are AES128 = 4, AES192 = 5,
 * AES256 = 6
 *
 */
_GLOBAL(ppc_decrypt_xts)
INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 8)
LOAD_IV(rI0, 0)
addi rT1,rT0,4096 /* rT1 -> PPC_AES_4K_DECTAB2 */
LOAD_IV(rI1, 4)
LOAD_IV(rI2, 8)
cmpwi rKT,0
LOAD_IV(rI3, 12)
bt eq,ppc_decrypt_xts_notweak
/* tweak is always ENcrypted: temporarily point rT0 back at ENCTAB */
subi rT0,rT0,4096
mr rKP,rKT
START_KEY(rI0, rI1, rI2, rI3)
bl ppc_encrypt_block
xor rI0,rD0,rW0
xor rI1,rD1,rW1
xor rI2,rD2,rW2
xor rI3,rD3,rW3
addi rT0,rT0,4096 /* restore rT0 -> DECTAB */
ppc_decrypt_xts_notweak:
ENDIAN_SWAP(rG0, rG1, rI0, rI1) /* keep tweak in GF128 byte order */
ENDIAN_SWAP(rG2, rG3, rI2, rI3)
ppc_decrypt_xts_loop:
LOAD_DATA(rD0, 0)
mr rKP,rKS
LOAD_DATA(rD1, 4)
subi rLN,rLN,16
LOAD_DATA(rD2, 8)
LOAD_DATA(rD3, 12)
xor rD0,rD0,rI0 /* pre-whiten with the tweak */
xor rD1,rD1,rI1
xor rD2,rD2,rI2
xor rD3,rD3,rI3
START_KEY(rD0, rD1, rD2, rD3)
bl ppc_decrypt_block
xor rD0,rD0,rW0
xor rD1,rD1,rW1
xor rD2,rD2,rW2
xor rD3,rD3,rW3
xor rD0,rD0,rI0 /* post-whiten with the tweak */
SAVE_DATA(rD0, 0)
xor rD1,rD1,rI1
SAVE_DATA(rD1, 4)
xor rD2,rD2,rI2
SAVE_DATA(rD2, 8)
xor rD3,rD3,rI3
SAVE_DATA(rD3, 12)
GF128_MUL(rG0, rG1, rG2, rG3, rW0) /* advance tweak: multiply by x */
ENDIAN_SWAP(rI0, rI1, rG0, rG1)
ENDIAN_SWAP(rI2, rI3, rG2, rG3)
cmpwi rLN,0
NEXT_BLOCK
bt gt,ppc_decrypt_xts_loop
START_IV
SAVE_IV(rI0, 0) /* hand the next tweak back to the caller */
SAVE_IV(rI1, 4)
SAVE_IV(rI2, 8)
SAVE_IV(rI3, 12)
FINALIZE_CRYPT(8)
blr

View File

@@ -0,0 +1,37 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Common registers for PPC AES implementation
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
#define rKS r0 /* copy of en-/decryption key pointer */
#define rDP r3 /* destination pointer */
#define rSP r4 /* source pointer */
#define rKP r5 /* pointer to en-/decryption key pointer */
#define rRR r6 /* en-/decryption rounds */
#define rLN r7 /* length of data to be processed */
#define rIP r8 /* pointer to IV (CBC/CTR/XTS modes) */
#define rKT r9 /* pointer to tweak key (XTS mode) */
#define rT0 r11 /* pointers to en-/decryption tables */
#define rT1 r10
/*
 * NOTE(review): rD0 aliases rKT (both r9). This looks safe because the
 * tweak key pointer is only read before block data is processed —
 * confirm before changing any register assignment below.
 */
#define rD0 r9 /* data */
#define rD1 r14
#define rD2 r12
#define rD3 r15
#define rW0 r16 /* working registers */
#define rW1 r17
#define rW2 r18
#define rW3 r19
#define rW4 r20
#define rW5 r21
#define rW6 r22
#define rW7 r23
#define rI0 r24 /* IV */
#define rI1 r25
#define rI2 r26
#define rI3 r27
#define rG0 r28 /* endian reversed tweak (XTS mode) */
#define rG1 r29
#define rG2 r30
#define rG3 r31

View File

@@ -0,0 +1,326 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* 4K AES tables for PPC AES implementation
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
/*
* These big endian AES encryption/decryption tables have been taken from
* crypto/aes_generic.c and are designed to be simply accessed by a combination
* of rlwimi/lwz instructions with a minimum of table registers (usually only
* one required). Thus they are aligned to 4K. The locality of rotated values
* is derived from the reduced offsets that are available in the SPE load
* instructions. E.g. evldw, evlwwsplat, ...
*
* For the safety-conscious it has to be noted that they might be vulnerable
* to cache timing attacks because of their size. Nevertheless in contrast to
* the generic tables they have been reduced from 16KB to 8KB + 256 bytes.
* This is a quite good tradeoff for low power devices (e.g. routers) without
* dedicated encryption hardware where we usually have no multiuser
* environment.
*
*/
/*
 * R(a,b,c,d) emits a table word followed by its three byte-rotated
 * variants, so rotated lookups need only a fixed offset into the row.
 */
#define R(a, b, c, d) \
0x##a##b##c##d, 0x##d##a##b##c, 0x##c##d##a##b, 0x##b##c##d##a
.data
.align 12 /* 4K aligned: one register can address a whole table */
.globl PPC_AES_4K_ENCTAB
PPC_AES_4K_ENCTAB:
/* encryption table, same as crypto_ft_tab in crypto/aes-generic.c */
.long R(c6, 63, 63, a5), R(f8, 7c, 7c, 84)
.long R(ee, 77, 77, 99), R(f6, 7b, 7b, 8d)
.long R(ff, f2, f2, 0d), R(d6, 6b, 6b, bd)
.long R(de, 6f, 6f, b1), R(91, c5, c5, 54)
.long R(60, 30, 30, 50), R(02, 01, 01, 03)
.long R(ce, 67, 67, a9), R(56, 2b, 2b, 7d)
.long R(e7, fe, fe, 19), R(b5, d7, d7, 62)
.long R(4d, ab, ab, e6), R(ec, 76, 76, 9a)
.long R(8f, ca, ca, 45), R(1f, 82, 82, 9d)
.long R(89, c9, c9, 40), R(fa, 7d, 7d, 87)
.long R(ef, fa, fa, 15), R(b2, 59, 59, eb)
.long R(8e, 47, 47, c9), R(fb, f0, f0, 0b)
.long R(41, ad, ad, ec), R(b3, d4, d4, 67)
.long R(5f, a2, a2, fd), R(45, af, af, ea)
.long R(23, 9c, 9c, bf), R(53, a4, a4, f7)
.long R(e4, 72, 72, 96), R(9b, c0, c0, 5b)
.long R(75, b7, b7, c2), R(e1, fd, fd, 1c)
.long R(3d, 93, 93, ae), R(4c, 26, 26, 6a)
.long R(6c, 36, 36, 5a), R(7e, 3f, 3f, 41)
.long R(f5, f7, f7, 02), R(83, cc, cc, 4f)
.long R(68, 34, 34, 5c), R(51, a5, a5, f4)
.long R(d1, e5, e5, 34), R(f9, f1, f1, 08)
.long R(e2, 71, 71, 93), R(ab, d8, d8, 73)
.long R(62, 31, 31, 53), R(2a, 15, 15, 3f)
.long R(08, 04, 04, 0c), R(95, c7, c7, 52)
.long R(46, 23, 23, 65), R(9d, c3, c3, 5e)
.long R(30, 18, 18, 28), R(37, 96, 96, a1)
.long R(0a, 05, 05, 0f), R(2f, 9a, 9a, b5)
.long R(0e, 07, 07, 09), R(24, 12, 12, 36)
.long R(1b, 80, 80, 9b), R(df, e2, e2, 3d)
.long R(cd, eb, eb, 26), R(4e, 27, 27, 69)
.long R(7f, b2, b2, cd), R(ea, 75, 75, 9f)
.long R(12, 09, 09, 1b), R(1d, 83, 83, 9e)
.long R(58, 2c, 2c, 74), R(34, 1a, 1a, 2e)
.long R(36, 1b, 1b, 2d), R(dc, 6e, 6e, b2)
.long R(b4, 5a, 5a, ee), R(5b, a0, a0, fb)
.long R(a4, 52, 52, f6), R(76, 3b, 3b, 4d)
.long R(b7, d6, d6, 61), R(7d, b3, b3, ce)
.long R(52, 29, 29, 7b), R(dd, e3, e3, 3e)
.long R(5e, 2f, 2f, 71), R(13, 84, 84, 97)
.long R(a6, 53, 53, f5), R(b9, d1, d1, 68)
.long R(00, 00, 00, 00), R(c1, ed, ed, 2c)
.long R(40, 20, 20, 60), R(e3, fc, fc, 1f)
.long R(79, b1, b1, c8), R(b6, 5b, 5b, ed)
.long R(d4, 6a, 6a, be), R(8d, cb, cb, 46)
.long R(67, be, be, d9), R(72, 39, 39, 4b)
.long R(94, 4a, 4a, de), R(98, 4c, 4c, d4)
.long R(b0, 58, 58, e8), R(85, cf, cf, 4a)
.long R(bb, d0, d0, 6b), R(c5, ef, ef, 2a)
.long R(4f, aa, aa, e5), R(ed, fb, fb, 16)
.long R(86, 43, 43, c5), R(9a, 4d, 4d, d7)
.long R(66, 33, 33, 55), R(11, 85, 85, 94)
.long R(8a, 45, 45, cf), R(e9, f9, f9, 10)
.long R(04, 02, 02, 06), R(fe, 7f, 7f, 81)
.long R(a0, 50, 50, f0), R(78, 3c, 3c, 44)
.long R(25, 9f, 9f, ba), R(4b, a8, a8, e3)
.long R(a2, 51, 51, f3), R(5d, a3, a3, fe)
.long R(80, 40, 40, c0), R(05, 8f, 8f, 8a)
.long R(3f, 92, 92, ad), R(21, 9d, 9d, bc)
.long R(70, 38, 38, 48), R(f1, f5, f5, 04)
.long R(63, bc, bc, df), R(77, b6, b6, c1)
.long R(af, da, da, 75), R(42, 21, 21, 63)
.long R(20, 10, 10, 30), R(e5, ff, ff, 1a)
.long R(fd, f3, f3, 0e), R(bf, d2, d2, 6d)
.long R(81, cd, cd, 4c), R(18, 0c, 0c, 14)
.long R(26, 13, 13, 35), R(c3, ec, ec, 2f)
.long R(be, 5f, 5f, e1), R(35, 97, 97, a2)
.long R(88, 44, 44, cc), R(2e, 17, 17, 39)
.long R(93, c4, c4, 57), R(55, a7, a7, f2)
.long R(fc, 7e, 7e, 82), R(7a, 3d, 3d, 47)
.long R(c8, 64, 64, ac), R(ba, 5d, 5d, e7)
.long R(32, 19, 19, 2b), R(e6, 73, 73, 95)
.long R(c0, 60, 60, a0), R(19, 81, 81, 98)
.long R(9e, 4f, 4f, d1), R(a3, dc, dc, 7f)
.long R(44, 22, 22, 66), R(54, 2a, 2a, 7e)
.long R(3b, 90, 90, ab), R(0b, 88, 88, 83)
.long R(8c, 46, 46, ca), R(c7, ee, ee, 29)
.long R(6b, b8, b8, d3), R(28, 14, 14, 3c)
.long R(a7, de, de, 79), R(bc, 5e, 5e, e2)
.long R(16, 0b, 0b, 1d), R(ad, db, db, 76)
.long R(db, e0, e0, 3b), R(64, 32, 32, 56)
.long R(74, 3a, 3a, 4e), R(14, 0a, 0a, 1e)
.long R(92, 49, 49, db), R(0c, 06, 06, 0a)
.long R(48, 24, 24, 6c), R(b8, 5c, 5c, e4)
.long R(9f, c2, c2, 5d), R(bd, d3, d3, 6e)
.long R(43, ac, ac, ef), R(c4, 62, 62, a6)
.long R(39, 91, 91, a8), R(31, 95, 95, a4)
.long R(d3, e4, e4, 37), R(f2, 79, 79, 8b)
.long R(d5, e7, e7, 32), R(8b, c8, c8, 43)
.long R(6e, 37, 37, 59), R(da, 6d, 6d, b7)
.long R(01, 8d, 8d, 8c), R(b1, d5, d5, 64)
.long R(9c, 4e, 4e, d2), R(49, a9, a9, e0)
.long R(d8, 6c, 6c, b4), R(ac, 56, 56, fa)
.long R(f3, f4, f4, 07), R(cf, ea, ea, 25)
.long R(ca, 65, 65, af), R(f4, 7a, 7a, 8e)
.long R(47, ae, ae, e9), R(10, 08, 08, 18)
.long R(6f, ba, ba, d5), R(f0, 78, 78, 88)
.long R(4a, 25, 25, 6f), R(5c, 2e, 2e, 72)
.long R(38, 1c, 1c, 24), R(57, a6, a6, f1)
.long R(73, b4, b4, c7), R(97, c6, c6, 51)
.long R(cb, e8, e8, 23), R(a1, dd, dd, 7c)
.long R(e8, 74, 74, 9c), R(3e, 1f, 1f, 21)
.long R(96, 4b, 4b, dd), R(61, bd, bd, dc)
.long R(0d, 8b, 8b, 86), R(0f, 8a, 8a, 85)
.long R(e0, 70, 70, 90), R(7c, 3e, 3e, 42)
.long R(71, b5, b5, c4), R(cc, 66, 66, aa)
.long R(90, 48, 48, d8), R(06, 03, 03, 05)
.long R(f7, f6, f6, 01), R(1c, 0e, 0e, 12)
.long R(c2, 61, 61, a3), R(6a, 35, 35, 5f)
.long R(ae, 57, 57, f9), R(69, b9, b9, d0)
.long R(17, 86, 86, 91), R(99, c1, c1, 58)
.long R(3a, 1d, 1d, 27), R(27, 9e, 9e, b9)
.long R(d9, e1, e1, 38), R(eb, f8, f8, 13)
.long R(2b, 98, 98, b3), R(22, 11, 11, 33)
.long R(d2, 69, 69, bb), R(a9, d9, d9, 70)
.long R(07, 8e, 8e, 89), R(33, 94, 94, a7)
.long R(2d, 9b, 9b, b6), R(3c, 1e, 1e, 22)
.long R(15, 87, 87, 92), R(c9, e9, e9, 20)
.long R(87, ce, ce, 49), R(aa, 55, 55, ff)
.long R(50, 28, 28, 78), R(a5, df, df, 7a)
.long R(03, 8c, 8c, 8f), R(59, a1, a1, f8)
.long R(09, 89, 89, 80), R(1a, 0d, 0d, 17)
.long R(65, bf, bf, da), R(d7, e6, e6, 31)
.long R(84, 42, 42, c6), R(d0, 68, 68, b8)
.long R(82, 41, 41, c3), R(29, 99, 99, b0)
.long R(5a, 2d, 2d, 77), R(1e, 0f, 0f, 11)
.long R(7b, b0, b0, cb), R(a8, 54, 54, fc)
.long R(6d, bb, bb, d6), R(2c, 16, 16, 3a)
/* starts exactly 4096 bytes after ENCTAB (code relies on rT0 +/- 4096) */
.globl PPC_AES_4K_DECTAB
PPC_AES_4K_DECTAB:
/* decryption table, same as crypto_it_tab in crypto/aes-generic.c */
.long R(51, f4, a7, 50), R(7e, 41, 65, 53)
.long R(1a, 17, a4, c3), R(3a, 27, 5e, 96)
.long R(3b, ab, 6b, cb), R(1f, 9d, 45, f1)
.long R(ac, fa, 58, ab), R(4b, e3, 03, 93)
.long R(20, 30, fa, 55), R(ad, 76, 6d, f6)
.long R(88, cc, 76, 91), R(f5, 02, 4c, 25)
.long R(4f, e5, d7, fc), R(c5, 2a, cb, d7)
.long R(26, 35, 44, 80), R(b5, 62, a3, 8f)
.long R(de, b1, 5a, 49), R(25, ba, 1b, 67)
.long R(45, ea, 0e, 98), R(5d, fe, c0, e1)
.long R(c3, 2f, 75, 02), R(81, 4c, f0, 12)
.long R(8d, 46, 97, a3), R(6b, d3, f9, c6)
.long R(03, 8f, 5f, e7), R(15, 92, 9c, 95)
.long R(bf, 6d, 7a, eb), R(95, 52, 59, da)
.long R(d4, be, 83, 2d), R(58, 74, 21, d3)
.long R(49, e0, 69, 29), R(8e, c9, c8, 44)
.long R(75, c2, 89, 6a), R(f4, 8e, 79, 78)
.long R(99, 58, 3e, 6b), R(27, b9, 71, dd)
.long R(be, e1, 4f, b6), R(f0, 88, ad, 17)
.long R(c9, 20, ac, 66), R(7d, ce, 3a, b4)
.long R(63, df, 4a, 18), R(e5, 1a, 31, 82)
.long R(97, 51, 33, 60), R(62, 53, 7f, 45)
.long R(b1, 64, 77, e0), R(bb, 6b, ae, 84)
.long R(fe, 81, a0, 1c), R(f9, 08, 2b, 94)
.long R(70, 48, 68, 58), R(8f, 45, fd, 19)
.long R(94, de, 6c, 87), R(52, 7b, f8, b7)
.long R(ab, 73, d3, 23), R(72, 4b, 02, e2)
.long R(e3, 1f, 8f, 57), R(66, 55, ab, 2a)
.long R(b2, eb, 28, 07), R(2f, b5, c2, 03)
.long R(86, c5, 7b, 9a), R(d3, 37, 08, a5)
.long R(30, 28, 87, f2), R(23, bf, a5, b2)
.long R(02, 03, 6a, ba), R(ed, 16, 82, 5c)
.long R(8a, cf, 1c, 2b), R(a7, 79, b4, 92)
.long R(f3, 07, f2, f0), R(4e, 69, e2, a1)
.long R(65, da, f4, cd), R(06, 05, be, d5)
.long R(d1, 34, 62, 1f), R(c4, a6, fe, 8a)
.long R(34, 2e, 53, 9d), R(a2, f3, 55, a0)
.long R(05, 8a, e1, 32), R(a4, f6, eb, 75)
.long R(0b, 83, ec, 39), R(40, 60, ef, aa)
.long R(5e, 71, 9f, 06), R(bd, 6e, 10, 51)
.long R(3e, 21, 8a, f9), R(96, dd, 06, 3d)
.long R(dd, 3e, 05, ae), R(4d, e6, bd, 46)
.long R(91, 54, 8d, b5), R(71, c4, 5d, 05)
.long R(04, 06, d4, 6f), R(60, 50, 15, ff)
.long R(19, 98, fb, 24), R(d6, bd, e9, 97)
.long R(89, 40, 43, cc), R(67, d9, 9e, 77)
.long R(b0, e8, 42, bd), R(07, 89, 8b, 88)
.long R(e7, 19, 5b, 38), R(79, c8, ee, db)
.long R(a1, 7c, 0a, 47), R(7c, 42, 0f, e9)
.long R(f8, 84, 1e, c9), R(00, 00, 00, 00)
.long R(09, 80, 86, 83), R(32, 2b, ed, 48)
.long R(1e, 11, 70, ac), R(6c, 5a, 72, 4e)
.long R(fd, 0e, ff, fb), R(0f, 85, 38, 56)
.long R(3d, ae, d5, 1e), R(36, 2d, 39, 27)
.long R(0a, 0f, d9, 64), R(68, 5c, a6, 21)
.long R(9b, 5b, 54, d1), R(24, 36, 2e, 3a)
.long R(0c, 0a, 67, b1), R(93, 57, e7, 0f)
.long R(b4, ee, 96, d2), R(1b, 9b, 91, 9e)
.long R(80, c0, c5, 4f), R(61, dc, 20, a2)
.long R(5a, 77, 4b, 69), R(1c, 12, 1a, 16)
.long R(e2, 93, ba, 0a), R(c0, a0, 2a, e5)
.long R(3c, 22, e0, 43), R(12, 1b, 17, 1d)
.long R(0e, 09, 0d, 0b), R(f2, 8b, c7, ad)
.long R(2d, b6, a8, b9), R(14, 1e, a9, c8)
.long R(57, f1, 19, 85), R(af, 75, 07, 4c)
.long R(ee, 99, dd, bb), R(a3, 7f, 60, fd)
.long R(f7, 01, 26, 9f), R(5c, 72, f5, bc)
.long R(44, 66, 3b, c5), R(5b, fb, 7e, 34)
.long R(8b, 43, 29, 76), R(cb, 23, c6, dc)
.long R(b6, ed, fc, 68), R(b8, e4, f1, 63)
.long R(d7, 31, dc, ca), R(42, 63, 85, 10)
.long R(13, 97, 22, 40), R(84, c6, 11, 20)
.long R(85, 4a, 24, 7d), R(d2, bb, 3d, f8)
.long R(ae, f9, 32, 11), R(c7, 29, a1, 6d)
.long R(1d, 9e, 2f, 4b), R(dc, b2, 30, f3)
.long R(0d, 86, 52, ec), R(77, c1, e3, d0)
.long R(2b, b3, 16, 6c), R(a9, 70, b9, 99)
.long R(11, 94, 48, fa), R(47, e9, 64, 22)
.long R(a8, fc, 8c, c4), R(a0, f0, 3f, 1a)
.long R(56, 7d, 2c, d8), R(22, 33, 90, ef)
.long R(87, 49, 4e, c7), R(d9, 38, d1, c1)
.long R(8c, ca, a2, fe), R(98, d4, 0b, 36)
.long R(a6, f5, 81, cf), R(a5, 7a, de, 28)
.long R(da, b7, 8e, 26), R(3f, ad, bf, a4)
.long R(2c, 3a, 9d, e4), R(50, 78, 92, 0d)
.long R(6a, 5f, cc, 9b), R(54, 7e, 46, 62)
.long R(f6, 8d, 13, c2), R(90, d8, b8, e8)
.long R(2e, 39, f7, 5e), R(82, c3, af, f5)
.long R(9f, 5d, 80, be), R(69, d0, 93, 7c)
.long R(6f, d5, 2d, a9), R(cf, 25, 12, b3)
.long R(c8, ac, 99, 3b), R(10, 18, 7d, a7)
.long R(e8, 9c, 63, 6e), R(db, 3b, bb, 7b)
.long R(cd, 26, 78, 09), R(6e, 59, 18, f4)
.long R(ec, 9a, b7, 01), R(83, 4f, 9a, a8)
.long R(e6, 95, 6e, 65), R(aa, ff, e6, 7e)
.long R(21, bc, cf, 08), R(ef, 15, e8, e6)
.long R(ba, e7, 9b, d9), R(4a, 6f, 36, ce)
.long R(ea, 9f, 09, d4), R(29, b0, 7c, d6)
.long R(31, a4, b2, af), R(2a, 3f, 23, 31)
.long R(c6, a5, 94, 30), R(35, a2, 66, c0)
.long R(74, 4e, bc, 37), R(fc, 82, ca, a6)
.long R(e0, 90, d0, b0), R(33, a7, d8, 15)
.long R(f1, 04, 98, 4a), R(41, ec, da, f7)
.long R(7f, cd, 50, 0e), R(17, 91, f6, 2f)
.long R(76, 4d, d6, 8d), R(43, ef, b0, 4d)
.long R(cc, aa, 4d, 54), R(e4, 96, 04, df)
.long R(9e, d1, b5, e3), R(4c, 6a, 88, 1b)
.long R(c1, 2c, 1f, b8), R(46, 65, 51, 7f)
.long R(9d, 5e, ea, 04), R(01, 8c, 35, 5d)
.long R(fa, 87, 74, 73), R(fb, 0b, 41, 2e)
.long R(b3, 67, 1d, 5a), R(92, db, d2, 52)
.long R(e9, 10, 56, 33), R(6d, d6, 47, 13)
.long R(9a, d7, 61, 8c), R(37, a1, 0c, 7a)
.long R(59, f8, 14, 8e), R(eb, 13, 3c, 89)
.long R(ce, a9, 27, ee), R(b7, 61, c9, 35)
.long R(e1, 1c, e5, ed), R(7a, 47, b1, 3c)
.long R(9c, d2, df, 59), R(55, f2, 73, 3f)
.long R(18, 14, ce, 79), R(73, c7, 37, bf)
.long R(53, f7, cd, ea), R(5f, fd, aa, 5b)
.long R(df, 3d, 6f, 14), R(78, 44, db, 86)
.long R(ca, af, f3, 81), R(b9, 68, c4, 3e)
.long R(38, 24, 34, 2c), R(c2, a3, 40, 5f)
.long R(16, 1d, c3, 72), R(bc, e2, 25, 0c)
.long R(28, 3c, 49, 8b), R(ff, 0d, 95, 41)
.long R(39, a8, 01, 71), R(08, 0c, b3, de)
.long R(d8, b4, e4, 9c), R(64, 56, c1, 90)
.long R(7b, cb, 84, 61), R(d5, 32, b6, 70)
.long R(48, 6c, 5c, 74), R(d0, b8, 57, 42)
/* 256 byte inverse S-box, addressed via rT1 = DECTAB + 4096 */
.globl PPC_AES_4K_DECTAB2
PPC_AES_4K_DECTAB2:
/* decryption table, same as crypto_il_tab in crypto/aes-generic.c */
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d

238
lib/crypto/powerpc/aes.h Normal file
View File

@@ -0,0 +1,238 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
* Copyright (C) 2015 International Business Machines Inc.
* Copyright 2026 Google LLC
*/
#include <asm/simd.h>
#include <asm/switch_to.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
#include <linux/preempt.h>
#include <linux/uaccess.h>
#ifdef CONFIG_SPE
EXPORT_SYMBOL_GPL(ppc_expand_key_128);
EXPORT_SYMBOL_GPL(ppc_expand_key_192);
EXPORT_SYMBOL_GPL(ppc_expand_key_256);
EXPORT_SYMBOL_GPL(ppc_generate_decrypt_key);
EXPORT_SYMBOL_GPL(ppc_encrypt_ecb);
EXPORT_SYMBOL_GPL(ppc_decrypt_ecb);
EXPORT_SYMBOL_GPL(ppc_encrypt_cbc);
EXPORT_SYMBOL_GPL(ppc_decrypt_cbc);
EXPORT_SYMBOL_GPL(ppc_crypt_ctr);
EXPORT_SYMBOL_GPL(ppc_encrypt_xts);
EXPORT_SYMBOL_GPL(ppc_decrypt_xts);
void ppc_encrypt_aes(u8 *out, const u8 *in, const u32 *key_enc, u32 rounds);
void ppc_decrypt_aes(u8 *out, const u8 *in, const u32 *key_dec, u32 rounds);
/*
 * Enter a region where SPE instructions may be used in kernel mode.
 * Preemption must remain disabled until the matching spe_end(), since the
 * kernel-mode SPE register state is not preserved across context switches.
 */
static void spe_begin(void)
{
	/* disable preemption and save users SPE registers if required */
	preempt_disable();
	enable_kernel_spe();
}
/* Leave the kernel-mode SPE region opened by spe_begin(). */
static void spe_end(void)
{
	disable_kernel_spe();
	/* reenable preemption */
	preempt_enable();
}
/*
 * Expand the raw AES key into the SPE-optimized encryption round key
 * schedule, and (when @inv_k is non-NULL) derive the matching decryption
 * schedule from the just-expanded encryption keys.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	switch (key_len) {
	case AES_KEYSIZE_128:
		ppc_expand_key_128(k->spe_enc_key, in_key);
		break;
	case AES_KEYSIZE_192:
		ppc_expand_key_192(k->spe_enc_key, in_key);
		break;
	default:
		ppc_expand_key_256(k->spe_enc_key, in_key);
		break;
	}
	if (inv_k)
		ppc_generate_decrypt_key(inv_k->spe_dec_key, k->spe_enc_key,
					 key_len);
}
/*
 * Encrypt one 16-byte block using the SPE assembly.
 *
 * NOTE(review): the round count is passed as nrounds / 2 - 1, mapping the
 * standard 10/12/14 rounds to 4/5/6 — presumably the SPE assembly's own
 * round-count convention (matches the historical aes-spe-glue code); the
 * assembly itself is not visible here to confirm.
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	spe_begin();
	ppc_encrypt_aes(out, in, key->k.spe_enc_key, key->nrounds / 2 - 1);
	spe_end();
}
/*
 * Decrypt one 16-byte block using the SPE assembly, with the same
 * nrounds / 2 - 1 round-count convention as aes_encrypt_arch().
 */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	spe_begin();
	ppc_decrypt_aes(out, in, key->inv_k.spe_dec_key, key->nrounds / 2 - 1);
	spe_end();
}
#else /* CONFIG_SPE */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
EXPORT_SYMBOL_GPL(aes_p8_set_encrypt_key);
EXPORT_SYMBOL_GPL(aes_p8_set_decrypt_key);
EXPORT_SYMBOL_GPL(aes_p8_encrypt);
EXPORT_SYMBOL_GPL(aes_p8_decrypt);
EXPORT_SYMBOL_GPL(aes_p8_cbc_encrypt);
EXPORT_SYMBOL_GPL(aes_p8_ctr32_encrypt_blocks);
EXPORT_SYMBOL_GPL(aes_p8_xts_encrypt);
EXPORT_SYMBOL_GPL(aes_p8_xts_decrypt);
/*
 * A key prepared by the VSX assembly has a nonzero round count; the generic
 * fallback path marks its keys with nrounds == 0 (see aes_preparekey_arch()).
 */
static inline bool is_vsx_format(const struct p8_aes_key *key)
{
	return key->nrounds != 0;
}
/*
* Convert a round key from VSX to generic format by reflecting the 16 bytes,
* and (if apply_inv_mix=true) applying InvMixColumn to each column.
*
* It would be nice if the VSX and generic key formats would be compatible. But
* that's very difficult to do, with the assembly code having been borrowed from
* OpenSSL and also targeted to POWER8 rather than POWER9.
*
* Fortunately, this conversion should only be needed in extremely rare cases,
* possibly not at all in practice. It's just included for full correctness.
*/
/*
 * Convert one round key from the VSX format to the generic format: byte-swap
 * each word, optionally apply InvMixColumns, and reverse the word order.
 * All of @in is read before @out is written, so @out may alias @in.
 */
static void rndkey_from_vsx(u32 out[4], const u32 in[4], bool apply_inv_mix)
{
	u32 w[4];

	for (int i = 0; i < 4; i++) {
		w[i] = swab32(in[i]);
		if (apply_inv_mix)
			w[i] = inv_mix_columns(w[i]);
	}
	out[0] = w[3];
	out[1] = w[2];
	out[2] = w[1];
	out[3] = w[0];
}
/*
 * Prepare the round keys, using the VSX assembly (and its key format) when
 * the CPU supports it and vector registers are currently usable; otherwise
 * fall back to the generic expansion, marking the key as generic-format by
 * setting nrounds to 0 (see is_vsx_format()).
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	const int keybits = 8 * key_len;
	int ret;

	if (static_branch_likely(&have_vec_crypto) && likely(may_use_simd())) {
		/* VSX use requires preemption and page faults to be disabled. */
		preempt_disable();
		pagefault_disable();
		enable_kernel_vsx();
		ret = aes_p8_set_encrypt_key(in_key, keybits, &k->p8);
		/*
		 * aes_p8_set_encrypt_key() should never fail here, since the
		 * key length was already validated.
		 */
		WARN_ON_ONCE(ret);
		if (inv_k) {
			ret = aes_p8_set_decrypt_key(in_key, keybits,
						     &inv_k->p8);
			/* ... and likewise for aes_p8_set_decrypt_key(). */
			WARN_ON_ONCE(ret);
		}
		disable_kernel_vsx();
		pagefault_enable();
		preempt_enable();
	} else {
		aes_expandkey_generic(k->rndkeys,
				      inv_k ? inv_k->inv_rndkeys : NULL,
				      in_key, key_len);
		/* Mark the key as using the generic format. */
		k->p8.nrounds = 0;
		if (inv_k)
			inv_k->p8.nrounds = 0;
	}
}
/*
 * Encrypt one block.  Three cases: VSX key with VSX currently usable (fast
 * path), VSX key but VSX not usable (convert the schedule on the fly), or
 * generic-format key (plain generic encryption).
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (static_branch_likely(&have_vec_crypto) &&
	    likely(is_vsx_format(&key->k.p8) && may_use_simd())) {
		preempt_disable();
		pagefault_disable();
		enable_kernel_vsx();
		aes_p8_encrypt(in, out, &key->k.p8);
		disable_kernel_vsx();
		pagefault_enable();
		preempt_enable();
	} else if (unlikely(is_vsx_format(&key->k.p8))) {
		/*
		 * This handles (the hopefully extremely rare) case where a key
		 * was prepared using the VSX optimized format, then encryption
		 * is done in a context that cannot use VSX instructions.
		 */
		u32 rndkeys[AES_MAX_KEYLENGTH_U32];

		/* Convert all nrounds + 1 round keys to the generic format. */
		for (int i = 0; i < 4 * (key->nrounds + 1); i += 4)
			rndkey_from_vsx(&rndkeys[i],
					&key->k.p8.rndkeys[i], false);
		aes_encrypt_generic(rndkeys, key->nrounds, out, in);
	} else {
		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
	}
}
/*
 * Decrypt one block; same three cases as aes_encrypt_arch().  For the
 * on-the-fly conversion, the first and last round keys are converted without
 * InvMixColumns and all middle ones with it, the standard construction of
 * the equivalent-inverse-cipher key schedule.
 */
static void aes_decrypt_arch(const struct aes_key *key, u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (static_branch_likely(&have_vec_crypto) &&
	    likely(is_vsx_format(&key->inv_k.p8) && may_use_simd())) {
		preempt_disable();
		pagefault_disable();
		enable_kernel_vsx();
		aes_p8_decrypt(in, out, &key->inv_k.p8);
		disable_kernel_vsx();
		pagefault_enable();
		preempt_enable();
	} else if (unlikely(is_vsx_format(&key->inv_k.p8))) {
		/*
		 * This handles (the hopefully extremely rare) case where a key
		 * was prepared using the VSX optimized format, then decryption
		 * is done in a context that cannot use VSX instructions.
		 */
		u32 inv_rndkeys[AES_MAX_KEYLENGTH_U32];
		int i;

		rndkey_from_vsx(&inv_rndkeys[0],
				&key->inv_k.p8.rndkeys[0], false);
		for (i = 4; i < 4 * key->nrounds; i += 4) {
			rndkey_from_vsx(&inv_rndkeys[i],
					&key->inv_k.p8.rndkeys[i], true);
		}
		rndkey_from_vsx(&inv_rndkeys[i],
				&key->inv_k.p8.rndkeys[i], false);
		aes_decrypt_generic(inv_rndkeys, key->nrounds, out, in);
	} else {
		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
				    out, in);
	}
}
#define aes_mod_init_arch aes_mod_init_arch
/*
 * Enable the VSX code only when the CPU implements both ISA 2.07 and the
 * vector crypto facility advertised via the user feature bits.
 */
static void aes_mod_init_arch(void)
{
	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
	    (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
		static_branch_enable(&have_vec_crypto);
}
#endif /* !CONFIG_SPE */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')
#include <linux/linkage.h>
.text
.option arch, +zvkned
#include "../../arch/riscv/crypto/aes-macros.S"
#define RNDKEYS a0
#define KEY_LEN a1
#define OUTP a2
#define INP a3
// Load one 16-byte block from INP, en/decrypt it with the round keys that
// aes_begin already loaded into vector registers, and store it to OUTP.
.macro	__aes_crypt_zvkned	enc, keybits
	vle32.v		v16, (INP)
	aes_crypt	v16, \enc, \keybits
	vse32.v		v16, (OUTP)
	ret
.endm

// Dispatch on the key length: aes_begin branches to 128f or 192f for those
// key sizes; fall-through handles AES-256.  Each case returns directly.
.macro	aes_crypt_zvkned	enc
	aes_begin	RNDKEYS, 128f, 192f, KEY_LEN
	__aes_crypt_zvkned	\enc, 256
128:
	__aes_crypt_zvkned	\enc, 128
192:
	__aes_crypt_zvkned	\enc, 192
.endm

// void aes_encrypt_zvkned(const u32 rndkeys[], int key_len,
//			   u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_zvkned)
	aes_crypt_zvkned	1
SYM_FUNC_END(aes_encrypt_zvkned)

// void aes_decrypt_zvkned(const u32 rndkeys[], int key_len,
//			   u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_zvkned)
	aes_crypt_zvkned	0
SYM_FUNC_END(aes_decrypt_zvkned)

63
lib/crypto/riscv/aes.h Normal file
View File

@@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2023 VRULL GmbH
* Copyright (C) 2023 SiFive, Inc.
* Copyright 2024 Google LLC
*/
#include <asm/simd.h>
#include <asm/vector.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_zvkned);
void aes_encrypt_zvkned(const u32 rndkeys[], int key_len,
u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
void aes_decrypt_zvkned(const u32 rndkeys[], int key_len,
u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
static void aes_preparekey_arch(union aes_enckey_arch *k,
union aes_invkey_arch *inv_k,
const u8 *in_key, int key_len, int nrounds)
{
aes_expandkey_generic(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
in_key, key_len);
}
/*
 * Encrypt one block with the Zvkned assembly when the vector unit is usable
 * in this context; otherwise use the generic C implementation.
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (static_branch_likely(&have_zvkned) && likely(may_use_simd())) {
		kernel_vector_begin();
		aes_encrypt_zvkned(key->k.rndkeys, key->len, out, in);
		kernel_vector_end();
	} else {
		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
	}
}
/* Decrypt one block; see aes_encrypt_arch() for the dispatch logic. */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	/*
	 * Note that the Zvkned code uses the standard round keys, while the
	 * fallback uses the inverse round keys.  Thus both must be present.
	 */
	if (static_branch_likely(&have_zvkned) && likely(may_use_simd())) {
		kernel_vector_begin();
		aes_decrypt_zvkned(key->k.rndkeys, key->len, out, in);
		kernel_vector_end();
	} else {
		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
				    out, in);
	}
}
#define aes_mod_init_arch aes_mod_init_arch
/*
 * The Zvkned code requires the extension itself plus VLEN >= 128, which is
 * what the assembly was written for.
 */
static void aes_mod_init_arch(void)
{
	if (riscv_isa_extension_available(NULL, ZVKNED) &&
	    riscv_vector_vlen() >= 128)
		static_branch_enable(&have_zvkned);
}

106
lib/crypto/s390/aes.h Normal file
View File

@@ -0,0 +1,106 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* AES optimized using the CP Assist for Cryptographic Functions (CPACF)
*
* Copyright 2026 Google LLC
*/
#include <asm/cpacf.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_aes128);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_aes192);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_aes256);
/*
* When the CPU supports CPACF AES for the requested key length, we need only
* save a copy of the raw AES key, as that's what the CPACF instructions need.
*
* When unsupported, fall back to the generic key expansion and en/decryption.
*/
static void aes_preparekey_arch(union aes_enckey_arch *k,
union aes_invkey_arch *inv_k,
const u8 *in_key, int key_len, int nrounds)
{
if (key_len == AES_KEYSIZE_128) {
if (static_branch_likely(&have_cpacf_aes128)) {
memcpy(k->raw_key, in_key, AES_KEYSIZE_128);
return;
}
} else if (key_len == AES_KEYSIZE_192) {
if (static_branch_likely(&have_cpacf_aes192)) {
memcpy(k->raw_key, in_key, AES_KEYSIZE_192);
return;
}
} else {
if (static_branch_likely(&have_cpacf_aes256)) {
memcpy(k->raw_key, in_key, AES_KEYSIZE_256);
return;
}
}
aes_expandkey_generic(k->rndkeys, inv_k ? inv_k->inv_rndkeys : NULL,
in_key, key_len);
}
/*
 * Try to en/decrypt one block with the CPACF KM instruction.  @decrypt is
 * either 0 or CPACF_DECRYPT and is OR-ed into the KM function code.
 * Returns false when CPACF doesn't support this key length, in which case
 * the caller must use the generic fallback.
 */
static inline bool aes_crypt_s390(const struct aes_enckey *key,
				  u8 out[AES_BLOCK_SIZE],
				  const u8 in[AES_BLOCK_SIZE], int decrypt)
{
	if (key->len == AES_KEYSIZE_128) {
		if (static_branch_likely(&have_cpacf_aes128)) {
			cpacf_km(CPACF_KM_AES_128 | decrypt,
				 (void *)key->k.raw_key, out, in,
				 AES_BLOCK_SIZE);
			return true;
		}
	} else if (key->len == AES_KEYSIZE_192) {
		if (static_branch_likely(&have_cpacf_aes192)) {
			cpacf_km(CPACF_KM_AES_192 | decrypt,
				 (void *)key->k.raw_key, out, in,
				 AES_BLOCK_SIZE);
			return true;
		}
	} else {
		if (static_branch_likely(&have_cpacf_aes256)) {
			cpacf_km(CPACF_KM_AES_256 | decrypt,
				 (void *)key->k.raw_key, out, in,
				 AES_BLOCK_SIZE);
			return true;
		}
	}
	return false;
}
/* Encrypt one block via CPACF, or the generic code if CPACF is unavailable. */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (likely(aes_crypt_s390(key, out, in, 0)))
		return;
	aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
}
/*
 * Decrypt one block via CPACF, or the generic code if CPACF is unavailable.
 *
 * NOTE(review): the cast assumes struct aes_key begins with a layout
 * compatible with struct aes_enckey (len, nrounds, k) — confirm against the
 * struct definitions in the common header.
 */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (likely(aes_crypt_s390((const struct aes_enckey *)key, out, in,
				  CPACF_DECRYPT)))
		return;
	aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds, out, in);
}
#define aes_mod_init_arch aes_mod_init_arch
/*
 * Query the CPACF KM facility once at init and enable a static key per
 * supported AES key length.
 */
static void aes_mod_init_arch(void)
{
	if (cpu_have_feature(S390_CPU_FEATURE_MSA)) {
		cpacf_mask_t km_functions;

		cpacf_query(CPACF_KM, &km_functions);
		if (cpacf_test_func(&km_functions, CPACF_KM_AES_128))
			static_branch_enable(&have_cpacf_aes128);
		if (cpacf_test_func(&km_functions, CPACF_KM_AES_192))
			static_branch_enable(&have_cpacf_aes192);
		if (cpacf_test_func(&km_functions, CPACF_KM_AES_256))
			static_branch_enable(&have_cpacf_aes256);
	}
}

149
lib/crypto/sparc/aes.h Normal file
View File

@@ -0,0 +1,149 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* AES accelerated using the sparc64 aes opcodes
*
* Copyright (C) 2008, Intel Corp.
* Copyright (c) 2010, Intel Corporation.
* Copyright 2026 Google LLC
*/
#include <asm/fpumacro.h>
#include <asm/opcodes.h>
#include <asm/pstate.h>
#include <asm/elf.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes_opcodes);
EXPORT_SYMBOL_GPL(aes_sparc64_key_expand);
EXPORT_SYMBOL_GPL(aes_sparc64_load_encrypt_keys_128);
EXPORT_SYMBOL_GPL(aes_sparc64_load_encrypt_keys_192);
EXPORT_SYMBOL_GPL(aes_sparc64_load_encrypt_keys_256);
EXPORT_SYMBOL_GPL(aes_sparc64_load_decrypt_keys_128);
EXPORT_SYMBOL_GPL(aes_sparc64_load_decrypt_keys_192);
EXPORT_SYMBOL_GPL(aes_sparc64_load_decrypt_keys_256);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_encrypt_128);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_encrypt_192);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_encrypt_256);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_decrypt_128);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_decrypt_192);
EXPORT_SYMBOL_GPL(aes_sparc64_ecb_decrypt_256);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_encrypt_128);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_encrypt_192);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_encrypt_256);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_decrypt_128);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_decrypt_192);
EXPORT_SYMBOL_GPL(aes_sparc64_cbc_decrypt_256);
EXPORT_SYMBOL_GPL(aes_sparc64_ctr_crypt_128);
EXPORT_SYMBOL_GPL(aes_sparc64_ctr_crypt_192);
EXPORT_SYMBOL_GPL(aes_sparc64_ctr_crypt_256);
void aes_sparc64_encrypt_128(const u64 *key, const u32 *input, u32 *output);
void aes_sparc64_encrypt_192(const u64 *key, const u32 *input, u32 *output);
void aes_sparc64_encrypt_256(const u64 *key, const u32 *input, u32 *output);
void aes_sparc64_decrypt_128(const u64 *key, const u32 *input, u32 *output);
void aes_sparc64_decrypt_192(const u64 *key, const u32 *input, u32 *output);
void aes_sparc64_decrypt_256(const u64 *key, const u32 *input, u32 *output);
/*
 * Expand the key with the sparc64 AES opcodes when available.  The assembly
 * requires 4-byte-aligned input, so bounce through a stack buffer (zeroized
 * afterwards, since it holds key material) when the caller's key is
 * misaligned.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	if (static_branch_likely(&have_aes_opcodes)) {
		u32 aligned_key[AES_MAX_KEY_SIZE / 4];

		if (IS_ALIGNED((uintptr_t)in_key, 4)) {
			aes_sparc64_key_expand((const u32 *)in_key,
					       k->sparc_rndkeys, key_len);
		} else {
			memcpy(aligned_key, in_key, key_len);
			aes_sparc64_key_expand(aligned_key,
					       k->sparc_rndkeys, key_len);
			memzero_explicit(aligned_key, key_len);
		}
		/*
		 * Note that nothing needs to be written to inv_k (if it's
		 * non-NULL) here, since the SPARC64 assembly code uses
		 * k->sparc_rndkeys for both encryption and decryption.
		 */
	} else {
		aes_expandkey_generic(k->rndkeys,
				      inv_k ? inv_k->inv_rndkeys : NULL,
				      in_key, key_len);
	}
}
/* Dispatch to the key-length-specific sparc64 encryption routine. */
static void aes_sparc64_encrypt(const struct aes_enckey *key,
				const u32 *input, u32 *output)
{
	switch (key->len) {
	case AES_KEYSIZE_128:
		aes_sparc64_encrypt_128(key->k.sparc_rndkeys, input, output);
		break;
	case AES_KEYSIZE_192:
		aes_sparc64_encrypt_192(key->k.sparc_rndkeys, input, output);
		break;
	default:
		aes_sparc64_encrypt_256(key->k.sparc_rndkeys, input, output);
		break;
	}
}
/*
 * Encrypt one block.  The sparc64 assembly requires 4-byte-aligned in/out
 * pointers, so misaligned callers go through a stack bounce buffer.
 */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	u32 bounce_buf[AES_BLOCK_SIZE / 4];

	if (static_branch_likely(&have_aes_opcodes)) {
		if (IS_ALIGNED((uintptr_t)in | (uintptr_t)out, 4)) {
			aes_sparc64_encrypt(key, (const u32 *)in, (u32 *)out);
		} else {
			memcpy(bounce_buf, in, AES_BLOCK_SIZE);
			aes_sparc64_encrypt(key, bounce_buf, bounce_buf);
			memcpy(out, bounce_buf, AES_BLOCK_SIZE);
		}
	} else {
		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
	}
}
/* Dispatch to the key-length-specific sparc64 decryption routine. */
static void aes_sparc64_decrypt(const struct aes_key *key,
				const u32 *input, u32 *output)
{
	switch (key->len) {
	case AES_KEYSIZE_128:
		aes_sparc64_decrypt_128(key->k.sparc_rndkeys, input, output);
		break;
	case AES_KEYSIZE_192:
		aes_sparc64_decrypt_192(key->k.sparc_rndkeys, input, output);
		break;
	default:
		aes_sparc64_decrypt_256(key->k.sparc_rndkeys, input, output);
		break;
	}
}
/*
 * Decrypt one block; same alignment handling as aes_encrypt_arch().
 */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	u32 bounce_buf[AES_BLOCK_SIZE / 4];

	if (static_branch_likely(&have_aes_opcodes)) {
		if (IS_ALIGNED((uintptr_t)in | (uintptr_t)out, 4)) {
			aes_sparc64_decrypt(key, (const u32 *)in, (u32 *)out);
		} else {
			memcpy(bounce_buf, in, AES_BLOCK_SIZE);
			aes_sparc64_decrypt(key, bounce_buf, bounce_buf);
			memcpy(out, bounce_buf, AES_BLOCK_SIZE);
		}
	} else {
		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
				    out, in);
	}
}
#define aes_mod_init_arch aes_mod_init_arch
/*
 * The AES opcodes require both the HWCAP_SPARC_CRYPTO capability and the
 * AES bit in the crypto feature register (%asr26).
 */
static void aes_mod_init_arch(void)
{
	unsigned long cfr;

	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
		return;
	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
	if (!(cfr & CFR_AES))
		return;
	static_branch_enable(&have_aes_opcodes);
}

1543
lib/crypto/sparc/aes_asm.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -38,6 +38,23 @@ config CRYPTO_LIB_MD5_KUNIT_TEST
KUnit tests for the MD5 cryptographic hash function and its
corresponding HMAC.
# Tests (and optional benchmark) for the ML-DSA signature verification code.
config CRYPTO_LIB_MLDSA_KUNIT_TEST
	tristate "KUnit tests for ML-DSA" if !KUNIT_ALL_TESTS
	depends on KUNIT
	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
	select CRYPTO_LIB_BENCHMARK_VISIBLE
	select CRYPTO_LIB_MLDSA
	help
	  KUnit tests for the ML-DSA digital signature algorithm.
# Tests for the NH hash function used by Adiantum.
config CRYPTO_LIB_NH_KUNIT_TEST
	tristate "KUnit tests for NH" if !KUNIT_ALL_TESTS
	depends on KUNIT
	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
	select CRYPTO_LIB_NH
	help
	  KUnit tests for the NH almost-universal hash function.
config CRYPTO_LIB_POLY1305_KUNIT_TEST
tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS
depends on KUNIT

View File

@@ -4,6 +4,8 @@ obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o
# One KUnit test module per library algorithm, gated by its own Kconfig option.
obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
obj-$(CONFIG_CRYPTO_LIB_MLDSA_KUNIT_TEST) += mldsa_kunit.o
obj-$(CONFIG_CRYPTO_LIB_NH_KUNIT_TEST) += nh_kunit.o
obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,438 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* KUnit tests and benchmark for ML-DSA
*
* Copyright 2025 Google LLC
*/
#include <crypto/mldsa.h>
#include <kunit/test.h>
#include <linux/random.h>
#include <linux/unaligned.h>
#define Q 8380417 /* The prime q = 2^23 - 2^13 + 1 */
/* ML-DSA parameters that the tests use */
static const struct {
int sig_len;
int pk_len;
int k;
int lambda;
int gamma1;
int beta;
int omega;
} params[] = {
[MLDSA44] = {
.sig_len = MLDSA44_SIGNATURE_SIZE,
.pk_len = MLDSA44_PUBLIC_KEY_SIZE,
.k = 4,
.lambda = 128,
.gamma1 = 1 << 17,
.beta = 78,
.omega = 80,
},
[MLDSA65] = {
.sig_len = MLDSA65_SIGNATURE_SIZE,
.pk_len = MLDSA65_PUBLIC_KEY_SIZE,
.k = 6,
.lambda = 192,
.gamma1 = 1 << 19,
.beta = 196,
.omega = 55,
},
[MLDSA87] = {
.sig_len = MLDSA87_SIGNATURE_SIZE,
.pk_len = MLDSA87_PUBLIC_KEY_SIZE,
.k = 8,
.lambda = 256,
.gamma1 = 1 << 19,
.beta = 120,
.omega = 75,
},
};
#include "mldsa-testvecs.h"
/* Verify the test vector's (unmodified) signature and assert it is valid. */
static void do_mldsa_and_assert_success(struct kunit *test,
					const struct mldsa_testvector *tv)
{
	int err = mldsa_verify(tv->alg, tv->sig, tv->sig_len, tv->msg,
			       tv->msg_len, tv->pk, tv->pk_len);

	KUNIT_ASSERT_EQ(test, err, 0);
}
/*
 * Duplicate @len bytes of @src into a test-managed (auto-freed) buffer,
 * aborting the test on allocation failure.
 */
static u8 *kunit_kmemdup_or_fail(struct kunit *test, const u8 *src, size_t len)
{
	u8 *dst = kunit_kmalloc(test, len, GFP_KERNEL);

	KUNIT_ASSERT_NOT_NULL(test, dst);
	return memcpy(dst, src, len);
}
/*
* Test that changing coefficients in a valid signature's z vector results in
* the following behavior from mldsa_verify():
*
* * -EBADMSG if a coefficient is changed to have an out-of-range value, i.e.
* absolute value >= gamma1 - beta, corresponding to the verifier detecting
* the out-of-range coefficient and rejecting the signature as malformed
*
* * -EKEYREJECTED if a coefficient is changed to a different in-range value,
* i.e. absolute value < gamma1 - beta, corresponding to the verifier
* continuing to the "real" signature check and that check failing
*/
static void test_mldsa_z_range(struct kunit *test,
			       const struct mldsa_testvector *tv)
{
	u8 *sig = kunit_kmemdup_or_fail(test, tv->sig, tv->sig_len);
	const int lambda = params[tv->alg].lambda;
	const s32 gamma1 = params[tv->alg].gamma1;
	const int beta = params[tv->alg].beta;
	/*
	 * We just modify the first coefficient.  The coefficient is gamma1
	 * minus either the first 18 or 20 bits of the u32, depending on gamma1.
	 *
	 * The layout of ML-DSA signatures is ctilde || z || h.  ctilde is
	 * lambda / 4 bytes, so z starts at &sig[lambda / 4].
	 */
	u8 *z_ptr = &sig[lambda / 4];
	const u32 z_data = get_unaligned_le32(z_ptr);
	/* 2*gamma1 - 1: selects the 18 or 20 coefficient bits of the u32 */
	const u32 mask = (gamma1 << 1) - 1;
	/* These are the four boundaries of the out-of-range values. */
	const s32 out_of_range_coeffs[] = {
		-gamma1 + 1,
		-(gamma1 - beta),
		gamma1,
		gamma1 - beta,
	};
	/*
	 * These are the two boundaries of the valid range, along with 0.  We
	 * assume that none of these matches the original coefficient.
	 */
	const s32 in_range_coeffs[] = {
		-(gamma1 - beta - 1),
		0,
		gamma1 - beta - 1,
	};

	/* Initially the signature is valid. */
	do_mldsa_and_assert_success(test, tv);

	/* Test some out-of-range coefficients: expect rejection as malformed. */
	for (int i = 0; i < ARRAY_SIZE(out_of_range_coeffs); i++) {
		const s32 c = out_of_range_coeffs[i];

		/* Encode coefficient c as gamma1 - c in the low mask bits. */
		put_unaligned_le32((z_data & ~mask) | (mask & (gamma1 - c)),
				   z_ptr);
		KUNIT_ASSERT_EQ(test, -EBADMSG,
				mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
					     tv->msg_len, tv->pk, tv->pk_len));
	}

	/* Test some in-range coefficients: expect the real check to fail. */
	for (int i = 0; i < ARRAY_SIZE(in_range_coeffs); i++) {
		const s32 c = in_range_coeffs[i];

		put_unaligned_le32((z_data & ~mask) | (mask & (gamma1 - c)),
				   z_ptr);
		KUNIT_ASSERT_EQ(test, -EKEYREJECTED,
				mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
					     tv->msg_len, tv->pk, tv->pk_len));
	}
}
/* Test that mldsa_verify() rejects malformed hint vectors with -EBADMSG. */
static void test_mldsa_bad_hints(struct kunit *test,
				 const struct mldsa_testvector *tv)
{
	const int omega = params[tv->alg].omega;
	const int k = params[tv->alg].k;
	u8 *sig = kunit_kmemdup_or_fail(test, tv->sig, tv->sig_len);
	/*
	 * Pointer to the encoded hint vector in the signature.  The encoding
	 * is omega hint index bytes followed by k cumulative-count bytes, so
	 * hintvec[omega + i] is the cumulative hint count through element i.
	 */
	u8 *hintvec = &sig[tv->sig_len - omega - k];
	u8 h;

	/* Initially the signature is valid. */
	do_mldsa_and_assert_success(test, tv);

	/* Cumulative hint count exceeds omega */
	memcpy(sig, tv->sig, tv->sig_len);
	hintvec[omega + k - 1] = omega + 1;
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));

	/* Cumulative hint count decreases */
	memcpy(sig, tv->sig, tv->sig_len);
	KUNIT_ASSERT_GE(test, hintvec[omega + k - 2], 1);
	hintvec[omega + k - 1] = hintvec[omega + k - 2] - 1;
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));

	/*
	 * Hint indices out of order.  To test this, swap hintvec[0] and
	 * hintvec[1].  This assumes that the original valid signature had at
	 * least two nonzero hints in the first element (asserted below).
	 */
	memcpy(sig, tv->sig, tv->sig_len);
	KUNIT_ASSERT_GE(test, hintvec[omega], 2);
	h = hintvec[0];
	hintvec[0] = hintvec[1];
	hintvec[1] = h;
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));

	/*
	 * Extra hint indices given.  For this test to work, the original valid
	 * signature must have fewer than omega nonzero hints (asserted below).
	 */
	memcpy(sig, tv->sig, tv->sig_len);
	KUNIT_ASSERT_LT(test, hintvec[omega + k - 1], omega);
	hintvec[omega - 1] = 0xff;
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));
}
/*
 * Test that flipping any single random bit of the signature, message, or
 * public key makes verification fail, and that undoing each flip restores a
 * valid signature.
 */
static void test_mldsa_mutation(struct kunit *test,
				const struct mldsa_testvector *tv)
{
	const int sig_len = tv->sig_len;
	const int msg_len = tv->msg_len;
	const int pk_len = tv->pk_len;
	const int num_iter = 200;
	u8 *sig = kunit_kmemdup_or_fail(test, tv->sig, sig_len);
	u8 *msg = kunit_kmemdup_or_fail(test, tv->msg, msg_len);
	u8 *pk = kunit_kmemdup_or_fail(test, tv->pk, pk_len);

	/* Initially the signature is valid. */
	do_mldsa_and_assert_success(test, tv);

	/* Changing any bit in the signature should invalidate the signature */
	for (int i = 0; i < num_iter; i++) {
		size_t pos = get_random_u32_below(sig_len);
		u8 b = 1 << get_random_u32_below(8);

		sig[pos] ^= b;
		KUNIT_ASSERT_NE(test, 0,
				mldsa_verify(tv->alg, sig, sig_len, msg,
					     msg_len, pk, pk_len));
		sig[pos] ^= b;	/* undo the flip */
	}

	/* Changing any bit in the message should invalidate the signature */
	for (int i = 0; i < num_iter; i++) {
		size_t pos = get_random_u32_below(msg_len);
		u8 b = 1 << get_random_u32_below(8);

		msg[pos] ^= b;
		KUNIT_ASSERT_NE(test, 0,
				mldsa_verify(tv->alg, sig, sig_len, msg,
					     msg_len, pk, pk_len));
		msg[pos] ^= b;	/* undo the flip */
	}

	/* Changing any bit in the public key should invalidate the signature */
	for (int i = 0; i < num_iter; i++) {
		size_t pos = get_random_u32_below(pk_len);
		u8 b = 1 << get_random_u32_below(8);

		pk[pos] ^= b;
		KUNIT_ASSERT_NE(test, 0,
				mldsa_verify(tv->alg, sig, sig_len, msg,
					     msg_len, pk, pk_len));
		pk[pos] ^= b;	/* undo the flip */
	}

	/* All changes should have been undone. */
	KUNIT_ASSERT_EQ(test, 0,
			mldsa_verify(tv->alg, sig, sig_len, msg, msg_len, pk,
				     pk_len));
}
/*
 * Main test routine for one parameter set: valid signature, wrong-length
 * inputs, then the more targeted z-range / bad-hint / mutation sub-tests.
 */
static void test_mldsa(struct kunit *test, const struct mldsa_testvector *tv)
{
	/* Valid signature */
	KUNIT_ASSERT_EQ(test, tv->sig_len, params[tv->alg].sig_len);
	KUNIT_ASSERT_EQ(test, tv->pk_len, params[tv->alg].pk_len);
	do_mldsa_and_assert_success(test, tv);

	/* Signature too short */
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, tv->sig, tv->sig_len - 1, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));

	/* Signature too long */
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, tv->sig, tv->sig_len + 1, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len));

	/* Public key too short */
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, tv->sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len - 1));

	/* Public key too long */
	KUNIT_ASSERT_EQ(test, -EBADMSG,
			mldsa_verify(tv->alg, tv->sig, tv->sig_len, tv->msg,
				     tv->msg_len, tv->pk, tv->pk_len + 1));

	/*
	 * Message too short.  Error is EKEYREJECTED because it gets rejected by
	 * the "real" signature check rather than the well-formedness checks.
	 */
	KUNIT_ASSERT_EQ(test, -EKEYREJECTED,
			mldsa_verify(tv->alg, tv->sig, tv->sig_len, tv->msg,
				     tv->msg_len - 1, tv->pk, tv->pk_len));

	/*
	 * Can't simply try (tv->msg, tv->msg_len + 1) too, as tv->msg would be
	 * accessed out of bounds.  However, ML-DSA just hashes the message and
	 * doesn't handle different message lengths differently anyway.
	 */

	/* Test the validity checks on the z vector. */
	test_mldsa_z_range(test, tv);

	/* Test the validity checks on the hint vector. */
	test_mldsa_bad_hints(test, tv);

	/* Test randomly mutating the inputs. */
	test_mldsa_mutation(test, tv);
}
/* Per-parameter-set wrappers so each appears as its own KUnit test case. */
static void test_mldsa44(struct kunit *test)
{
	test_mldsa(test, &mldsa44_testvector);
}

static void test_mldsa65(struct kunit *test)
{
	test_mldsa(test, &mldsa65_testvector);
}

static void test_mldsa87(struct kunit *test)
{
	test_mldsa(test, &mldsa87_testvector);
}
/* Reduce @a modulo @m into the canonical range [0, m). */
static s32 mod(s32 a, s32 m)
{
	const s32 r = a % m;

	return r < 0 ? r + m : r;
}
/* Reduce @a modulo @m into the symmetric range used by FIPS 204 (mod±). */
static s32 symmetric_mod(s32 a, s32 m)
{
	const s32 r = mod(a, m);

	return r > m / 2 ? r - m : r;
}
/* Mechanical, inefficient translation of FIPS 204 Algorithm 36, Decompose */
static void decompose_ref(s32 r, s32 gamma2, s32 *r0, s32 *r1)
{
s32 rplus = mod(r, Q);
*r0 = symmetric_mod(rplus, 2 * gamma2);
if (rplus - *r0 == Q - 1) {
*r1 = 0;
*r0 = *r0 - 1;
} else {
*r1 = (rplus - *r0) / (2 * gamma2);
}
}
/* Mechanical, inefficient translation of FIPS 204 Algorithm 40, UseHint */
static s32 use_hint_ref(u8 h, s32 r, s32 gamma2)
{
s32 m = (Q - 1) / (2 * gamma2);
s32 r0, r1;
decompose_ref(r, gamma2, &r0, &r1);
if (h == 1 && r0 > 0)
return mod(r1 + 1, m);
if (h == 1 && r0 <= 0)
return mod(r1 - 1, m);
return r1;
}
/*
* Test that for all possible inputs, mldsa_use_hint() gives the same output as
* a mechanical translation of the pseudocode from FIPS 204.
*/
static void test_mldsa_use_hint(struct kunit *test)
{
	/* Exercise both gamma2 values used by the ML-DSA parameter sets. */
	for (int i = 0; i < 2; i++) {
		const s32 gamma2 = (Q - 1) / (i == 0 ? 88 : 32);

		/* Try every hint bit and every residue r in [0, Q). */
		for (u8 h = 0; h < 2; h++) {
			for (s32 r = 0; r < Q; r++) {
				KUNIT_ASSERT_EQ(test,
						mldsa_use_hint(h, r, gamma2),
						use_hint_ref(h, r, gamma2));
			}
		}
	}
}
/*
 * Benchmark signature verification for one parameter set and report the
 * throughput in operations per second.  Skipped unless the benchmark option
 * is enabled.
 */
static void benchmark_mldsa(struct kunit *test,
			    const struct mldsa_testvector *tv)
{
	const int warmup_niter = 200;
	const int benchmark_niter = 200;
	u64 t0, t1;

	if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
		kunit_skip(test, "not enabled");

	for (int i = 0; i < warmup_niter; i++)
		do_mldsa_and_assert_success(test, tv);
	t0 = ktime_get_ns();
	for (int i = 0; i < benchmark_niter; i++)
		do_mldsa_and_assert_success(test, tv);
	t1 = ktime_get_ns();
	/* ?: 1 guards against a zero elapsed time. */
	kunit_info(test, "%llu ops/s",
		   div64_u64((u64)benchmark_niter * NSEC_PER_SEC,
			     t1 - t0 ?: 1));
}
/* Per-parameter-set benchmark wrappers, one KUnit case each. */
static void benchmark_mldsa44(struct kunit *test)
{
	benchmark_mldsa(test, &mldsa44_testvector);
}

static void benchmark_mldsa65(struct kunit *test)
{
	benchmark_mldsa(test, &mldsa65_testvector);
}

static void benchmark_mldsa87(struct kunit *test)
{
	benchmark_mldsa(test, &mldsa87_testvector);
}
/* Test case table: correctness tests first, then the benchmarks. */
static struct kunit_case mldsa_kunit_cases[] = {
	KUNIT_CASE(test_mldsa44),
	KUNIT_CASE(test_mldsa65),
	KUNIT_CASE(test_mldsa87),
	KUNIT_CASE(test_mldsa_use_hint),
	KUNIT_CASE(benchmark_mldsa44),
	KUNIT_CASE(benchmark_mldsa65),
	KUNIT_CASE(benchmark_mldsa87),
	{},
};

static struct kunit_suite mldsa_kunit_suite = {
	.name = "mldsa",
	.test_cases = mldsa_kunit_cases,
};
kunit_test_suite(mldsa_kunit_suite);

MODULE_DESCRIPTION("KUnit tests and benchmark for ML-DSA");
MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,298 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py nh */
static const u8 nh_test_key[NH_KEY_BYTES] = {
0x04, 0x59, 0x66, 0x92, 0x81, 0xd7, 0xe9, 0x25,
0x68, 0xfa, 0xb0, 0xca, 0x9f, 0xea, 0x98, 0xca,
0xcd, 0xbf, 0x6d, 0xa5, 0x0c, 0x22, 0xc3, 0x57,
0xdc, 0x35, 0x05, 0xdd, 0x5b, 0xb0, 0xce, 0xf6,
0xb2, 0x4c, 0x77, 0x2e, 0xd2, 0x63, 0xf0, 0x17,
0x60, 0xd8, 0xd3, 0xd9, 0xed, 0x34, 0xb6, 0xed,
0x6a, 0x11, 0xc0, 0x25, 0xda, 0xba, 0x7e, 0xef,
0x49, 0x13, 0xf7, 0xd9, 0xfc, 0xb6, 0xfd, 0x58,
0xe9, 0x5f, 0xc5, 0xc4, 0x69, 0x89, 0xba, 0xa6,
0x2b, 0x58, 0x8d, 0x36, 0x6c, 0xb9, 0x90, 0x1e,
0x64, 0xc7, 0x44, 0x84, 0x03, 0x70, 0x30, 0x47,
0xdd, 0x58, 0xf4, 0x87, 0x61, 0xfd, 0x9c, 0x6b,
0x51, 0x1b, 0x39, 0x1d, 0x6d, 0x50, 0xae, 0x19,
0x71, 0x03, 0xc7, 0xa7, 0x42, 0x82, 0x8f, 0xa5,
0x63, 0x6a, 0xe2, 0x8a, 0xad, 0x4b, 0x40, 0xa7,
0x3f, 0x8b, 0xe4, 0xae, 0xb2, 0x8a, 0x14, 0x78,
0x91, 0x07, 0xba, 0x02, 0x08, 0xc1, 0x34, 0xb8,
0xda, 0x61, 0x67, 0xf6, 0x98, 0x97, 0x1a, 0xcb,
0x0f, 0x82, 0x80, 0xff, 0x02, 0x54, 0x16, 0x57,
0x18, 0x35, 0xaf, 0x16, 0x17, 0x68, 0xcc, 0xc7,
0x52, 0xac, 0x31, 0x39, 0x60, 0xe4, 0xb4, 0xcb,
0x0e, 0xf9, 0x57, 0xe9, 0x96, 0xff, 0x99, 0xd6,
0x10, 0x96, 0x09, 0xab, 0x28, 0x92, 0x1b, 0x9f,
0x10, 0xde, 0x3e, 0x87, 0xb8, 0x9d, 0x2d, 0xa0,
0x3c, 0x91, 0x85, 0x8c, 0x9e, 0xc0, 0x97, 0x9a,
0xb4, 0x54, 0x7f, 0x4a, 0x63, 0xc2, 0x75, 0x0f,
0x0d, 0x2f, 0x62, 0x56, 0x48, 0x0e, 0xb6, 0xc7,
0xcf, 0x0d, 0x78, 0xca, 0xbd, 0x31, 0x9e, 0x4c,
0xf7, 0x3f, 0x9e, 0xc2, 0xea, 0x5e, 0x44, 0x6d,
0x76, 0xf9, 0xc5, 0xe0, 0x29, 0xea, 0x15, 0xbf,
0xaf, 0xd4, 0x75, 0xc8, 0x89, 0xcf, 0x4f, 0x17,
0xfd, 0x4a, 0x45, 0xa5, 0x4d, 0x2d, 0x87, 0x11,
0x2b, 0x3e, 0x64, 0xa2, 0x6b, 0xc5, 0x23, 0x8c,
0xfa, 0x71, 0x13, 0x72, 0x0e, 0x7c, 0xe1, 0x2c,
0x9f, 0x0e, 0x29, 0xc9, 0x15, 0xde, 0x4e, 0xd7,
0x42, 0x1f, 0x8e, 0xe1, 0x91, 0x99, 0x50, 0x38,
0x7f, 0x15, 0xc0, 0xf6, 0x4b, 0xfd, 0x9d, 0x40,
0xe9, 0x44, 0x51, 0xca, 0x3b, 0x83, 0x41, 0x9f,
0x82, 0x64, 0x66, 0x22, 0x12, 0x43, 0x1c, 0x4f,
0x45, 0x11, 0x3a, 0x46, 0xb1, 0x7c, 0x62, 0x0a,
0x9d, 0x4c, 0x99, 0x85, 0xb0, 0x10, 0x19, 0xcf,
0xeb, 0xf9, 0x65, 0xaf, 0xd8, 0x05, 0x9e, 0x61,
0x03, 0x5f, 0x15, 0x99, 0xa9, 0x05, 0x20, 0xc8,
0xaf, 0xab, 0x31, 0x9d, 0xd5, 0xdf, 0x24, 0xce,
0x2b, 0x6d, 0xd7, 0x17, 0xc3, 0x04, 0xff, 0x82,
0xa7, 0x18, 0x39, 0xe9, 0x0d, 0x0a, 0x5f, 0xb9,
0xc9, 0x86, 0x1d, 0xf8, 0x02, 0x2d, 0xc3, 0x88,
0x28, 0x73, 0x5c, 0xac, 0x25, 0xc9, 0xfe, 0xcb,
0xd2, 0xfd, 0x63, 0x74, 0xac, 0xe1, 0xb8, 0xa2,
0xc6, 0x2b, 0xb5, 0x40, 0x01, 0x9b, 0xed, 0xee,
0x7b, 0x63, 0x66, 0x05, 0x45, 0xc2, 0x6c, 0xd8,
0x58, 0xf1, 0xa1, 0x3d, 0xc8, 0x43, 0x59, 0x4b,
0x39, 0x87, 0x24, 0x64, 0x92, 0xb0, 0xab, 0x75,
0xf1, 0xb7, 0xbf, 0x7c, 0xde, 0xc0, 0xaf, 0x4a,
0xc2, 0x7b, 0xd9, 0x8a, 0x99, 0xcd, 0x83, 0x01,
0xe6, 0xae, 0xeb, 0x16, 0xe7, 0x54, 0x9c, 0x95,
0x0a, 0x91, 0x02, 0xaf, 0x9f, 0x79, 0x40, 0x45,
0xce, 0x47, 0x41, 0x65, 0xca, 0x80, 0x0d, 0x14,
0x46, 0x58, 0x5d, 0x4d, 0x28, 0x55, 0x70, 0x49,
0x7c, 0x32, 0x1f, 0x01, 0xaa, 0x05, 0x2f, 0xf1,
0xeb, 0xa3, 0xe6, 0x1d, 0xf9, 0x43, 0xe0, 0x58,
0x05, 0x61, 0x22, 0xc3, 0xee, 0xe4, 0x6f, 0x94,
0xaf, 0x82, 0xda, 0x18, 0x18, 0x63, 0x9c, 0xfa,
0xc0, 0x04, 0x27, 0xc5, 0x39, 0x5e, 0x7a, 0xa6,
0x85, 0x46, 0xb7, 0x76, 0xc9, 0x16, 0xf2, 0xf8,
0x40, 0x8d, 0x4b, 0x5e, 0x72, 0xf3, 0x3e, 0x12,
0xa4, 0x80, 0x39, 0xb2, 0x92, 0xfe, 0x6e, 0x5b,
0x5b, 0xad, 0xea, 0x29, 0xbc, 0x66, 0xe6, 0xfe,
0x80, 0x02, 0x5d, 0x83, 0x37, 0xfc, 0xde, 0x6c,
0x25, 0x54, 0xa2, 0xff, 0x7d, 0xb6, 0xe1, 0xd6,
0xcf, 0xdb, 0x60, 0xe3, 0xbe, 0x2f, 0x4e, 0xb4,
0xf5, 0xb4, 0x51, 0xf7, 0x5a, 0x25, 0xda, 0x40,
0x84, 0x5e, 0xc0, 0x0a, 0x6b, 0xfa, 0x0c, 0xfb,
0x5e, 0x3e, 0x12, 0x6c, 0x39, 0x35, 0xc0, 0x28,
0xd6, 0x1b, 0x3a, 0x72, 0xc3, 0xfe, 0xa5, 0x4c,
0x35, 0xa2, 0x42, 0xf6, 0x3d, 0xa5, 0xbf, 0xb5,
0x39, 0xe3, 0xc9, 0xd5, 0x8c, 0x1b, 0xe5, 0xef,
0x91, 0xd2, 0x80, 0x6f, 0xcc, 0x77, 0x44, 0x50,
0x62, 0xc7, 0xac, 0x29, 0xcb, 0x72, 0xda, 0x6d,
0xc5, 0xfe, 0xa7, 0xee, 0x8b, 0xeb, 0xfc, 0xa3,
0x46, 0x18, 0x5f, 0xaa, 0xc3, 0x65, 0xd0, 0x8f,
0x67, 0x98, 0xd6, 0xce, 0x5f, 0x84, 0xd4, 0x96,
0x1b, 0x67, 0xa0, 0xcf, 0xfc, 0x94, 0x55, 0x5e,
0x4b, 0x51, 0x68, 0xa7, 0x6d, 0x02, 0xf9, 0x53,
0x54, 0x86, 0x6b, 0x53, 0x39, 0xe0, 0x36, 0x23,
0x87, 0x1a, 0xfb, 0x53, 0x1a, 0x65, 0xd8, 0x42,
0xa8, 0x85, 0xfd, 0x2c, 0x7f, 0x6b, 0x7f, 0x67,
0x70, 0x23, 0x6c, 0xe9, 0x0b, 0xf0, 0x1e, 0x0d,
0x0b, 0xb4, 0xd4, 0x96, 0x14, 0x95, 0x7e, 0xf3,
0x9b, 0xdd, 0xd7, 0xc4, 0x24, 0x22, 0xb9, 0x9d,
0xb3, 0xa6, 0xac, 0x09, 0x7c, 0x00, 0xbf, 0xd0,
0xdc, 0xfb, 0x9b, 0x7c, 0x8c, 0xbd, 0xd4, 0x1a,
0x13, 0x2b, 0x82, 0x3d, 0x7c, 0x8c, 0x10, 0x47,
0x49, 0x6c, 0x53, 0xeb, 0xa7, 0xc2, 0xde, 0xed,
0xe2, 0x55, 0x93, 0x2c, 0x1a, 0x5a, 0x7d, 0xe1,
0x37, 0x62, 0xdd, 0x29, 0x1a, 0x72, 0x82, 0xc0,
0x14, 0x73, 0x5d, 0x0e, 0x9b, 0xcc, 0x54, 0x68,
0x3a, 0x4d, 0x56, 0x8f, 0xc9, 0x4e, 0xaf, 0x7b,
0xde, 0x17, 0x9c, 0x5e, 0x83, 0x82, 0x22, 0xe3,
0x28, 0xdf, 0x1b, 0xb6, 0xdb, 0x17, 0x90, 0x48,
0xb5, 0x13, 0x4e, 0xd3, 0x97, 0x5e, 0xb3, 0x9c,
0x16, 0x08, 0xc8, 0x77, 0xb3, 0xcd, 0x94, 0x90,
0x4f, 0x77, 0xaf, 0x67, 0xdd, 0x80, 0x15, 0x1c,
0x59, 0xfb, 0x3c, 0xec, 0xf8, 0xb3, 0x67, 0xfb,
0xa0, 0x94, 0x3c, 0x53, 0x99, 0x49, 0x94, 0x2c,
0x85, 0x26, 0x92, 0x6d, 0x8d, 0x48, 0xf6, 0x72,
0xdd, 0xfb, 0xb2, 0x10, 0x51, 0x5b, 0xbe, 0xd5,
0x70, 0x3d, 0x28, 0x94, 0x98, 0x4f, 0x6e, 0x20,
0x7b, 0x7d, 0x0f, 0x56, 0xc9, 0x96, 0x5f, 0x60,
0x2e, 0x2f, 0x9b, 0x38, 0x7f, 0xc7, 0x3c, 0x6b,
0x2f, 0x2b, 0x8f, 0x1f, 0x07, 0x1c, 0x85, 0x57,
0x16, 0x2e, 0xc7, 0x74, 0xe5, 0xf2, 0x0d, 0xfe,
0xef, 0x57, 0xb0, 0xa4, 0x4f, 0x4c, 0x7d, 0x81,
0xbb, 0xaa, 0xcb, 0xa0, 0xb0, 0x51, 0xcf, 0xc2,
0xee, 0x90, 0x2e, 0x5e, 0x27, 0xca, 0xd3, 0xe8,
0xf3, 0x55, 0x02, 0x56, 0x06, 0xa5, 0xad, 0xdf,
0xa3, 0xa9, 0x06, 0x05, 0x53, 0x74, 0x55, 0xd5,
0xd2, 0x20, 0x0a, 0x6d, 0x4a, 0xef, 0x16, 0xbf,
0xc3, 0xb2, 0x75, 0x93, 0xd8, 0x6e, 0x0f, 0xd2,
0xae, 0x3b, 0xc0, 0x00, 0x22, 0x6f, 0xb5, 0x0a,
0x41, 0xfc, 0xf9, 0x41, 0xfc, 0x16, 0x4f, 0xa6,
0x1c, 0x18, 0x41, 0x67, 0x73, 0xa8, 0x79, 0xa9,
0x54, 0x18, 0x4e, 0x88, 0x44, 0x0f, 0xa1, 0x5b,
0xf0, 0x68, 0xea, 0x3c, 0x62, 0x59, 0x8d, 0xc7,
0x6f, 0xd7, 0x72, 0x20, 0x74, 0x39, 0xd4, 0x3a,
0x41, 0x1b, 0x58, 0x57, 0x54, 0x85, 0x60, 0xca,
0x49, 0x4b, 0xa1, 0x04, 0x91, 0xb6, 0xf2, 0xcd,
0x62, 0x63, 0x67, 0xd1, 0xee, 0x6b, 0x9e, 0x5d,
0xd6, 0xc4, 0x58, 0x6b, 0xe1, 0xe6, 0x4a, 0xdb,
0xe8, 0xb1, 0x35, 0x03, 0x15, 0x8d, 0x34, 0x69,
0x4c, 0xd2, 0x54, 0xce, 0xe8, 0x6a, 0x69, 0x6f,
0xaa, 0xb5, 0x1f, 0x86, 0xed, 0xac, 0x4f, 0x16,
0x1e, 0x48, 0x93, 0xe8, 0x6c, 0x24, 0x1c, 0xd0,
0xbb, 0x61, 0xc2, 0x34, 0xdd, 0xc9, 0x5c, 0xce,
};
static const u8 nh_test_msg[NH_MESSAGE_BYTES] = {
0x99, 0x57, 0x61, 0x41, 0xad, 0x08, 0x7e, 0x17,
0xd4, 0xef, 0x0b, 0x23, 0xff, 0x0b, 0x96, 0x0a,
0x6c, 0x98, 0xac, 0x78, 0x5e, 0xb6, 0xb2, 0x67,
0x0f, 0x48, 0xf4, 0xa1, 0xe5, 0x1e, 0xfe, 0x83,
0xe4, 0x56, 0x2a, 0x03, 0x64, 0xff, 0x7a, 0xf3,
0x03, 0xfe, 0xa7, 0x86, 0xdc, 0x35, 0x79, 0x13,
0xf8, 0xe1, 0x59, 0x19, 0x04, 0x43, 0x24, 0x82,
0x44, 0x82, 0x41, 0x2b, 0xc7, 0xcf, 0xf5, 0xa4,
0xdc, 0xca, 0xf5, 0x34, 0xc4, 0x23, 0x3c, 0x1f,
0xa8, 0x84, 0x1f, 0x2a, 0xcd, 0xae, 0x9d, 0x5e,
0x05, 0xe2, 0xfb, 0x0c, 0x68, 0x81, 0x90, 0x11,
0x44, 0xf6, 0xdd, 0x5b, 0x51, 0xd3, 0xe0, 0xab,
0x29, 0x3a, 0xa9, 0x9c, 0xf6, 0x7e, 0x2d, 0xe3,
0x6c, 0x09, 0x59, 0xd7, 0xfa, 0x7f, 0x6a, 0x33,
0x3b, 0x23, 0x7b, 0x1b, 0xb2, 0x79, 0x5f, 0x5c,
0xb6, 0x2d, 0xb0, 0xf8, 0xab, 0x33, 0x28, 0xe0,
0x72, 0x2e, 0x2f, 0x03, 0x22, 0x16, 0xb4, 0x87,
0xf7, 0x14, 0x3f, 0x55, 0x8a, 0xb0, 0x47, 0xdb,
0x42, 0x2d, 0xc0, 0x0c, 0x0a, 0x33, 0xf8, 0xab,
0x44, 0xae, 0xa3, 0xc9, 0xfc, 0xf6, 0x34, 0x8c,
0x60, 0x30, 0x6d, 0x31, 0x70, 0xf3, 0x39, 0x53,
0xf1, 0x2d, 0xb9, 0x6c, 0xa6, 0x48, 0x9c, 0x9c,
0xc2, 0x88, 0xb3, 0xa9, 0x98, 0xb6, 0xc3, 0x47,
0x94, 0x02, 0x9d, 0x98, 0x6e, 0x25, 0x6c, 0xf5,
0x9b, 0xc6, 0x4d, 0xee, 0x07, 0x1e, 0x25, 0x8f,
0x01, 0xde, 0xad, 0xe5, 0x77, 0x4f, 0xd1, 0xc0,
0x62, 0xbb, 0x3a, 0xb9, 0x83, 0x0b, 0x29, 0x76,
0x4f, 0xb1, 0x86, 0x2c, 0x27, 0xc7, 0x38, 0x65,
0xcb, 0x78, 0xb7, 0x02, 0x10, 0x9e, 0xde, 0x83,
0xd1, 0xac, 0x05, 0x86, 0x23, 0xce, 0x4f, 0x8d,
0xcc, 0x4e, 0x3f, 0x04, 0xf4, 0x39, 0x91, 0x81,
0x1c, 0x42, 0x47, 0x4d, 0x50, 0xe5, 0x01, 0x22,
0x98, 0xcf, 0x91, 0x36, 0xb3, 0x7c, 0xcf, 0x78,
0x07, 0x22, 0xa9, 0x18, 0xd2, 0xcd, 0x7d, 0x4d,
0xa6, 0xcb, 0xaa, 0x52, 0x13, 0x49, 0x64, 0xb0,
0xa5, 0x3d, 0xc7, 0xc3, 0x10, 0x87, 0x2e, 0x76,
0xa9, 0x52, 0xc5, 0x50, 0x18, 0xc0, 0x5d, 0xb4,
0x4c, 0xc6, 0x7f, 0x64, 0xae, 0x53, 0xc3, 0x46,
0x99, 0xb7, 0x61, 0x6b, 0x08, 0x43, 0x08, 0x4c,
0x90, 0x2c, 0xee, 0x56, 0x91, 0xb4, 0x28, 0xa8,
0xa8, 0x8b, 0x3b, 0x1a, 0x67, 0x71, 0xf2, 0x81,
0x48, 0x20, 0x71, 0x30, 0xdd, 0x69, 0x8a, 0xc2,
0x4c, 0x9d, 0x4e, 0x17, 0xfb, 0x2e, 0xe7, 0x9b,
0x86, 0x94, 0xa5, 0xce, 0xf9, 0x74, 0x56, 0xff,
0x3b, 0xff, 0xd9, 0x5a, 0xc8, 0x98, 0xf5, 0x25,
0xa2, 0xb9, 0x66, 0x46, 0x89, 0x17, 0x39, 0x08,
0x69, 0x03, 0x59, 0x1e, 0x13, 0x12, 0x68, 0xe7,
0x2f, 0x00, 0xd3, 0xf3, 0x71, 0xd1, 0x20, 0xc5,
0x0b, 0x38, 0x89, 0xda, 0x62, 0x3c, 0xce, 0xea,
0x04, 0x19, 0x47, 0x6d, 0xd8, 0x64, 0x38, 0x60,
0x96, 0x71, 0x68, 0x48, 0x79, 0xf8, 0xf4, 0x76,
0x33, 0xf6, 0x60, 0x8d, 0x21, 0xd0, 0xee, 0x41,
0xc0, 0xbe, 0x33, 0x61, 0x5e, 0x66, 0xe6, 0x16,
0x14, 0xc7, 0xfb, 0x6c, 0xf3, 0x58, 0xef, 0x12,
0x7c, 0x70, 0x65, 0x5d, 0x55, 0xe8, 0xf2, 0x92,
0x3a, 0xfe, 0x34, 0x64, 0x31, 0x7c, 0x29, 0xbb,
0x01, 0x18, 0xbd, 0xb6, 0xe4, 0x1e, 0xa4, 0xf3,
0x7b, 0x4c, 0x6a, 0x0d, 0x01, 0xfc, 0xc7, 0x66,
0xc3, 0x88, 0x37, 0x25, 0xcf, 0xe9, 0xca, 0x82,
0xeb, 0xa1, 0x38, 0x40, 0xc9, 0xdb, 0x38, 0x7b,
0x78, 0xcf, 0x11, 0xa3, 0x1c, 0x6b, 0x70, 0xc8,
0xe1, 0x2f, 0x7c, 0x17, 0x2c, 0x58, 0x28, 0xa4,
0x13, 0x40, 0xc7, 0x69, 0x0f, 0x04, 0xe5, 0x8e,
0xf0, 0x67, 0x53, 0xea, 0x10, 0xf5, 0x83, 0xc9,
0xcb, 0x6b, 0x16, 0xef, 0x2e, 0x55, 0xb3, 0xdd,
0xed, 0xf9, 0x1a, 0x52, 0x9a, 0x73, 0x78, 0x14,
0x14, 0x21, 0xfc, 0xef, 0x3c, 0x40, 0xa9, 0xfe,
0xef, 0xd7, 0x6e, 0x28, 0x2f, 0xd3, 0x73, 0xed,
0xa3, 0x73, 0xb5, 0x62, 0x41, 0xe6, 0xd4, 0x79,
0x49, 0x31, 0x2b, 0x86, 0x74, 0x56, 0x21, 0xfe,
0x6d, 0xb2, 0xbe, 0x81, 0x80, 0xa6, 0x81, 0x19,
0x90, 0x79, 0x6f, 0xc4, 0x4e, 0x7d, 0x6f, 0x2f,
0xa8, 0x6f, 0xd5, 0xc4, 0x7e, 0x23, 0x3b, 0xe6,
0x9b, 0x60, 0x97, 0x7b, 0xe2, 0x08, 0x8a, 0xaa,
0xc7, 0x7c, 0xf6, 0xe5, 0x01, 0x3e, 0xd2, 0x29,
0x7d, 0xd7, 0x40, 0x84, 0x95, 0xfa, 0xdf, 0xd8,
0x81, 0xe9, 0x5e, 0xdd, 0x0d, 0x17, 0x51, 0x6b,
0x8c, 0x0e, 0x47, 0xf9, 0x0c, 0x92, 0x1b, 0x60,
0xca, 0x06, 0x8a, 0xe5, 0xe8, 0x0f, 0x06, 0x75,
0x5d, 0x76, 0xc9, 0x32, 0x2c, 0x52, 0x2c, 0x2e,
0xd8, 0x66, 0x38, 0x75, 0x16, 0xc7, 0x7d, 0x51,
0xc4, 0xc2, 0x22, 0xc8, 0x19, 0xfc, 0x3d, 0x69,
0x1e, 0xd9, 0x64, 0x47, 0x5d, 0x21, 0x84, 0x46,
0xd7, 0xe1, 0xf0, 0x95, 0x3a, 0x8f, 0xbd, 0x7a,
0x53, 0x71, 0x4c, 0x54, 0xc1, 0x3e, 0x27, 0xde,
0xeb, 0x04, 0x11, 0xb0, 0x33, 0x4d, 0x57, 0x0b,
0x6b, 0x7d, 0x6c, 0xd5, 0x87, 0x7e, 0xb4, 0xe2,
0x94, 0x9e, 0x9f, 0x74, 0xe8, 0xb7, 0xfa, 0x05,
0x9b, 0x8f, 0x81, 0x43, 0x35, 0x82, 0xb8, 0x5b,
0xa8, 0x5e, 0xfa, 0x7a, 0x80, 0x8d, 0xd2, 0x90,
0x58, 0x79, 0x89, 0x56, 0x90, 0x2b, 0xff, 0x92,
0x3c, 0x35, 0xbe, 0x99, 0x5f, 0xd2, 0x4b, 0x15,
0x58, 0x4b, 0xbf, 0x08, 0x9b, 0x9b, 0x97, 0x10,
0xa4, 0x55, 0xc7, 0xec, 0x29, 0xc5, 0x14, 0x3e,
0x8f, 0x56, 0xa3, 0x92, 0x9e, 0x33, 0xcc, 0x9e,
0x77, 0x2f, 0x33, 0xcb, 0xc4, 0xe9, 0x19, 0xf4,
0x32, 0x2b, 0xef, 0x6c, 0x1c, 0x92, 0x2c, 0x45,
0x88, 0x74, 0x5f, 0xcf, 0x56, 0xfd, 0x87, 0x5f,
0xb6, 0x9b, 0xa2, 0x51, 0xda, 0x9b, 0x83, 0x4f,
0xec, 0x14, 0xe8, 0xd2, 0x42, 0x03, 0xcb, 0xe8,
0xd0, 0xb7, 0xf8, 0x38, 0xde, 0x6f, 0xdf, 0x43,
0xfa, 0x41, 0xab, 0xec, 0x2e, 0x3c, 0x93, 0x39,
0x76, 0xd1, 0x6f, 0x5b, 0x6c, 0x6e, 0x8d, 0xeb,
0x45, 0x6b, 0xc5, 0x76, 0x00, 0x29, 0xca, 0x3b,
0xdb, 0x78, 0xc2, 0x32, 0x09, 0x39, 0x19, 0x50,
0xa2, 0x44, 0x92, 0x09, 0xdb, 0x8b, 0x9e, 0x16,
0x76, 0x7f, 0xf1, 0x78, 0x7b, 0xb2, 0x51, 0xbc,
0x28, 0xbd, 0xb0, 0x7f, 0x25, 0x63, 0x7d, 0x34,
0xfb, 0xf6, 0x36, 0x24, 0xc7, 0xf9, 0x41, 0xb6,
0x2a, 0x06, 0xfc, 0xf0, 0x83, 0xf2, 0x12, 0x3d,
0x60, 0x2e, 0x10, 0x70, 0x31, 0x6f, 0x37, 0x08,
0x3e, 0x91, 0x93, 0xb5, 0xda, 0xb8, 0x4c, 0x1b,
0xd8, 0xb8, 0x3b, 0xd5, 0x3e, 0xb6, 0xc0, 0xbb,
0x38, 0x0f, 0xd2, 0x68, 0x4f, 0x78, 0x56, 0xf6,
0xda, 0x65, 0xb4, 0x0b, 0xb4, 0xaf, 0xa8, 0x19,
0x2f, 0x70, 0x55, 0xe0, 0x47, 0x31, 0x9f, 0x37,
0x1a, 0x47, 0xb9, 0x0c, 0x97, 0x79, 0xfc, 0xa9,
0x76, 0xe6, 0xfa, 0x38, 0x67, 0x25, 0xd3, 0x89,
0x8d, 0xad, 0xc6, 0x11, 0x2d, 0x77, 0x0b, 0x35,
0xa2, 0xe2, 0xdf, 0xc8, 0x94, 0xd5, 0xdf, 0xd2,
0x69, 0x2a, 0x99, 0x93, 0xfa, 0x4a, 0x5f, 0xc7,
0x8a, 0x14, 0x5f, 0x2a, 0xf3, 0x02, 0xf0, 0x3e,
0x21, 0x8e, 0x2e, 0x4b, 0xc4, 0xd2, 0xc8, 0xa6,
0x41, 0x6e, 0x17, 0x36, 0xe9, 0xad, 0x73, 0x33,
0x6c, 0xea, 0xc2, 0x31, 0x8f, 0x30, 0x51, 0x5c,
0x1c, 0x20, 0xe6, 0x05, 0x1a, 0x17, 0x15, 0x5d,
0x3e, 0x8f, 0xd2, 0x7f, 0xa1, 0xc5, 0x47, 0xb3,
0xb2, 0x9c, 0xe8, 0xf0, 0x6d, 0xc1, 0xc3, 0xa2,
};
/* Expected nh() output for the first 16 bytes of nh_test_msg. */
static const u8 nh_test_val16[NH_HASH_BYTES] = {
	0x30, 0x77, 0x55, 0x7c, 0x45, 0xd8, 0xce, 0xf7,
	0x2a, 0xb5, 0x14, 0x8c, 0x35, 0x7e, 0xaa, 0x00,
	0x50, 0xbc, 0x50, 0x7c, 0xd3, 0x20, 0x7c, 0x9c,
	0xb4, 0xf1, 0x91, 0x26, 0x81, 0x03, 0xa5, 0x68,
};
/* Expected nh() output for the first 96 bytes of nh_test_msg. */
static const u8 nh_test_val96[NH_HASH_BYTES] = {
	0xd2, 0x19, 0xca, 0xa5, 0x6c, 0x0c, 0xdf, 0x2f,
	0x69, 0xfa, 0x75, 0xc1, 0x63, 0xdb, 0xfa, 0x4d,
	0x45, 0x2b, 0xb8, 0xdb, 0xac, 0xee, 0x61, 0xc6,
	0x7a, 0x83, 0xb6, 0x0f, 0x32, 0x82, 0xe4, 0xd0,
};
/* Expected nh() output for the first 256 bytes of nh_test_msg. */
static const u8 nh_test_val256[NH_HASH_BYTES] = {
	0x33, 0x8f, 0xb4, 0x96, 0xf1, 0xb6, 0xf1, 0xb5,
	0x05, 0x19, 0xbb, 0x6b, 0xda, 0xd9, 0x95, 0x75,
	0x96, 0x3f, 0x8b, 0x42, 0xb6, 0xcd, 0xb7, 0xb7,
	0xe7, 0x97, 0xb5, 0xa9, 0x0b, 0xd7, 0xdd, 0x33,
};
/* Expected nh() output for the full 1024-byte nh_test_msg. */
static const u8 nh_test_val1024[NH_HASH_BYTES] = {
	0x32, 0x3d, 0x51, 0xe1, 0x77, 0xb6, 0xac, 0x06,
	0x84, 0x67, 0xb7, 0xf2, 0x24, 0xe7, 0xec, 0xfd,
	0x96, 0x64, 0xff, 0x55, 0xc7, 0x1b, 0xf9, 0xdc,
	0xa3, 0xc7, 0x32, 0x06, 0x79, 0xcf, 0xca, 0xb6,
};

View File

@@ -0,0 +1,43 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2025 Google LLC
*/
#include <crypto/nh.h>
#include <kunit/test.h>
#include "nh-testvecs.h"
static void test_nh(struct kunit *test)
{
u32 *key = kunit_kmalloc(test, NH_KEY_BYTES, GFP_KERNEL);
__le64 hash[NH_NUM_PASSES];
KUNIT_ASSERT_NOT_NULL(test, key);
memcpy(key, nh_test_key, NH_KEY_BYTES);
le32_to_cpu_array(key, NH_KEY_WORDS);
nh(key, nh_test_msg, 16, hash);
KUNIT_ASSERT_MEMEQ(test, hash, nh_test_val16, NH_HASH_BYTES);
nh(key, nh_test_msg, 96, hash);
KUNIT_ASSERT_MEMEQ(test, hash, nh_test_val96, NH_HASH_BYTES);
nh(key, nh_test_msg, 256, hash);
KUNIT_ASSERT_MEMEQ(test, hash, nh_test_val256, NH_HASH_BYTES);
nh(key, nh_test_msg, 1024, hash);
KUNIT_ASSERT_MEMEQ(test, hash, nh_test_val1024, NH_HASH_BYTES);
}
/* The single NH correctness test, registered with KUnit. */
static struct kunit_case nh_test_cases[] = {
	KUNIT_CASE(test_nh),
	{},	/* sentinel */
};
static struct kunit_suite nh_test_suite = {
	.name = "nh",
	.test_cases = nh_test_cases,
};
kunit_test_suite(nh_test_suite);
MODULE_DESCRIPTION("KUnit tests for NH");
MODULE_LICENSE("GPL");

261
lib/crypto/x86/aes-aesni.S Normal file
View File

@@ -0,0 +1,261 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>
.section .rodata
#ifdef __x86_64__
#define RODATA(label) label(%rip)
#else
#define RODATA(label) label
#endif
// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
.p2align 4
.Lmask:
.byte 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
// The AES round constants, used during key expansion
.Lrcon:
.long 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
.text
// Transform four dwords [a0, a1, a2, a3] in \a into their "prefix XOR"
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3], computed log-step style with two
// shift+XOR rounds.  \tmp is a temporary xmm register and is clobbered.
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time.  We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction.  movdqa is usually "free" anyway.
.macro _prefix_sum a, tmp
	movdqa		\a, \tmp	// \tmp = [a0, a1, a2, a3]
	pslldq		$4, \a		// \a = [0, a0, a1, a2]
	pxor		\tmp, \a	// \a = [a0, a0^a1, a1^a2, a2^a3]
	movdqa		\a, \tmp	// save the partial prefix XOR
	pslldq		$8, \a		// \a = [0, 0, a0, a0^a1]
	pxor		\tmp, \a	// \a = [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
// Generate the next AES round key and store it at (RNDKEYS), also leaving it
// in \a.  \a is the round key whose prefix XOR feeds the new key; \b is the
// round key whose last dword gets rotated and substituted.  For AES-128 the
// call sites pass \a == \b.  Clobbers %xmm2 and %xmm3; expects MASK and RCON
// to be loaded.
.macro _gen_round_key a, b
	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
	// the last dword of the previous round key (given in \b).
	//
	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
	// It is used here solely for the SubBytes and the XOR.  The ShiftRows
	// is a no-op because all four columns are the same here.
	//
	// Don't use the 'aeskeygenassist' instruction, since:
	//   - On most Intel CPUs it is microcoded, making it have a much higher
	//     latency and use more execution ports than 'aesenclast'.
	//   - It cannot be used in a loop, since it requires an immediate.
	//   - It doesn't do much more than 'aesenclast' in the first place.
	movdqa		\b, %xmm2
	pshufb		MASK, %xmm2	// broadcast last dword, rotated right 8
	aesenclast	RCON, %xmm2
	// XOR in the prefix sum of the four dwords of \a, which is the
	// previous round key (AES-128) or the first round key in the previous
	// pair of round keys (AES-256).  The result is the next round key.
	_prefix_sum	\a, tmp=%xmm3
	pxor		%xmm2, \a
	// Store the next round key to memory.  Also leave it in \a.
	movdqu		\a, (RNDKEYS)
.endm
// Expand the raw AES key at (IN_KEY) into the standard round keys at
// (RNDKEYS) and, if INV_RNDKEYS is non-NULL, also into the Equivalent
// Inverse Cipher round keys at (INV_RNDKEYS).  \is_aes128 selects the
// AES-128 schedule (11 round keys) or the AES-256 schedule (15 round keys).
.macro _aes_expandkey_aesni is_aes128
#ifdef __x86_64__
	// Arguments
	.set	RNDKEYS, %rdi
	.set	INV_RNDKEYS, %rsi
	.set	IN_KEY, %rdx
	// Other local variables
	.set	RCON_PTR, %rcx
	.set	COUNTER, %eax
#else
	// Arguments, assuming -mregparm=3
	.set	RNDKEYS, %eax
	.set	INV_RNDKEYS, %edx
	.set	IN_KEY, %ecx
	// Other local variables
	.set	RCON_PTR, %ebx
	.set	COUNTER, %esi
#endif
	.set	RCON, %xmm6
	.set	MASK, %xmm7

#ifdef __i386__
	// %ebx and %esi are callee-saved on i386.
	push	%ebx
	push	%esi
#endif
.if \is_aes128
	// AES-128: the first round key is simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
.else
	// AES-256: the first two round keys are simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
	movdqu	16(IN_KEY), %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
.endif

	// Generate the remaining round keys.
	movdqa	RODATA(.Lmask), MASK
.if \is_aes128
	lea	RODATA(.Lrcon), RCON_PTR
	mov	$10, COUNTER
.Lgen_next_aes128_round_key:
	add	$16, RNDKEYS
	// Broadcast the next round constant to all four dwords of RCON.
	movd	(RCON_PTR), RCON
	pshufd	$0x00, RCON, RCON
	add	$4, RCON_PTR
	_gen_round_key	%xmm0, %xmm0
	dec	COUNTER
	jnz	.Lgen_next_aes128_round_key
.else
	// AES-256: only the first 7 round constants are needed, so instead of
	// loading each one from memory, just start by loading [1, 1, 1, 1] and
	// then generate the rest by doubling.
	pshufd	$0x00, RODATA(.Lrcon), RCON
	pxor	%xmm5, %xmm5		// All-zeroes
	mov	$7, COUNTER
.Lgen_next_aes256_round_key_pair:
	// Generate the next AES-256 round key: either the first of a pair of
	// two, or the last one.
	_gen_round_key	%xmm0, %xmm1
	dec	COUNTER
	jz	.Lgen_aes256_round_keys_done
	// Generate the second AES-256 round key of the pair.  Compared to the
	// first, there's no rotation and no XOR of a round constant.
	pshufd	$0xff, %xmm0, %xmm2	// Get four copies of last dword
	aesenclast	%xmm5, %xmm2	// Just does SubBytes
	_prefix_sum	%xmm1, tmp=%xmm3
	pxor	%xmm2, %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
	paddd	RCON, RCON		// RCON <<= 1
	jmp	.Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif

	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
	// Inverse Cipher to it.  To do that, reverse the standard round keys,
	// and apply aesimc (InvMixColumn) to each except the first and last.
	// RNDKEYS currently points at the last standard round key.
	test	INV_RNDKEYS, INV_RNDKEYS
	jz	.Ldone\@
	movdqu	(RNDKEYS), %xmm0	// Last standard round key
	movdqu	%xmm0, (INV_RNDKEYS)	// => First inverse round key
.if \is_aes128
	mov	$9, COUNTER
.else
	mov	$13, COUNTER
.endif
.Lgen_next_inv_round_key\@:
	sub	$16, RNDKEYS
	add	$16, INV_RNDKEYS
	movdqu	(RNDKEYS), %xmm0
	aesimc	%xmm0, %xmm0
	movdqu	%xmm0, (INV_RNDKEYS)
	dec	COUNTER
	jnz	.Lgen_next_inv_round_key\@
	movdqu	-16(RNDKEYS), %xmm0	// First standard round key
	movdqu	%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key
.Ldone\@:
#ifdef __i386__
	pop	%esi
	pop	%ebx
#endif
	RET
.endm
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_128]);
//
// inv_rndkeys may be NULL if decryption round keys are not needed.
SYM_FUNC_START(aes128_expandkey_aesni)
	_aes_expandkey_aesni	1
SYM_FUNC_END(aes128_expandkey_aesni)

// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_256]);
//
// inv_rndkeys may be NULL if decryption round keys are not needed.
SYM_FUNC_START(aes256_expandkey_aesni)
	_aes_expandkey_aesni	0
SYM_FUNC_END(aes256_expandkey_aesni)
// En/decrypt the 16-byte block at (IN) into (OUT), using the round keys at
// (RNDKEYS) and NROUNDS rounds.  \enc selects aesenc (1) or aesdec (0); for
// decryption the keys must be the Equivalent Inverse Cipher round keys.
.macro _aes_crypt_aesni enc
#ifdef __x86_64__
	.set	RNDKEYS, %rdi
	.set	NROUNDS, %esi
	.set	OUT, %rdx
	.set	IN, %rcx
#else
	// Assuming -mregparm=3
	.set	RNDKEYS, %eax
	.set	NROUNDS, %edx
	.set	OUT, %ecx
	.set	IN, %ebx	// Passed on stack
#endif

#ifdef __i386__
	push	%ebx
	// The fourth argument is at 8(%esp): 4 bytes for the return address
	// plus 4 for the just-pushed %ebx.
	mov	8(%esp), %ebx
#endif
	// Zero-th round: just XOR in the first round key.
	movdqu	(IN), %xmm0
	movdqu	(RNDKEYS), %xmm1
	pxor	%xmm1, %xmm0

	// Normal rounds
	add	$16, RNDKEYS
	dec	NROUNDS
.Lnext_round\@:
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenc	%xmm1, %xmm0
.else
	aesdec	%xmm1, %xmm0
.endif
	add	$16, RNDKEYS
	dec	NROUNDS
	jne	.Lnext_round\@

	// Last round (aesenclast/aesdeclast omit (Inv)MixColumns)
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenclast	%xmm1, %xmm0
.else
	aesdeclast	%xmm1, %xmm0
.endif
	movdqu	%xmm0, (OUT)
#ifdef __i386__
	pop	%ebx
#endif
	RET
.endm
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// rndkeys must be the standard round keys from *_expandkey_aesni().
SYM_FUNC_START(aes_encrypt_aesni)
	_aes_crypt_aesni	1
SYM_FUNC_END(aes_encrypt_aesni)

// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// inv_rndkeys must be the Equivalent Inverse Cipher round keys from
// *_expandkey_aesni().
SYM_FUNC_START(aes_decrypt_aesni)
	_aes_crypt_aesni	0
SYM_FUNC_END(aes_decrypt_aesni)

85
lib/crypto/x86/aes.h Normal file
View File

@@ -0,0 +1,85 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* AES block cipher using AES-NI instructions
*
* Copyright 2026 Google LLC
*/
#include <asm/fpu/api.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);
void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
const u8 in_key[AES_KEYSIZE_128]);
void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
const u8 in_key[AES_KEYSIZE_256]);
void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
/*
 * Expand an AES key, writing the standard round keys to @k->rndkeys and,
 * when @inv_k is non-NULL, the Equivalent Inverse Cipher round keys to
 * @inv_k->inv_rndkeys.  The expanded key format is identical whether the
 * AES-NI or the generic path is taken.
 *
 * The AES-NI path handles AES-128 and AES-256 when the instructions are
 * available and the FPU is currently usable; AES-192 (almost never used)
 * and all other cases fall back to the generic code.  Key expansion is
 * rarely performance-critical, but the AES-NI path is still somewhat
 * faster and has execution time independent of the key material.
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
				union aes_invkey_arch *inv_k,
				const u8 *in_key, int key_len, int nrounds)
{
	u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL;

	if (!static_branch_likely(&have_aes) || key_len == AES_KEYSIZE_192 ||
	    !irq_fpu_usable()) {
		aes_expandkey_generic(k->rndkeys, inv_rndkeys, in_key,
				      key_len);
		return;
	}

	kernel_fpu_begin();
	if (key_len == AES_KEYSIZE_128)
		aes128_expandkey_aesni(k->rndkeys, inv_rndkeys, in_key);
	else
		aes256_expandkey_aesni(k->rndkeys, inv_rndkeys, in_key);
	kernel_fpu_end();
}
/* Encrypt one block, preferring AES-NI when the FPU is currently usable. */
static void aes_encrypt_arch(const struct aes_enckey *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (!static_branch_likely(&have_aes) || !irq_fpu_usable()) {
		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
		return;
	}
	kernel_fpu_begin();
	aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in);
	kernel_fpu_end();
}
/* Decrypt one block, preferring AES-NI when the FPU is currently usable. */
static void aes_decrypt_arch(const struct aes_key *key,
			     u8 out[AES_BLOCK_SIZE],
			     const u8 in[AES_BLOCK_SIZE])
{
	if (!static_branch_likely(&have_aes) || !irq_fpu_usable()) {
		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
				    out, in);
		return;
	}
	kernel_fpu_begin();
	aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds, out, in);
	kernel_fpu_end();
}
#define aes_mod_init_arch aes_mod_init_arch
/* Enable the AES-NI fast paths if the CPU supports the AES instructions. */
static void aes_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_AES))
		static_branch_enable(&have_aes);
}

158
lib/crypto/x86/nh-avx2.S Normal file
View File

@@ -0,0 +1,158 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
#define PASS0_SUMS %ymm0
#define PASS1_SUMS %ymm1
#define PASS2_SUMS %ymm2
#define PASS3_SUMS %ymm3
#define K0 %ymm4
#define K0_XMM %xmm4
#define K1 %ymm5
#define K1_XMM %xmm5
#define K2 %ymm6
#define K2_XMM %xmm6
#define K3 %ymm7
#define K3_XMM %xmm7
#define T0 %ymm8
#define T1 %ymm9
#define T2 %ymm10
#define T2_XMM %xmm10
#define T3 %ymm11
#define T3_XMM %xmm11
#define T4 %ymm12
#define T5 %ymm13
#define T6 %ymm14
#define T7 %ymm15
#define KEY %rdi
#define MESSAGE %rsi
#define MESSAGE_LEN %rdx
#define HASH %rcx
// Process the 32 message bytes in T3 (two 16-byte NH strides at once, one per
// 128-bit lane) through all four NH passes.  \k0-\k3 hold the key words each
// pass pairs with those bytes.  Clobbers T0-T7 (including T3).
.macro _nh_2xstride k0, k1, k2, k3
	// Add message words to key words
	vpaddd		\k0, T3, T0
	vpaddd		\k1, T3, T1
	vpaddd		\k2, T3, T2
	vpaddd		\k3, T3, T3

	// Multiply 32x32 => 64 and accumulate.  The vpshufd pairs place dwords
	// (0,1) and (2,3) of each sum vector in the even dword positions that
	// vpmuludq reads, so each product is (m_i + k_i) * (m_{i+2} + k_{i+2}).
	vpshufd		$0x10, T0, T4
	vpshufd		$0x32, T0, T0
	vpshufd		$0x10, T1, T5
	vpshufd		$0x32, T1, T1
	vpshufd		$0x10, T2, T6
	vpshufd		$0x32, T2, T2
	vpshufd		$0x10, T3, T7
	vpshufd		$0x32, T3, T3
	vpmuludq	T4, T0, T0
	vpmuludq	T5, T1, T1
	vpmuludq	T6, T2, T2
	vpmuludq	T7, T3, T3
	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
.endm
/*
 * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 */
SYM_FUNC_START(nh_avx2)
	vmovdqu		0x00(KEY), K0
	vmovdqu		0x10(KEY), K1
	add		$0x20, KEY
	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS

	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done	// fewer than 4 strides in total
.Lloop4:
	// Main loop: 4 strides (64 message bytes) per iteration, two at a
	// time.  The key registers are rotated so each stride sees the right
	// 16-byte window of the key.
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3

	vmovdqu		0x20(MESSAGE), T3
	vmovdqu		0x20(KEY), K0
	vmovdqu		0x30(KEY), K1
	_nh_2xstride	K2, K3, K0, K1

	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Recover the remaining byte count (0, 0x10, 0x20, or 0x30).
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	cmp		$0x20, MESSAGE_LEN
	jl		.Llast

	// 2 or 3 strides remain; do 2 more.
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3
	add		$0x20, MESSAGE
	add		$0x20, KEY
	sub		$0x20, MESSAGE_LEN
	jz		.Ldone
	vmovdqa		K2, K0
	vmovdqa		K3, K1
.Llast:
	// Last stride.  Zero the high 128 bits of the message and keys so they
	// don't affect the result when processing them like 2 strides.  (The
	// VEX-encoded xmm moves zero-extend, clearing bits 255:128.)
	vmovdqu		(MESSAGE), T3_XMM
	vmovdqa		K0_XMM, K0_XMM
	vmovdqa		K1_XMM, K1_XMM
	vmovdqu		0x00(KEY), K2_XMM
	vmovdqu		0x10(KEY), K3_XMM
	_nh_2xstride	K0, K1, K2, K3
.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//	PASS0_SUMS is (0A 0B 0C 0D)
	//	PASS1_SUMS is (1A 1B 1C 1D)
	//	PASS2_SUMS is (2A 2B 2C 2D)
	//	PASS3_SUMS is (3A 3B 3C 3D)
	// We need the horizontal sums:
	//	(0A + 0B + 0C + 0D,
	//	 1A + 1B + 1C + 1D,
	//	 2A + 2B + 2C + 2D,
	//	 3A + 3B + 3C + 3D)
	//
	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)
	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)
	vpaddq		T5, T4, T4
	vpaddq		T1, T0, T0
	vpaddq		T4, T0, T0
	vmovdqu		T0, (HASH)
	vzeroupper	// Avoid AVX-SSE transition penalties in callers
	RET
SYM_FUNC_END(nh_avx2)

123
lib/crypto/x86/nh-sse2.S Normal file
View File

@@ -0,0 +1,123 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
#define PASS0_SUMS %xmm0
#define PASS1_SUMS %xmm1
#define PASS2_SUMS %xmm2
#define PASS3_SUMS %xmm3
#define K0 %xmm4
#define K1 %xmm5
#define K2 %xmm6
#define K3 %xmm7
#define T0 %xmm8
#define T1 %xmm9
#define T2 %xmm10
#define T3 %xmm11
#define T4 %xmm12
#define T5 %xmm13
#define T6 %xmm14
#define T7 %xmm15
#define KEY %rdi
#define MESSAGE %rsi
#define MESSAGE_LEN %rdx
#define HASH %rcx
// Process one 16-byte message stride at \offset(MESSAGE) through all four NH
// passes, loading the next key stride into \k3 and accumulating into the
// PASSn_SUMS registers.  Clobbers T1-T7 and \k0 (whose old key value is no
// longer needed by the time this runs — see the rotation at the call sites).
.macro _nh_stride k0, k1, k2, k3, offset
	// Load next message stride
	movdqu		\offset(MESSAGE), T1
	// Load next key stride
	movdqu		\offset(KEY), \k3
	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0		// reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3
	// Multiply 32x32 => 64 and accumulate.  The pshufd pairs place dwords
	// (0,1) and (2,3) of each sum in the even positions read by pmuludq,
	// so each product is (m_i + k_i) * (m_{i+2} + k_{i+2}).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm
/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 */
SYM_FUNC_START(nh_sse2)
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done	// fewer than 4 strides in total
.Lloop4:
	// Main loop: 4 strides (64 message bytes) per iteration.  The key
	// registers are rotated so each stride sees the right key window.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Recover the remaining byte count (0, 0x10, 0x20, or 0x30).
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone

	// 1 to 3 strides remain.
	_nh_stride	K0, K1, K2, K3, 0x00
	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10
	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20
.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)

45
lib/crypto/x86/nh.h Normal file
View File

@@ -0,0 +1,45 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* x86_64 accelerated implementation of NH
*
* Copyright 2018 Google LLC
*/
#include <asm/fpu/api.h>
#include <linux/static_call.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sse2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_avx2);
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES]);
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES]);
/*
 * Hash @message (@message_len bytes, a multiple of 16) with NH using the
 * vectorized x86_64 code, preferring AVX2 over SSE2.  Returns false when the
 * caller must use the portable fallback instead: message shorter than 64
 * bytes, no SSE2, or the FPU not currently usable.
 */
static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
		    __le64 hash[NH_NUM_PASSES])
{
	if (message_len < 64 || !static_branch_likely(&have_sse2) ||
	    !irq_fpu_usable())
		return false;

	kernel_fpu_begin();
	if (static_branch_likely(&have_avx2))
		nh_avx2(key, message, message_len, hash);
	else
		nh_sse2(key, message, message_len, hash);
	kernel_fpu_end();
	return true;
}
#define nh_mod_init_arch nh_mod_init_arch
/*
 * Select NH implementations at init time: SSE2 is the baseline; AVX2
 * additionally requires OS support for the SSE and YMM xstate components.
 */
static void nh_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM2)) {
		static_branch_enable(&have_sse2);
		if (boot_cpu_has(X86_FEATURE_AVX2) &&
		    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
				      NULL))
			static_branch_enable(&have_avx2);
	}
}