-rwxr-xr-x | configure.py                                 |  53
-rw-r--r-- | doc/side_channels.rst                        |  11
-rw-r--r-- | src/build-data/cc/clang.txt                  |   2
-rw-r--r-- | src/build-data/cc/gcc.txt                    |   2
-rw-r--r-- | src/build-data/policy/bsi.txt                |   2
-rw-r--r-- | src/build-data/policy/modern.txt             |   2
-rw-r--r-- | src/build-data/policy/nist.txt               |   2
-rw-r--r-- | src/lib/block/aes/aes.cpp                    | 158
-rw-r--r-- | src/lib/block/aes/aes.h                      |  24
-rw-r--r-- | src/lib/block/aes/aes_ni/info.txt            |   2
-rw-r--r-- | src/lib/block/aes/aes_ssse3/aes_ssse3.cpp    | 517
-rw-r--r-- | src/lib/block/aes/aes_ssse3/info.txt         |  18
-rw-r--r-- | src/lib/block/aes/aes_vperm/aes_vperm.cpp    | 634
-rw-r--r-- | src/lib/block/aes/aes_vperm/info.txt         |  30
-rw-r--r-- | src/lib/block/shacal2/shacal2_x86/info.txt   |   1
-rw-r--r-- | src/lib/hash/sha1/sha1_x86/info.txt          |   1
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_x86/info.txt    |   1
-rw-r--r-- | src/lib/modes/aead/gcm/clmul/info.txt        |   2
-rw-r--r-- | src/lib/modes/aead/gcm/clmul_ssse3/info.txt  |   1
-rw-r--r-- | src/lib/utils/cpuid/cpuid.h                  |  15
-rw-r--r-- | src/lib/utils/simd/simd_32.h                 |  42
21 files changed, 885 insertions, 635 deletions
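Overview (not part of the patch): the configure.py changes below teach module <isa> blocks to scope an ISA extension to one architecture via compound "arch:isa" entries, and the new ModuleInfo.isas_needed() filters a module's list down to what applies to the target arch. A minimal Python sketch of that filtering, assuming the <isa> list of the new aes_vperm module as input; the helper below is illustrative only and mirrors, not reproduces, the patched code:

    def isas_needed(module_isas, arch):
        # Keep plain entries (apply everywhere); keep arch-scoped entries
        # only when the prefix matches, with the prefix stripped.
        isas = []
        for isa in module_isas:
            if ':' not in isa:
                isas.append(isa)
            elif isa.startswith(arch + ':'):
                isas.append(isa.split(':', 1)[1])
        return isas

    vperm_isas = ['x86_32:sse2', 'x86_64:sse2', 'x86_32:ssse3',
                  'x86_64:ssse3', 'arm32:neon', 'arm64:neon']
    print(isas_needed(vperm_isas, 'x86_64'))  # ['sse2', 'ssse3']
    print(isas_needed(vperm_isas, 'arm64'))   # ['neon']

This is why a single aes_vperm module can replace aes_ssse3 and still get -mssse3 on x86 builds and plain NEON codegen on ARM builds.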
diff --git a/configure.py b/configure.py index 8fe2a1aaa..21459edc8 100755 --- a/configure.py +++ b/configure.py @@ -901,8 +901,17 @@ class ModuleInfo(InfoObject): if supp_arch not in arch_info: raise InternalError('Module %s mentions unknown arch %s' % (self.infofile, supp_arch)) + def known_isa(isa): + if isa in all_isa_extn: + return True + + compound_isa = isa.split(':') + if len(compound_isa) == 2 and compound_isa[0] in arch_info and compound_isa[1] in all_isa_extn: + return True + return False + for isa in self.isa: - if isa not in all_isa_extn: + if not known_isa(isa): raise InternalError('Module %s uses unknown ISA extension %s' % (self.infofile, isa)) def sources(self): @@ -917,6 +926,17 @@ class ModuleInfo(InfoObject): def external_headers(self): return self.header_external + def isas_needed(self, arch): + isas = [] + + for isa in self.isa: + if isa.find(':') == -1: + isas.append(isa) + elif isa.startswith(arch + ':'): + isas.append(isa[len(arch)+1:]) + + return isas + def defines(self): return [(key + ' ' + value) for key, value in self._defines.items()] @@ -925,6 +945,12 @@ class ModuleInfo(InfoObject): cpu_name = options.cpu for isa in self.isa: + if isa.find(':') > 0: + (arch, isa) = isa.split(':') + + if arch != arch_name: + continue + if isa in options.disable_intrinsics: return False # explicitly disabled @@ -1182,11 +1208,19 @@ class CompilerInfo(InfoObject): # pylint: disable=too-many-instance-attributes raise InternalError("Compiler %s has so_link_command for unknown OS %s" % (self.infofile, os_name)) def isa_flags_for(self, isa, arch): + if isa.find(':') > 0: + (isa_arch, isa) = isa.split(':') + if isa_arch != arch: + return '' + if isa in self.isa_flags: + return self.isa_flags[isa] + if isa in self.isa_flags: return self.isa_flags[isa] arch_isa = '%s:%s' % (arch, isa) if arch_isa in self.isa_flags: return self.isa_flags[arch_isa] + return None def get_isa_specific_flags(self, isas, arch, options): @@ -1734,7 +1768,7 @@ def generate_build_info(build_paths, modules, cc, arch, osinfo, options): if src in module_that_owns: module = module_that_owns[src] - isas = module.isa + isas = module.isas_needed(arch.basename) if 'simd' in module.dependencies(osinfo): isas.append('simd') @@ -2190,10 +2224,6 @@ class ModulesChooser(object): sorted_modules_to_load = sorted(modules_to_load) for modname in sorted_modules_to_load: - if modname.startswith('simd_') and modname != 'simd_engine': - logging.info('Using SIMD module ' + modname) - - for modname in sorted_modules_to_load: if all_modules[modname].comment: logging.info('%s: %s' % (modname, all_modules[modname].comment)) if all_modules[modname].warning: @@ -2615,8 +2645,9 @@ class AmalgamationGenerator(object): def _target_for_module(self, mod): target = '' if not self._options.single_amalgamation_file: - if mod.isa != []: - target = '_'.join(sorted(mod.isa)) + isas = mod.isas_needed(self._options.arch) + if isas != []: + target = '_'.join(sorted(isas)) if target == 'sse2' and self._options.arch == 'x86_64': target = '' # SSE2 is always available on x86-64 @@ -2629,9 +2660,9 @@ class AmalgamationGenerator(object): # Only first module for target is considered. Does this make sense? 
if self._target_for_module(mod) == target: out = set() - for isa in mod.isa: + for isa in mod.isas_needed(self._options.arch): if isa == 'aesni': - isa = "aes,ssse3,pclmul" + isa = "aes,pclmul" elif isa == 'rdrand': isa = 'rdrnd' out.add(isa) @@ -3331,7 +3362,7 @@ def main(argv): cc_arch = check_compiler_arch(options, cc, info_arch, source_paths) if cc_arch is not None and cc_arch != options.arch: - logging.warning("Configured target is %s but compiler probe indicates %s", options.arch, cc_arch) + logging.error("Configured target is %s but compiler probe indicates %s", options.arch, cc_arch) else: cc_min_version = options.cc_min_version or "0.0" diff --git a/doc/side_channels.rst b/doc/side_channels.rst index f18625911..5fe660171 100644 --- a/doc/side_channels.rst +++ b/doc/side_channels.rst @@ -244,12 +244,11 @@ Some x86, ARMv8 and POWER processors support AES instructions which are fast and are thought to be side channel silent. These instructions are used when available. -On x86 processors without AES-NI but with SSSE3 (which includes older Intel -Atoms and Core2 Duos, and even now some embedded or low power x86 chips), a -version of AES using pshufb is used which is both fast and side channel silent. -It is based on code by Mike Hamburg [VectorAes], see aes_ssse3.cpp. This same -technique could be applied with NEON or AltiVec, and the paper suggests some -optimizations for the AltiVec shuffle. +On CPUs which do not have hardware AES instructions but do support SIMD vectors +with a byte shuffle (including x86's SSSE3 and ARM's NEON), a version of AES is +implemented which is side channel silent. This version is based on code by Mike +Hamburg [VectorAes], see aes_vperm.cpp. This same technique could be applied +with AltiVec, and the paper suggests some optimizations for the AltiVec shuffle. On all other processors, a table lookup version (T-tables) is used. 
This approach is relatively fast, but known to be very vulnerable to side diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt index 6614d5a63..0312055aa 100644 --- a/src/build-data/cc/clang.txt +++ b/src/build-data/cc/clang.txt @@ -52,7 +52,7 @@ sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" bmi2 -> "-mbmi -mbmi2" -aesni -> "-maes -mpclmul -mssse3" +aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" sha -> "-msha" diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index cc3ce99e1..7393358a6 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -56,7 +56,7 @@ sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" bmi2 -> "-mbmi -mbmi2" -aesni -> "-maes -mpclmul -mssse3" +aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" sha -> "-msha" diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt index 04a29b817..d89f4433d 100644 --- a/src/build-data/policy/bsi.txt +++ b/src/build-data/policy/bsi.txt @@ -51,7 +51,7 @@ hmac_drbg <if_available> # block aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt index b417a2342..2a9c12613 100644 --- a/src/build-data/policy/modern.txt +++ b/src/build-data/policy/modern.txt @@ -59,7 +59,7 @@ locking_allocator http_util # needed by x509 for OCSP online checks aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 serpent_simd diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt index 573c1c721..2fdf60ea6 100644 --- a/src/build-data/policy/nist.txt +++ b/src/build-data/policy/nist.txt @@ -48,7 +48,7 @@ rfc3394 <if_available> # block aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 568dfb1b3..2813a5f5a 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -456,13 +456,6 @@ const char* aes_provider() } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return "ssse3"; - } -#endif - #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_ppc_crypto()) { @@ -477,6 +470,13 @@ const char* aes_provider() } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return "vperm"; + } +#endif + return "base"; } @@ -501,13 +501,6 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -522,6 +515,13 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -536,13 +536,6 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -557,6 +550,13 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -569,10 +569,24 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) } #endif -#if 
defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -598,13 +612,6 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -619,6 +626,13 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -633,13 +647,6 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -654,6 +661,13 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -666,10 +680,24 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -695,13 +723,6 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -716,6 +737,13 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -730,13 +758,6 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -751,6 +772,13 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -763,10 +791,24 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + 
if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) { - return ssse3_key_schedule(key, length); + return vperm_key_schedule(key, length); } #endif diff --git a/src/lib/block/aes/aes.h b/src/lib/block/aes/aes.h index 294cdcad3..6083467b6 100644 --- a/src/lib/block/aes/aes.h +++ b/src/lib/block/aes/aes.h @@ -31,10 +31,10 @@ class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, private: void key_schedule(const uint8_t key[], size_t length) override; -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -74,10 +74,10 @@ class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -120,10 +120,10 @@ class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) diff --git a/src/lib/block/aes/aes_ni/info.txt b/src/lib/block/aes/aes_ni/info.txt index 7fff19923..2e9749fb8 100644 --- a/src/lib/block/aes/aes_ni/info.txt +++ b/src/lib/block/aes/aes_ni/info.txt @@ -3,5 +3,7 @@ AES_NI -> 20131128 </defines> <isa> +sse2 +ssse3 aesni </isa> diff --git a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp deleted file mode 100644 index fa8bf4faa..000000000 --- a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp +++ /dev/null @@ -1,517 +0,0 @@ -/* -* AES using SSSE3 -* (C) 2010,2016,2019 Jack Lloyd -* -* This is more or less a direct translation of public domain x86-64 -* assembly written by Mike Hamburg, described in "Accelerating AES -* with Vector Permute Instructions" (CHES 2009). 
His original code is -* available at https://crypto.stanford.edu/vpaes/ -* -* Botan is released under the Simplified BSD License (see license.txt) -*/ - -#include <botan/aes.h> -#include <botan/internal/ct_utils.h> -#include <botan/internal/simd_32.h> - -#if defined(BOTAN_SIMD_USE_SSE2) - #include <tmmintrin.h> -#elif defined(BOTAN_SIMD_USE_NEON) - #include <arm_neon.h> -#endif - -namespace Botan { - -namespace { - -inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw())); -#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64) - - const int8x16_t tbl = vreinterpretq_s8_m128i(a.raw()); - const uint8x16_t idx = vreinterpretq_u8_m128i(b.raw()); - - // fixme use vdupq_n_s8 - const uint8_t alignas(16) mask[16] = { - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F - }; - - const uint8x16_t idx_masked = - vandq_u8(idx, vld1q_u8(mask)); // avoid using meaningless bits - - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); -#else - #error "No shuffle implementation available" -#endif - } - -template<size_t I1, size_t I2, size_t I3, size_t I4> -inline SIMD_4x32 shuffle32(SIMD_4x32 x) - { - return SIMD_4x32(_mm_shuffle_epi32(x.raw(), _MM_SHUFFLE(I1, I2, I3, I4))); - } - -template<size_t I> -inline SIMD_4x32 slli(SIMD_4x32 x) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I)); -#else - #error "No ssli implementation available" -#endif - } - -inline SIMD_4x32 zero_top_half(SIMD_4x32 x) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_slli_si128(_mm_srli_si128(x.raw(), 8), 8)); -#else - #error "No zero_top_half implementation available" -#endif - } - -template<int C> -inline SIMD_4x32 alignr(SIMD_4x32 a, SIMD_4x32 b) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), C)); -#else - #error "No alignr implementation available" -#endif - } - -const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090); -const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC); - -const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309); -const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C); - -const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E); -const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1); - -const SIMD_4x32 mc_forward[4] = { - SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), - SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), - SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), - SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09) -}; - -const SIMD_4x32 sr[4] = { - SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C), - SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C), - SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C), - SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C), -}; - -const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F); -const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); - -inline SIMD_4x32 low_nibs(SIMD_4x32 x) - { - return lo_nibs_mask & x; - } - -inline SIMD_4x32 high_nibs(SIMD_4x32 x) - { - return (hi_nibs_mask & x).shr<4>(); - } - -SIMD_4x32 aes_vperm_encrypt(SIMD_4x32 B, const uint32_t* keys, size_t rounds) - { - const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955); - const SIMD_4x32 
sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8); - - const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A); - const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1); - - const SIMD_4x32 mc_backward[4] = { - SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F), - SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B), - SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407), - SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003), - }; - - B = shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]); - - for(size_t r = 1; ; ++r) - { - const SIMD_4x32 K(&keys[4*r]); - - SIMD_4x32 t = high_nibs(B); - B = low_nibs(B); - - SIMD_4x32 t2 = shuffle(k_inv2, B); - - B ^= t; - - SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B); - - SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3); - SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - if(r == rounds) - { - return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]); - } - - SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K; - - SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]); - - B = shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8; - } - } - -SIMD_4x32 aes_vperm_decrypt(SIMD_4x32 B, const uint32_t keys[], size_t rounds) - { - const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E); - const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772); - - const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50); - const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E); - - const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004); - const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B); - - const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13); - const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D); - - const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6); - const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E); - - const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9); - const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159); - - SIMD_4x32 mc(mc_forward[3]); - - B = shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]); - - for(size_t r = 1; ; ++r) - { - const SIMD_4x32 K(&keys[4*r]); - - SIMD_4x32 t = high_nibs(B); - B = low_nibs(B); - - SIMD_4x32 t2 = shuffle(k_inv2, B); - - B ^= t; - - const SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - const SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B); - const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3); - const SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - if(r == rounds) - { - const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K; - const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16; - return shuffle(x, sr[which_sr]); - } - - const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K; - const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6); - const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6); - - B = shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6); - - mc = alignr<12>(mc, mc); - } - } - -void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, - const uint32_t keys[], size_t rounds) - { - 
CT::poison(in, blocks * 16); - - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) - { - SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? - B = aes_vperm_encrypt(B, keys, rounds); - B.store_le(out + i*16); - } - - CT::unpoison(in, blocks * 16); - CT::unpoison(out, blocks * 16); - } - -void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, - const uint32_t keys[], size_t rounds) - { - CT::poison(in, blocks * 16); - - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) - { - SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? - B = aes_vperm_decrypt(B, keys, rounds); - B.store_le(out + i*16); - } - - CT::unpoison(in, blocks * 16); - CT::unpoison(out, blocks * 16); - } - -} - -void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 10); - } - -void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 10); - } - -void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 12); - } - -void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 12); - } - -void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 14); - } - -void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 14); - } - -namespace { - -SIMD_4x32 aes_schedule_transform(SIMD_4x32 input, - SIMD_4x32 table_1, - SIMD_4x32 table_2) - { - return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); - } - -SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); - const SIMD_4x32 srx(sr[round_no % 4]); - - SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0); - SIMD_4x32 t2 = t; - t = shuffle(t, mc_forward0); - t2 = t ^ t2 ^ shuffle(t, mc_forward0); - return shuffle(t2, srx); - } - -SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); - - const SIMD_4x32 dsk[8] = { - SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334), - SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC), - SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A), - SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C), - SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9), - SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D), - SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3), - SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4), - }; - - SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]); - SIMD_4x32 output = shuffle(t, mc_forward0); - - t = aes_schedule_transform(t, dsk[2], dsk[3]); - output = shuffle(t ^ output, mc_forward0); - - t = aes_schedule_transform(t, dsk[4], dsk[5]); - output = shuffle(t ^ output, mc_forward0); - - t = aes_schedule_transform(t, dsk[6], dsk[7]); - output = shuffle(t ^ output, mc_forward0); - - return shuffle(output, SIMD_4x32(sr[round_no % 4])); - } - -SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); - const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 
0xE10D5DB1); - - k = shuffle(k, SIMD_4x32(sr[round_no % 4])); - k ^= SIMD_4x32::splat_u8(0x5B); - return aes_schedule_transform(k, out_tr1, out_tr2); - } - -SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k) - { - const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); - const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); - - k ^= SIMD_4x32::splat_u8(0x5B); - return aes_schedule_transform(k, deskew1, deskew2); - } - -SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) - { - SIMD_4x32 smeared = input2 ^ slli<1>(input2); - smeared ^= slli<2>(smeared); - smeared ^= SIMD_4x32::splat_u8(0x5B); - - SIMD_4x32 t = high_nibs(input1); - input1 = low_nibs(input1); - - SIMD_4x32 t2 = shuffle(k_inv2, input1); - - input1 ^= t; - - SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1); - - SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3); - SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6); - } - -SIMD_4x32 aes_schedule_round(SIMD_4x32& rcon, SIMD_4x32 input1, SIMD_4x32 input2) - { - input2 ^= alignr<15>(SIMD_4x32(), rcon); - rcon = alignr<15>(rcon, rcon); - input1 = shuffle32<3,3,3,3>(input1); - input1 = alignr<1>(input1, input1); - - return aes_schedule_round(input1, input2); - } - -SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) - { - return y ^ shuffle32<3,3,3,2>(x) ^ shuffle32<2,0,0,0>(y); - } - -} - -void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(11*4); - m_DK.resize(11*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key = SIMD_4x32::load_le(keyb); - - shuffle(key, sr[2]).store_le(&m_DK[4*10]); - - key = aes_schedule_transform(key, k_ipt1, k_ipt2); - key.store_le(&m_EK[0]); - - for(size_t i = 1; i != 10; ++i) - { - key = aes_schedule_round(rcon, key, key); - - aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]); - - aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]); - } - - key = aes_schedule_round(rcon, key, key); - aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]); - aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]); - } - -void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(13*4); - m_DK.resize(13*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); - SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8); - - shuffle(key1, sr[0]).store_le(&m_DK[12*4]); - - key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); - key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); - - key1.store_le(&m_EK[0]); - - for(size_t i = 0; i != 4; ++i) - { - // key2 with 8 high bytes masked off - SIMD_4x32 t = zero_top_half(key2); - key2 = aes_schedule_round(rcon, key2, key1); - - // fixme cse - aes_schedule_mangle(alignr<8>(key2, t), (i+3)%4).store_le(&m_EK[4*(3*i+1)]); - aes_schedule_mangle_dec(alignr<8>(key2, t), (i+3)%4).store_le(&m_DK[4*(11-3*i)]); - - t = aes_schedule_192_smear(key2, t); - - aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]); - aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]); - - key2 = aes_schedule_round(rcon, t, key2); - - if(i == 3) - { - aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); - aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]); - } - else - { - aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); - aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]); - } - - key1 = key2; - key2 = 
aes_schedule_192_smear(key2, zero_top_half(t)); - } - } - -void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(15*4); - m_DK.resize(15*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); - SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16); - - shuffle(key1, sr[2]).store_le(&m_DK[4*14]); - - key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); - key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); - - key1.store_le(&m_EK[0]); - aes_schedule_mangle(key2, 3).store_le(&m_EK[4]); - - aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]); - - for(size_t i = 2; i != 14; i += 2) - { - const SIMD_4x32 k_t = key2; - key1 = key2 = aes_schedule_round(rcon, key2, key1); - - aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]); - aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]); - - key2 = aes_schedule_round(shuffle32<3,3,3,3>(key2), k_t); - - aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]); - aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]); - } - - key2 = aes_schedule_round(rcon, key2, key1); - - aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]); - aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]); - } - -} diff --git a/src/lib/block/aes/aes_ssse3/info.txt b/src/lib/block/aes/aes_ssse3/info.txt deleted file mode 100644 index 49d9a9214..000000000 --- a/src/lib/block/aes/aes_ssse3/info.txt +++ /dev/null @@ -1,18 +0,0 @@ -<defines> -AES_SSSE3 -> 20131128 -</defines> - -<isa> -ssse3 -</isa> - -<requires> -simd -</requires> - -<cc> -gcc -clang -msvc:19.10 # VC 2017 -sunstudio -</cc> diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp new file mode 100644 index 000000000..b7e82876c --- /dev/null +++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp @@ -0,0 +1,634 @@ +/* +* AES using vector permutes (SSSE3, NEON) +* (C) 2010,2016,2019 Jack Lloyd +* +* Based on public domain x86-64 assembly written by Mike Hamburg, +* described in "Accelerating AES with Vector Permute Instructions" +* (CHES 2009). 
His original code is available at +* https://crypto.stanford.edu/vpaes/ +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/internal/ct_utils.h> +#include <botan/internal/simd_32.h> + +#if defined(BOTAN_SIMD_USE_SSE2) + #include <tmmintrin.h> +#endif + +namespace Botan { + +namespace { + +inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw())); +#elif defined(BOTAN_SIMD_USE_NEON) + const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw()); + const uint8x16_t idx = vreinterpretq_u8_u32(b.raw()); + +#if defined(BOTAN_TARGET_ARCH_IS_ARM32) + const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) }; + + return SIMD_4x32(vreinterpretq_u32_u8( + vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)), + vtbl2_u8(tbl2, vget_high_u8(idx))))); + +#else + return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx))); +#endif + +#else + #error "No shuffle implementation available" +#endif + } + +template<size_t I> +inline SIMD_4x32 shift_elems_left(SIMD_4x32 x) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I)); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I))); +#else + #error "No shift_elems_left implementation available" +#endif + } + +inline SIMD_4x32 alignr8(SIMD_4x32 a, SIMD_4x32 b) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8)); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), 8))); +#else + #error "No alignr8 implementation available" +#endif + } + +const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090); +const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC); + +const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309); +const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C); + +const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E); +const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1); + +const SIMD_4x32 mc_forward[4] = { + SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), + SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), + SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), + SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09) +}; + +const SIMD_4x32 sr[4] = { + SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C), + SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C), + SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C), + SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C), +}; + +const SIMD_4x32 rcon[10] = { + SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000), +}; + +const SIMD_4x32 lo_nibs_mask = 
SIMD_4x32::splat_u8(0x0F); +const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); +const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B); + +inline SIMD_4x32 low_nibs(SIMD_4x32 x) + { + return lo_nibs_mask & x; + } + +inline SIMD_4x32 high_nibs(SIMD_4x32 x) + { + return (hi_nibs_mask & x).shr<4>(); + } + +inline SIMD_4x32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) + { + return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K; + } + +inline SIMD_4x32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955); + const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8); + + const SIMD_4x32 mc_backward[4] = { + SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F), + SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B), + SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407), + SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003), + }; + + const SIMD_4x32 Bh = high_nibs(B); + SIMD_4x32 Bl = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, Bl); + Bl ^= Bh; + + const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl)); + + const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K; + const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]); + + return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8; + } + +inline SIMD_4x32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A); + const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1); + + const SIMD_4x32 Bh = high_nibs(B); + SIMD_4x32 Bl = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, Bl); + Bl ^= Bh; + + const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl)); + + return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]); + } + +inline SIMD_4x32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) + { + const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E); + const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772); + + return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K; + } + +inline SIMD_4x32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50); + const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E); + + const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004); + const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B); + + const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13); + const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D); + + const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6); + const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E); + + const SIMD_4x32 mcx[4] = { + SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09), + SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), + SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), + SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), + }; + + const SIMD_4x32 Bh = high_nibs(B); + B = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, 
B); + + B ^= Bh; + + const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B)); + + const SIMD_4x32 mc = mcx[(r-1)%4]; + + const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K; + const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6); + const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6); + return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6); + } + +inline SIMD_4x32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9); + const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159); + + const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16; + + const SIMD_4x32 Bh = high_nibs(B); + B = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, B); + + B ^= Bh; + + const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B)); + + const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K; + return shuffle(x, sr[which_sr]); + } + +void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, + const SIMD_4x32 K[], size_t rounds) + { + CT::poison(in, blocks * 16); + + const size_t blocks2 = blocks - (blocks % 2); + + for(size_t i = 0; i != blocks2; i += 2) + { + SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16); + SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16); + + B0 = aes_enc_first_round(B0, K[0]); + B1 = aes_enc_first_round(B1, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B0 = aes_enc_round(B0, K[r], r); + B1 = aes_enc_round(B1, K[r], r); + } + + B0 = aes_enc_last_round(B0, K[rounds], rounds); + B1 = aes_enc_last_round(B1, K[rounds], rounds); + + B0.store_le(out + i*16); + B1.store_le(out + (i+1)*16); + } + + for(size_t i = blocks2; i < blocks; ++i) + { + SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? + + B = aes_enc_first_round(B, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B = aes_enc_round(B, K[r], r); + } + + B = aes_enc_last_round(B, K[rounds], rounds); + B.store_le(out + i*16); + } + + CT::unpoison(in, blocks * 16); + CT::unpoison(out, blocks * 16); + } + +void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, + const SIMD_4x32 K[], size_t rounds) + { + CT::poison(in, blocks * 16); + + const size_t blocks2 = blocks - (blocks % 2); + + for(size_t i = 0; i != blocks2; i += 2) + { + SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16); + SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16); + + B0 = aes_dec_first_round(B0, K[0]); + B1 = aes_dec_first_round(B1, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B0 = aes_dec_round(B0, K[r], r); + B1 = aes_dec_round(B1, K[r], r); + } + + B0 = aes_dec_last_round(B0, K[rounds], rounds); + B1 = aes_dec_last_round(B1, K[rounds], rounds); + + B0.store_le(out + i*16); + B1.store_le(out + (i+1)*16); + } + + for(size_t i = blocks2; i < blocks; ++i) + { + SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? 
+ + B = aes_dec_first_round(B, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B = aes_dec_round(B, K[r], r); + } + + B = aes_dec_last_round(B, K[rounds], rounds); + B.store_le(out + i*16); + } + + CT::unpoison(in, blocks * 16); + CT::unpoison(out, blocks * 16); + } + +} + +void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[11] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 10); + } + +void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[11] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 10); + } + +void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[13] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), + SIMD_4x32(&m_EK[4*12]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 12); + } + +void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[13] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), + SIMD_4x32(&m_DK[4*12]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 12); + } + +void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[15] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), + SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 14); + } + +void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[15] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), + SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 14); + } + +namespace { + +SIMD_4x32 aes_schedule_transform(SIMD_4x32 input, + SIMD_4x32 table_1, + SIMD_4x32 table_2) + { + return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); + } + +SIMD_4x32 
aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); + const SIMD_4x32 srx(sr[round_no % 4]); + + SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0); + SIMD_4x32 t2 = t; + t = shuffle(t, mc_forward0); + t2 = t ^ t2 ^ shuffle(t, mc_forward0); + return shuffle(t2, srx); + } + +SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); + + const SIMD_4x32 dsk[8] = { + SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334), + SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC), + SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A), + SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C), + SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9), + SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D), + SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3), + SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4), + }; + + SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]); + SIMD_4x32 output = shuffle(t, mc_forward0); + + t = aes_schedule_transform(t, dsk[2], dsk[3]); + output = shuffle(t ^ output, mc_forward0); + + t = aes_schedule_transform(t, dsk[4], dsk[5]); + output = shuffle(t ^ output, mc_forward0); + + t = aes_schedule_transform(t, dsk[6], dsk[7]); + output = shuffle(t ^ output, mc_forward0); + + return shuffle(output, sr[round_no % 4]); + } + +SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); + const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1); + + k = shuffle(k, sr[round_no % 4]); + k ^= xor_5B; + return aes_schedule_transform(k, out_tr1, out_tr2); + } + +SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k) + { + const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); + const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); + + k ^= xor_5B; + return aes_schedule_transform(k, deskew1, deskew2); + } + +SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) + { + SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2); + smeared ^= shift_elems_left<2>(smeared); + smeared ^= xor_5B; + + SIMD_4x32 t = high_nibs(input1); + input1 = low_nibs(input1); + + SIMD_4x32 t2 = shuffle(k_inv2, input1); + + input1 ^= t; + + SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); + SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1); + + SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3); + SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); + + return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6); + } + +SIMD_4x32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2) + { + // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3))); + const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D); + return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc); + } + +SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) + { + const SIMD_4x32 shuffle3332 = + SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C); + const SIMD_4x32 shuffle2000 = + SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908); + + const SIMD_4x32 zero_top_half(0, 0, ~0, ~0); + y &= zero_top_half; + return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000); + } + +} + +void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(11*4); + m_DK.resize(11*4); + + SIMD_4x32 key = SIMD_4x32::load_le(keyb); + + shuffle(key, 
sr[2]).store_le(&m_DK[4*10]); + + key = aes_schedule_transform(key, k_ipt1, k_ipt2); + key.store_le(&m_EK[0]); + + for(size_t i = 1; i != 10; ++i) + { + key = aes_schedule_round(rcon[i-1], key, key); + + aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]); + + aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]); + } + + key = aes_schedule_round(rcon[9], key, key); + aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]); + aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]); + } + +void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(13*4); + m_DK.resize(13*4); + + SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); + SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8); + + shuffle(key1, sr[0]).store_le(&m_DK[12*4]); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + key1.store_le(&m_EK[0]); + + for(size_t i = 0; i != 4; ++i) + { + // key2 with 8 high bytes masked off + SIMD_4x32 t = key2; + key2 = aes_schedule_round(rcon[2*i], key2, key1); + + const SIMD_4x32 key2t = alignr8(key2, t); + aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]); + aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]); + + t = aes_schedule_192_smear(key2, t); + + aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]); + aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]); + + key2 = aes_schedule_round(rcon[2*i+1], t, key2); + + if(i == 3) + { + aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); + aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]); + } + else + { + aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); + aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]); + } + + key1 = key2; + key2 = aes_schedule_192_smear(key2, t); + } + } + +void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(15*4); + m_DK.resize(15*4); + + SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); + SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16); + + shuffle(key1, sr[2]).store_le(&m_DK[4*14]); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + key1.store_le(&m_EK[0]); + aes_schedule_mangle(key2, 3).store_le(&m_EK[4]); + + aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]); + + const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C); + + for(size_t i = 2; i != 14; i += 2) + { + const SIMD_4x32 k_t = key2; + key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1); + + aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]); + aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]); + + key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t); + + aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]); + aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]); + } + + key2 = aes_schedule_round(rcon[6], key2, key1); + + aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]); + aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]); + } + +} diff --git a/src/lib/block/aes/aes_vperm/info.txt b/src/lib/block/aes/aes_vperm/info.txt new file mode 100644 index 000000000..f771ca2c3 --- /dev/null +++ b/src/lib/block/aes/aes_vperm/info.txt @@ -0,0 +1,30 @@ +<defines> +AES_VPERM -> 20190901 +</defines> + +<isa> +x86_32:sse2 +x86_64:sse2 +x86_32:ssse3 +x86_64:ssse3 +arm32:neon +arm64:neon +</isa> + +<arch> +x86_32 +x86_64 +arm32 +arm64 +</arch> + +<requires> +simd +</requires> + +<cc> +gcc +clang +msvc:19.10 # VC 2017 +sunstudio +</cc> diff 
--git a/src/lib/block/shacal2/shacal2_x86/info.txt b/src/lib/block/shacal2/shacal2_x86/info.txt index 723400f76..298833048 100644 --- a/src/lib/block/shacal2/shacal2_x86/info.txt +++ b/src/lib/block/shacal2/shacal2_x86/info.txt @@ -8,6 +8,7 @@ shacal2 <isa> sha +sse2 ssse3 </isa> diff --git a/src/lib/hash/sha1/sha1_x86/info.txt b/src/lib/hash/sha1/sha1_x86/info.txt index 9dba8bf00..0a46d980a 100644 --- a/src/lib/hash/sha1/sha1_x86/info.txt +++ b/src/lib/hash/sha1/sha1_x86/info.txt @@ -4,6 +4,7 @@ SHA1_X86_SHA_NI -> 20170518 <isa> sha +sse2 ssse3 sse41 </isa> diff --git a/src/lib/hash/sha2_32/sha2_32_x86/info.txt b/src/lib/hash/sha2_32/sha2_32_x86/info.txt index bc167ef04..8d9fb4149 100644 --- a/src/lib/hash/sha2_32/sha2_32_x86/info.txt +++ b/src/lib/hash/sha2_32/sha2_32_x86/info.txt @@ -4,6 +4,7 @@ SHA2_32_X86 -> 20170518 <isa> sha +sse2 ssse3 sse41 </isa> diff --git a/src/lib/modes/aead/gcm/clmul/info.txt b/src/lib/modes/aead/gcm/clmul/info.txt index b8d45cda4..d4b6a1c1f 100644 --- a/src/lib/modes/aead/gcm/clmul/info.txt +++ b/src/lib/modes/aead/gcm/clmul/info.txt @@ -3,6 +3,8 @@ GCM_CLMUL -> 20131227 </defines> <isa> +sse2 +ssse3 aesni </isa> diff --git a/src/lib/modes/aead/gcm/clmul_ssse3/info.txt b/src/lib/modes/aead/gcm/clmul_ssse3/info.txt index 8e4e143bb..47fc290cf 100644 --- a/src/lib/modes/aead/gcm/clmul_ssse3/info.txt +++ b/src/lib/modes/aead/gcm/clmul_ssse3/info.txt @@ -3,6 +3,7 @@ GCM_CLMUL_SSSE3 -> 20171020 </defines> <isa> +sse2 ssse3 </isa> diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h index 256c6cc57..f50f40f1d 100644 --- a/src/lib/utils/cpuid/cpuid.h +++ b/src/lib/utils/cpuid/cpuid.h @@ -303,6 +303,21 @@ class BOTAN_PUBLIC_API(2,1) CPUID final { return has_cpuid_bit(CPUID_RDSEED_BIT); } #endif + /** + * Check if the processor supports byte-level vector permutes + * (SSSE3, NEON, Altivec) + */ + static bool has_vperm() + { +#if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY) + return has_ssse3(); +#elif defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) + return has_neon(); +#else + return false; +#endif + } + /* * Clear a CPUID bit * Call CPUID::initialize to reset diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h index 7b6929c6d..6f3134bce 100644 --- a/src/lib/utils/simd/simd_32.h +++ b/src/lib/utils/simd/simd_32.h @@ -167,7 +167,15 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in))); + +#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + return l.bswap(); +#elif defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + return l; +#else return CPUID::is_big_endian() ? l.bswap() : l; +#endif + #else SIMD_4x32 out; Botan::load_le(out.m_simd.val, static_cast<const uint8_t*>(in), 4); @@ -181,11 +189,9 @@ class SIMD_4x32 final static SIMD_4x32 load_be(const void* in) { #if defined(BOTAN_SIMD_USE_SSE2) - return load_le(in).bswap(); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - uint32_t R[4]; Botan::load_be(R, static_cast<const uint8_t*>(in), 4); return SIMD_4x32(R); @@ -193,7 +199,14 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in))); + +#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + return l.bswap(); +#elif defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + return l; +#else return CPUID::is_little_endian() ? 
l.bswap() : l; +#endif #else SIMD_4x32 out; @@ -214,7 +227,7 @@ class SIMD_4x32 final { #if defined(BOTAN_SIMD_USE_SSE2) - _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_simd); + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), raw()); #elif defined(BOTAN_SIMD_USE_ALTIVEC) @@ -222,19 +235,26 @@ class SIMD_4x32 final __vector unsigned int V; uint32_t R[4]; } vec; - vec.V = m_simd; + vec.V = raw(); Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); #elif defined(BOTAN_SIMD_USE_NEON) - if(CPUID::is_big_endian()) +#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); +#elif defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); +#else + if(CPUID::is_little_endian()) { - bswap().store_le(out); + vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); } else { - vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); } +#endif + #else Botan::store_le(out, m_simd.val[0], m_simd.val[1], m_simd.val[2], m_simd.val[3]); #endif @@ -260,14 +280,20 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) +#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(m_simd); +#elif defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); +#else if(CPUID::is_little_endian()) { - bswap().store_le(out); + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); } else { vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); } +#endif #else Botan::store_be(out, m_simd.val[0], m_simd.val[1], m_simd.val[2], m_simd.val[3]); |