-rwxr-xr-x | configure.py                                 |  53
-rw-r--r-- | doc/side_channels.rst                        |  11
-rw-r--r-- | src/build-data/cc/clang.txt                  |   2
-rw-r--r-- | src/build-data/cc/gcc.txt                    |   2
-rw-r--r-- | src/build-data/policy/bsi.txt                |   2
-rw-r--r-- | src/build-data/policy/modern.txt             |   2
-rw-r--r-- | src/build-data/policy/nist.txt               |   2
-rw-r--r-- | src/lib/block/aes/aes.cpp                    | 158
-rw-r--r-- | src/lib/block/aes/aes.h                      |  24
-rw-r--r-- | src/lib/block/aes/aes_ni/info.txt            |   2
-rw-r--r-- | src/lib/block/aes/aes_ssse3/aes_ssse3.cpp    | 517
-rw-r--r-- | src/lib/block/aes/aes_ssse3/info.txt         |  18
-rw-r--r-- | src/lib/block/aes/aes_vperm/aes_vperm.cpp    | 634
-rw-r--r-- | src/lib/block/aes/aes_vperm/info.txt         |  30
-rw-r--r-- | src/lib/block/shacal2/shacal2_x86/info.txt   |   1
-rw-r--r-- | src/lib/hash/sha1/sha1_x86/info.txt          |   1
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_x86/info.txt    |   1
-rw-r--r-- | src/lib/modes/aead/gcm/clmul/info.txt        |   2
-rw-r--r-- | src/lib/modes/aead/gcm/clmul_ssse3/info.txt  |   1
-rw-r--r-- | src/lib/utils/cpuid/cpuid.h                  |  15
-rw-r--r-- | src/lib/utils/simd/simd_32.h                 |  42
21 files changed, 885 insertions, 635 deletions
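Overview (not part of the patch): the configure.py changes below teach module <isa> blocks to scope an ISA extension to one architecture via compound "arch:isa" entries, and the new ModuleInfo.isas_needed() filters a module's list down to what applies to the target arch. A minimal Python sketch of that filtering, assuming the <isa> list of the new aes_vperm module as input; the helper below is illustrative only and mirrors, not reproduces, the patched code:

    def isas_needed(module_isas, arch):
        # Keep plain entries (apply everywhere); keep arch-scoped entries
        # only when the prefix matches, with the prefix stripped.
        isas = []
        for isa in module_isas:
            if ':' not in isa:
                isas.append(isa)
            elif isa.startswith(arch + ':'):
                isas.append(isa.split(':', 1)[1])
        return isas

    vperm_isas = ['x86_32:sse2', 'x86_64:sse2', 'x86_32:ssse3',
                  'x86_64:ssse3', 'arm32:neon', 'arm64:neon']
    print(isas_needed(vperm_isas, 'x86_64'))  # ['sse2', 'ssse3']
    print(isas_needed(vperm_isas, 'arm64'))   # ['neon']

This is why a single aes_vperm module can replace aes_ssse3 and still get -mssse3 on x86 builds and plain NEON codegen on ARM builds.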
diff --git a/configure.py b/configure.py index 8fe2a1aaa..21459edc8 100755 --- a/configure.py +++ b/configure.py @@ -901,8 +901,17 @@ class ModuleInfo(InfoObject): if supp_arch not in arch_info: raise InternalError('Module %s mentions unknown arch %s' % (self.infofile, supp_arch)) + def known_isa(isa): + if isa in all_isa_extn: + return True + + compound_isa = isa.split(':') + if len(compound_isa) == 2 and compound_isa[0] in arch_info and compound_isa[1] in all_isa_extn: + return True + return False + for isa in self.isa: - if isa not in all_isa_extn: + if not known_isa(isa): raise InternalError('Module %s uses unknown ISA extension %s' % (self.infofile, isa)) def sources(self): @@ -917,6 +926,17 @@ class ModuleInfo(InfoObject): def external_headers(self): return self.header_external + def isas_needed(self, arch): + isas = [] + + for isa in self.isa: + if isa.find(':') == -1: + isas.append(isa) + elif isa.startswith(arch + ':'): + isas.append(isa[len(arch)+1:]) + + return isas + def defines(self): return [(key + ' ' + value) for key, value in self._defines.items()] @@ -925,6 +945,12 @@ class ModuleInfo(InfoObject): cpu_name = options.cpu for isa in self.isa: + if isa.find(':') > 0: + (arch, isa) = isa.split(':') + + if arch != arch_name: + continue + if isa in options.disable_intrinsics: return False # explicitly disabled @@ -1182,11 +1208,19 @@ class CompilerInfo(InfoObject): # pylint: disable=too-many-instance-attributes raise InternalError("Compiler %s has so_link_command for unknown OS %s" % (self.infofile, os_name)) def isa_flags_for(self, isa, arch): + if isa.find(':') > 0: + (isa_arch, isa) = isa.split(':') + if isa_arch != arch: + return '' + if isa in self.isa_flags: + return self.isa_flags[isa] + if isa in self.isa_flags: return self.isa_flags[isa] arch_isa = '%s:%s' % (arch, isa) if arch_isa in self.isa_flags: return self.isa_flags[arch_isa] + return None def get_isa_specific_flags(self, isas, arch, options): @@ -1734,7 +1768,7 @@ def generate_build_info(build_paths, modules, cc, arch, osinfo, options): if src in module_that_owns: module = module_that_owns[src] - isas = module.isa + isas = module.isas_needed(arch.basename) if 'simd' in module.dependencies(osinfo): isas.append('simd') @@ -2190,10 +2224,6 @@ class ModulesChooser(object): sorted_modules_to_load = sorted(modules_to_load) for modname in sorted_modules_to_load: - if modname.startswith('simd_') and modname != 'simd_engine': - logging.info('Using SIMD module ' + modname) - - for modname in sorted_modules_to_load: if all_modules[modname].comment: logging.info('%s: %s' % (modname, all_modules[modname].comment)) if all_modules[modname].warning: @@ -2615,8 +2645,9 @@ class AmalgamationGenerator(object): def _target_for_module(self, mod): target = '' if not self._options.single_amalgamation_file: - if mod.isa != []: - target = '_'.join(sorted(mod.isa)) + isas = mod.isas_needed(self._options.arch) + if isas != []: + target = '_'.join(sorted(isas)) if target == 'sse2' and self._options.arch == 'x86_64': target = '' # SSE2 is always available on x86-64 @@ -2629,9 +2660,9 @@ class AmalgamationGenerator(object): # Only first module for target is considered. Does this make sense? 
if self._target_for_module(mod) == target: out = set() - for isa in mod.isa: + for isa in mod.isas_needed(self._options.arch): if isa == 'aesni': - isa = "aes,ssse3,pclmul" + isa = "aes,pclmul" elif isa == 'rdrand': isa = 'rdrnd' out.add(isa) @@ -3331,7 +3362,7 @@ def main(argv): cc_arch = check_compiler_arch(options, cc, info_arch, source_paths) if cc_arch is not None and cc_arch != options.arch: - logging.warning("Configured target is %s but compiler probe indicates %s", options.arch, cc_arch) + logging.error("Configured target is %s but compiler probe indicates %s", options.arch, cc_arch) else: cc_min_version = options.cc_min_version or "0.0" diff --git a/doc/side_channels.rst b/doc/side_channels.rst index f18625911..5fe660171 100644 --- a/doc/side_channels.rst +++ b/doc/side_channels.rst @@ -244,12 +244,11 @@ Some x86, ARMv8 and POWER processors support AES instructions which are fast and are thought to be side channel silent. These instructions are used when available. -On x86 processors without AES-NI but with SSSE3 (which includes older Intel -Atoms and Core2 Duos, and even now some embedded or low power x86 chips), a -version of AES using pshufb is used which is both fast and side channel silent. -It is based on code by Mike Hamburg [VectorAes], see aes_ssse3.cpp. This same -technique could be applied with NEON or AltiVec, and the paper suggests some -optimizations for the AltiVec shuffle. +On CPUs which do not have hardware AES instructions but do support SIMD vectors +with a byte shuffle (including x86's SSSE3 and ARM's NEON), a version of AES is +implemented which is side channel silent. This version is based on code by Mike +Hamburg [VectorAes], see aes_vperm.cpp. This same technique could be applied +with AltiVec, and the paper suggests some optimizations for the AltiVec shuffle. On all other processors, a table lookup version (T-tables) is used. 
This approach is relatively fast, but known to be very vulnerable to side diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt index 6614d5a63..0312055aa 100644 --- a/src/build-data/cc/clang.txt +++ b/src/build-data/cc/clang.txt @@ -52,7 +52,7 @@ sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" bmi2 -> "-mbmi -mbmi2" -aesni -> "-maes -mpclmul -mssse3" +aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" sha -> "-msha" diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index cc3ce99e1..7393358a6 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -56,7 +56,7 @@ sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" bmi2 -> "-mbmi -mbmi2" -aesni -> "-maes -mpclmul -mssse3" +aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" sha -> "-msha" diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt index 04a29b817..d89f4433d 100644 --- a/src/build-data/policy/bsi.txt +++ b/src/build-data/policy/bsi.txt @@ -51,7 +51,7 @@ hmac_drbg <if_available> # block aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt index b417a2342..2a9c12613 100644 --- a/src/build-data/policy/modern.txt +++ b/src/build-data/policy/modern.txt @@ -59,7 +59,7 @@ locking_allocator http_util # needed by x509 for OCSP online checks aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 serpent_simd diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt index 573c1c721..2fdf60ea6 100644 --- a/src/build-data/policy/nist.txt +++ b/src/build-data/policy/nist.txt @@ -48,7 +48,7 @@ rfc3394 <if_available> # block aes_ni -aes_ssse3 +aes_vperm aes_armv8 aes_power8 diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 568dfb1b3..2813a5f5a 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -456,13 +456,6 @@ const char* aes_provider() } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return "ssse3"; - } -#endif - #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_ppc_crypto()) { @@ -477,6 +470,13 @@ const char* aes_provider() } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return "vperm"; + } +#endif + return "base"; } @@ -501,13 +501,6 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -522,6 +515,13 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -536,13 +536,6 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -557,6 +550,13 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -569,10 +569,24 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) } #endif -#if 
defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -598,13 +612,6 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -619,6 +626,13 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -633,13 +647,6 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -654,6 +661,13 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -666,10 +680,24 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -695,13 +723,6 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -716,6 +737,13 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -730,13 +758,6 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -751,6 +772,13 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -763,10 +791,24 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + 
if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) { - return ssse3_key_schedule(key, length); + return vperm_key_schedule(key, length); } #endif diff --git a/src/lib/block/aes/aes.h b/src/lib/block/aes/aes.h index 294cdcad3..6083467b6 100644 --- a/src/lib/block/aes/aes.h +++ b/src/lib/block/aes/aes.h @@ -31,10 +31,10 @@ class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, private: void key_schedule(const uint8_t key[], size_t length) override; -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -74,10 +74,10 @@ class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -120,10 +120,10 @@ class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) diff --git a/src/lib/block/aes/aes_ni/info.txt b/src/lib/block/aes/aes_ni/info.txt index 7fff19923..2e9749fb8 100644 --- a/src/lib/block/aes/aes_ni/info.txt +++ b/src/lib/block/aes/aes_ni/info.txt @@ -3,5 +3,7 @@ AES_NI -> 20131128 </defines> <isa> +sse2 +ssse3 aesni </isa> diff --git a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp deleted file mode 100644 index fa8bf4faa..000000000 --- a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp +++ /dev/null @@ -1,517 +0,0 @@ -/* -* AES using SSSE3 -* (C) 2010,2016,2019 Jack Lloyd -* -* This is more or less a direct translation of public domain x86-64 -* assembly written by Mike Hamburg, described in "Accelerating AES -* with Vector Permute Instructions" (CHES 2009). 
His original code is -* available at https://crypto.stanford.edu/vpaes/ -* -* Botan is released under the Simplified BSD License (see license.txt) -*/ - -#include <botan/aes.h> -#include <botan/internal/ct_utils.h> -#include <botan/internal/simd_32.h> - -#if defined(BOTAN_SIMD_USE_SSE2) - #include <tmmintrin.h> -#elif defined(BOTAN_SIMD_USE_NEON) - #include <arm_neon.h> -#endif - -namespace Botan { - -namespace { - -inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw())); -#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64) - - const int8x16_t tbl = vreinterpretq_s8_m128i(a.raw()); - const uint8x16_t idx = vreinterpretq_u8_m128i(b.raw()); - - // fixme use vdupq_n_s8 - const uint8_t alignas(16) mask[16] = { - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F - }; - - const uint8x16_t idx_masked = - vandq_u8(idx, vld1q_u8(mask)); // avoid using meaningless bits - - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); -#else - #error "No shuffle implementation available" -#endif - } - -template<size_t I1, size_t I2, size_t I3, size_t I4> -inline SIMD_4x32 shuffle32(SIMD_4x32 x) - { - return SIMD_4x32(_mm_shuffle_epi32(x.raw(), _MM_SHUFFLE(I1, I2, I3, I4))); - } - -template<size_t I> -inline SIMD_4x32 slli(SIMD_4x32 x) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I)); -#else - #error "No ssli implementation available" -#endif - } - -inline SIMD_4x32 zero_top_half(SIMD_4x32 x) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_slli_si128(_mm_srli_si128(x.raw(), 8), 8)); -#else - #error "No zero_top_half implementation available" -#endif - } - -template<int C> -inline SIMD_4x32 alignr(SIMD_4x32 a, SIMD_4x32 b) - { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), C)); -#else - #error "No alignr implementation available" -#endif - } - -const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090); -const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC); - -const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309); -const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C); - -const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E); -const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1); - -const SIMD_4x32 mc_forward[4] = { - SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), - SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), - SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), - SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09) -}; - -const SIMD_4x32 sr[4] = { - SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C), - SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C), - SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C), - SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C), -}; - -const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F); -const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); - -inline SIMD_4x32 low_nibs(SIMD_4x32 x) - { - return lo_nibs_mask & x; - } - -inline SIMD_4x32 high_nibs(SIMD_4x32 x) - { - return (hi_nibs_mask & x).shr<4>(); - } - -SIMD_4x32 aes_vperm_encrypt(SIMD_4x32 B, const uint32_t* keys, size_t rounds) - { - const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955); - const SIMD_4x32 
sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8); - - const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A); - const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1); - - const SIMD_4x32 mc_backward[4] = { - SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F), - SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B), - SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407), - SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003), - }; - - B = shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]); - - for(size_t r = 1; ; ++r) - { - const SIMD_4x32 K(&keys[4*r]); - - SIMD_4x32 t = high_nibs(B); - B = low_nibs(B); - - SIMD_4x32 t2 = shuffle(k_inv2, B); - - B ^= t; - - SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B); - - SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3); - SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - if(r == rounds) - { - return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]); - } - - SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K; - - SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]); - - B = shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8; - } - } - -SIMD_4x32 aes_vperm_decrypt(SIMD_4x32 B, const uint32_t keys[], size_t rounds) - { - const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E); - const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772); - - const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50); - const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E); - - const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004); - const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B); - - const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13); - const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D); - - const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6); - const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E); - - const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9); - const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159); - - SIMD_4x32 mc(mc_forward[3]); - - B = shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]); - - for(size_t r = 1; ; ++r) - { - const SIMD_4x32 K(&keys[4*r]); - - SIMD_4x32 t = high_nibs(B); - B = low_nibs(B); - - SIMD_4x32 t2 = shuffle(k_inv2, B); - - B ^= t; - - const SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - const SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B); - const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3); - const SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - if(r == rounds) - { - const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K; - const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16; - return shuffle(x, sr[which_sr]); - } - - const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K; - const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6); - const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6); - - B = shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6); - - mc = alignr<12>(mc, mc); - } - } - -void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, - const uint32_t keys[], size_t rounds) - { - 
CT::poison(in, blocks * 16); - - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) - { - SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? - B = aes_vperm_encrypt(B, keys, rounds); - B.store_le(out + i*16); - } - - CT::unpoison(in, blocks * 16); - CT::unpoison(out, blocks * 16); - } - -void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, - const uint32_t keys[], size_t rounds) - { - CT::poison(in, blocks * 16); - - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) - { - SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? - B = aes_vperm_decrypt(B, keys, rounds); - B.store_le(out + i*16); - } - - CT::unpoison(in, blocks * 16); - CT::unpoison(out, blocks * 16); - } - -} - -void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 10); - } - -void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 10); - } - -void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 12); - } - -void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 12); - } - -void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 14); - } - -void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const - { - vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 14); - } - -namespace { - -SIMD_4x32 aes_schedule_transform(SIMD_4x32 input, - SIMD_4x32 table_1, - SIMD_4x32 table_2) - { - return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); - } - -SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); - const SIMD_4x32 srx(sr[round_no % 4]); - - SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0); - SIMD_4x32 t2 = t; - t = shuffle(t, mc_forward0); - t2 = t ^ t2 ^ shuffle(t, mc_forward0); - return shuffle(t2, srx); - } - -SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); - - const SIMD_4x32 dsk[8] = { - SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334), - SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC), - SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A), - SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C), - SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9), - SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D), - SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3), - SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4), - }; - - SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]); - SIMD_4x32 output = shuffle(t, mc_forward0); - - t = aes_schedule_transform(t, dsk[2], dsk[3]); - output = shuffle(t ^ output, mc_forward0); - - t = aes_schedule_transform(t, dsk[4], dsk[5]); - output = shuffle(t ^ output, mc_forward0); - - t = aes_schedule_transform(t, dsk[6], dsk[7]); - output = shuffle(t ^ output, mc_forward0); - - return shuffle(output, SIMD_4x32(sr[round_no % 4])); - } - -SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) - { - const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); - const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 
0xE10D5DB1); - - k = shuffle(k, SIMD_4x32(sr[round_no % 4])); - k ^= SIMD_4x32::splat_u8(0x5B); - return aes_schedule_transform(k, out_tr1, out_tr2); - } - -SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k) - { - const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); - const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); - - k ^= SIMD_4x32::splat_u8(0x5B); - return aes_schedule_transform(k, deskew1, deskew2); - } - -SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) - { - SIMD_4x32 smeared = input2 ^ slli<1>(input2); - smeared ^= slli<2>(smeared); - smeared ^= SIMD_4x32::splat_u8(0x5B); - - SIMD_4x32 t = high_nibs(input1); - input1 = low_nibs(input1); - - SIMD_4x32 t2 = shuffle(k_inv2, input1); - - input1 ^= t; - - SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1); - - SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3); - SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); - - return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6); - } - -SIMD_4x32 aes_schedule_round(SIMD_4x32& rcon, SIMD_4x32 input1, SIMD_4x32 input2) - { - input2 ^= alignr<15>(SIMD_4x32(), rcon); - rcon = alignr<15>(rcon, rcon); - input1 = shuffle32<3,3,3,3>(input1); - input1 = alignr<1>(input1, input1); - - return aes_schedule_round(input1, input2); - } - -SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) - { - return y ^ shuffle32<3,3,3,2>(x) ^ shuffle32<2,0,0,0>(y); - } - -} - -void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(11*4); - m_DK.resize(11*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key = SIMD_4x32::load_le(keyb); - - shuffle(key, sr[2]).store_le(&m_DK[4*10]); - - key = aes_schedule_transform(key, k_ipt1, k_ipt2); - key.store_le(&m_EK[0]); - - for(size_t i = 1; i != 10; ++i) - { - key = aes_schedule_round(rcon, key, key); - - aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]); - - aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]); - } - - key = aes_schedule_round(rcon, key, key); - aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]); - aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]); - } - -void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(13*4); - m_DK.resize(13*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); - SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8); - - shuffle(key1, sr[0]).store_le(&m_DK[12*4]); - - key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); - key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); - - key1.store_le(&m_EK[0]); - - for(size_t i = 0; i != 4; ++i) - { - // key2 with 8 high bytes masked off - SIMD_4x32 t = zero_top_half(key2); - key2 = aes_schedule_round(rcon, key2, key1); - - // fixme cse - aes_schedule_mangle(alignr<8>(key2, t), (i+3)%4).store_le(&m_EK[4*(3*i+1)]); - aes_schedule_mangle_dec(alignr<8>(key2, t), (i+3)%4).store_le(&m_DK[4*(11-3*i)]); - - t = aes_schedule_192_smear(key2, t); - - aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]); - aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]); - - key2 = aes_schedule_round(rcon, t, key2); - - if(i == 3) - { - aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); - aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]); - } - else - { - aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); - aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]); - } - - key1 = key2; - key2 = 
aes_schedule_192_smear(key2, zero_top_half(t)); - } - } - -void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t) - { - m_EK.resize(15*4); - m_DK.resize(15*4); - - SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808); - - SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); - SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16); - - shuffle(key1, sr[2]).store_le(&m_DK[4*14]); - - key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); - key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); - - key1.store_le(&m_EK[0]); - aes_schedule_mangle(key2, 3).store_le(&m_EK[4]); - - aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]); - - for(size_t i = 2; i != 14; i += 2) - { - const SIMD_4x32 k_t = key2; - key1 = key2 = aes_schedule_round(rcon, key2, key1); - - aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]); - aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]); - - key2 = aes_schedule_round(shuffle32<3,3,3,3>(key2), k_t); - - aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]); - aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]); - } - - key2 = aes_schedule_round(rcon, key2, key1); - - aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]); - aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]); - } - -} diff --git a/src/lib/block/aes/aes_ssse3/info.txt b/src/lib/block/aes/aes_ssse3/info.txt deleted file mode 100644 index 49d9a9214..000000000 --- a/src/lib/block/aes/aes_ssse3/info.txt +++ /dev/null @@ -1,18 +0,0 @@ -<defines> -AES_SSSE3 -> 20131128 -</defines> - -<isa> -ssse3 -</isa> - -<requires> -simd -</requires> - -<cc> -gcc -clang -msvc:19.10 # VC 2017 -sunstudio -</cc> diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp new file mode 100644 index 000000000..b7e82876c --- /dev/null +++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp @@ -0,0 +1,634 @@ +/* +* AES using vector permutes (SSSE3, NEON) +* (C) 2010,2016,2019 Jack Lloyd +* +* Based on public domain x86-64 assembly written by Mike Hamburg, +* described in "Accelerating AES with Vector Permute Instructions" +* (CHES 2009). 
His original code is available at +* https://crypto.stanford.edu/vpaes/ +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/internal/ct_utils.h> +#include <botan/internal/simd_32.h> + +#if defined(BOTAN_SIMD_USE_SSE2) + #include <tmmintrin.h> +#endif + +namespace Botan { + +namespace { + +inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw())); +#elif defined(BOTAN_SIMD_USE_NEON) + const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw()); + const uint8x16_t idx = vreinterpretq_u8_u32(b.raw()); + +#if defined(BOTAN_TARGET_ARCH_IS_ARM32) + const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) }; + + return SIMD_4x32(vreinterpretq_u32_u8( + vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)), + vtbl2_u8(tbl2, vget_high_u8(idx))))); + +#else + return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx))); +#endif + +#else + #error "No shuffle implementation available" +#endif + } + +template<size_t I> +inline SIMD_4x32 shift_elems_left(SIMD_4x32 x) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I)); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I))); +#else + #error "No shift_elems_left implementation available" +#endif + } + +inline SIMD_4x32 alignr8(SIMD_4x32 a, SIMD_4x32 b) + { +#if defined(BOTAN_SIMD_USE_SSE2) + return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8)); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), 8))); +#else + #error "No alignr8 implementation available" +#endif + } + +const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090); +const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC); + +const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309); +const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C); + +const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E); +const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1); + +const SIMD_4x32 mc_forward[4] = { + SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), + SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), + SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), + SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09) +}; + +const SIMD_4x32 sr[4] = { + SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C), + SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C), + SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C), + SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C), +}; + +const SIMD_4x32 rcon[10] = { + SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000), + SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000), +}; + +const SIMD_4x32 lo_nibs_mask = 
SIMD_4x32::splat_u8(0x0F); +const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); +const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B); + +inline SIMD_4x32 low_nibs(SIMD_4x32 x) + { + return lo_nibs_mask & x; + } + +inline SIMD_4x32 high_nibs(SIMD_4x32 x) + { + return (hi_nibs_mask & x).shr<4>(); + } + +inline SIMD_4x32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) + { + return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K; + } + +inline SIMD_4x32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955); + const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8); + + const SIMD_4x32 mc_backward[4] = { + SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F), + SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B), + SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407), + SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003), + }; + + const SIMD_4x32 Bh = high_nibs(B); + SIMD_4x32 Bl = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, Bl); + Bl ^= Bh; + + const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl)); + + const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K; + const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]); + + return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8; + } + +inline SIMD_4x32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A); + const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1); + + const SIMD_4x32 Bh = high_nibs(B); + SIMD_4x32 Bl = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, Bl); + Bl ^= Bh; + + const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl)); + + return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]); + } + +inline SIMD_4x32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) + { + const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E); + const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772); + + return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K; + } + +inline SIMD_4x32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50); + const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E); + + const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004); + const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B); + + const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13); + const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D); + + const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6); + const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E); + + const SIMD_4x32 mcx[4] = { + SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09), + SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605), + SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201), + SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D), + }; + + const SIMD_4x32 Bh = high_nibs(B); + B = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, 
B); + + B ^= Bh; + + const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B)); + + const SIMD_4x32 mc = mcx[(r-1)%4]; + + const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K; + const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6); + const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6); + return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6); + } + +inline SIMD_4x32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) + { + const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9); + const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159); + + const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16; + + const SIMD_4x32 Bh = high_nibs(B); + B = low_nibs(B); + const SIMD_4x32 t2 = shuffle(k_inv2, B); + + B ^= Bh; + + const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B)); + + const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K; + return shuffle(x, sr[which_sr]); + } + +void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, + const SIMD_4x32 K[], size_t rounds) + { + CT::poison(in, blocks * 16); + + const size_t blocks2 = blocks - (blocks % 2); + + for(size_t i = 0; i != blocks2; i += 2) + { + SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16); + SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16); + + B0 = aes_enc_first_round(B0, K[0]); + B1 = aes_enc_first_round(B1, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B0 = aes_enc_round(B0, K[r], r); + B1 = aes_enc_round(B1, K[r], r); + } + + B0 = aes_enc_last_round(B0, K[rounds], rounds); + B1 = aes_enc_last_round(B1, K[rounds], rounds); + + B0.store_le(out + i*16); + B1.store_le(out + (i+1)*16); + } + + for(size_t i = blocks2; i < blocks; ++i) + { + SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? + + B = aes_enc_first_round(B, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B = aes_enc_round(B, K[r], r); + } + + B = aes_enc_last_round(B, K[rounds], rounds); + B.store_le(out + i*16); + } + + CT::unpoison(in, blocks * 16); + CT::unpoison(out, blocks * 16); + } + +void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, + const SIMD_4x32 K[], size_t rounds) + { + CT::poison(in, blocks * 16); + + const size_t blocks2 = blocks - (blocks % 2); + + for(size_t i = 0; i != blocks2; i += 2) + { + SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16); + SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16); + + B0 = aes_dec_first_round(B0, K[0]); + B1 = aes_dec_first_round(B1, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B0 = aes_dec_round(B0, K[r], r); + B1 = aes_dec_round(B1, K[r], r); + } + + B0 = aes_dec_last_round(B0, K[rounds], rounds); + B1 = aes_dec_last_round(B1, K[rounds], rounds); + + B0.store_le(out + i*16); + B1.store_le(out + (i+1)*16); + } + + for(size_t i = blocks2; i < blocks; ++i) + { + SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ??? 
+ + B = aes_dec_first_round(B, K[0]); + + for(size_t r = 1; r != rounds; ++r) + { + B = aes_dec_round(B, K[r], r); + } + + B = aes_dec_last_round(B, K[rounds], rounds); + B.store_le(out + i*16); + } + + CT::unpoison(in, blocks * 16); + CT::unpoison(out, blocks * 16); + } + +} + +void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[11] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 10); + } + +void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[11] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 10); + } + +void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[13] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), + SIMD_4x32(&m_EK[4*12]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 12); + } + +void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[13] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), + SIMD_4x32(&m_DK[4*12]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 12); + } + +void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[15] = { + SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), + SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), + SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), + SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), + SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]), + }; + + return vperm_encrypt_blocks(in, out, blocks, K, 14); + } + +void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const SIMD_4x32 K[15] = { + SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), + SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), + SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), + SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), + SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]), + }; + + return vperm_decrypt_blocks(in, out, blocks, K, 14); + } + +namespace { + +SIMD_4x32 aes_schedule_transform(SIMD_4x32 input, + SIMD_4x32 table_1, + SIMD_4x32 table_2) + { + return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); + } + +SIMD_4x32 
aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); + const SIMD_4x32 srx(sr[round_no % 4]); + + SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0); + SIMD_4x32 t2 = t; + t = shuffle(t, mc_forward0); + t2 = t ^ t2 ^ shuffle(t, mc_forward0); + return shuffle(t2, srx); + } + +SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); + + const SIMD_4x32 dsk[8] = { + SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334), + SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC), + SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A), + SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C), + SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9), + SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D), + SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3), + SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4), + }; + + SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]); + SIMD_4x32 output = shuffle(t, mc_forward0); + + t = aes_schedule_transform(t, dsk[2], dsk[3]); + output = shuffle(t ^ output, mc_forward0); + + t = aes_schedule_transform(t, dsk[4], dsk[5]); + output = shuffle(t ^ output, mc_forward0); + + t = aes_schedule_transform(t, dsk[6], dsk[7]); + output = shuffle(t ^ output, mc_forward0); + + return shuffle(output, sr[round_no % 4]); + } + +SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) + { + const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); + const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1); + + k = shuffle(k, sr[round_no % 4]); + k ^= xor_5B; + return aes_schedule_transform(k, out_tr1, out_tr2); + } + +SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k) + { + const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); + const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); + + k ^= xor_5B; + return aes_schedule_transform(k, deskew1, deskew2); + } + +SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) + { + SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2); + smeared ^= shift_elems_left<2>(smeared); + smeared ^= xor_5B; + + SIMD_4x32 t = high_nibs(input1); + input1 = low_nibs(input1); + + SIMD_4x32 t2 = shuffle(k_inv2, input1); + + input1 ^= t; + + SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); + SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1); + + SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3); + SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); + + return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6); + } + +SIMD_4x32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2) + { + // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3))); + const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D); + return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc); + } + +SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) + { + const SIMD_4x32 shuffle3332 = + SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C); + const SIMD_4x32 shuffle2000 = + SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908); + + const SIMD_4x32 zero_top_half(0, 0, ~0, ~0); + y &= zero_top_half; + return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000); + } + +} + +void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(11*4); + m_DK.resize(11*4); + + SIMD_4x32 key = SIMD_4x32::load_le(keyb); + + shuffle(key, 
sr[2]).store_le(&m_DK[4*10]); + + key = aes_schedule_transform(key, k_ipt1, k_ipt2); + key.store_le(&m_EK[0]); + + for(size_t i = 1; i != 10; ++i) + { + key = aes_schedule_round(rcon[i-1], key, key); + + aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]); + + aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]); + } + + key = aes_schedule_round(rcon[9], key, key); + aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]); + aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]); + } + +void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(13*4); + m_DK.resize(13*4); + + SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); + SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8); + + shuffle(key1, sr[0]).store_le(&m_DK[12*4]); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + key1.store_le(&m_EK[0]); + + for(size_t i = 0; i != 4; ++i) + { + // key2 with 8 high bytes masked off + SIMD_4x32 t = key2; + key2 = aes_schedule_round(rcon[2*i], key2, key1); + + const SIMD_4x32 key2t = alignr8(key2, t); + aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]); + aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]); + + t = aes_schedule_192_smear(key2, t); + + aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]); + aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]); + + key2 = aes_schedule_round(rcon[2*i+1], t, key2); + + if(i == 3) + { + aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); + aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]); + } + else + { + aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]); + aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]); + } + + key1 = key2; + key2 = aes_schedule_192_smear(key2, t); + } + } + +void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t) + { + m_EK.resize(15*4); + m_DK.resize(15*4); + + SIMD_4x32 key1 = SIMD_4x32::load_le(keyb); + SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16); + + shuffle(key1, sr[2]).store_le(&m_DK[4*14]); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + key1.store_le(&m_EK[0]); + aes_schedule_mangle(key2, 3).store_le(&m_EK[4]); + + aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]); + + const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C); + + for(size_t i = 2; i != 14; i += 2) + { + const SIMD_4x32 k_t = key2; + key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1); + + aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]); + aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]); + + key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t); + + aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]); + aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]); + } + + key2 = aes_schedule_round(rcon[6], key2, key1); + + aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]); + aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]); + } + +} diff --git a/src/lib/block/aes/aes_vperm/info.txt b/src/lib/block/aes/aes_vperm/info.txt new file mode 100644 index 000000000..f771ca2c3 --- /dev/null +++ b/src/lib/block/aes/aes_vperm/info.txt @@ -0,0 +1,30 @@ +<defines> +AES_VPERM -> 20190901 +</defines> + +<isa> +x86_32:sse2 +x86_64:sse2 +x86_32:ssse3 +x86_64:ssse3 +arm32:neon +arm64:neon +</isa> + +<arch> +x86_32 +x86_64 +arm32 +arm64 +</arch> + +<requires> +simd +</requires> + +<cc> +gcc +clang +msvc:19.10 # VC 2017 +sunstudio +</cc> diff 
--git a/src/lib/block/shacal2/shacal2_x86/info.txt b/src/lib/block/shacal2/shacal2_x86/info.txt index 723400f76..298833048 100644 --- a/src/lib/block/shacal2/shacal2_x86/info.txt +++ b/src/lib/block/shacal2/shacal2_x86/info.txt @@ -8,6 +8,7 @@ shacal2 <isa> sha +sse2 ssse3 </isa> diff --git a/src/lib/hash/sha1/sha1_x86/info.txt b/src/lib/hash/sha1/sha1_x86/info.txt index 9dba8bf00..0a46d980a 100644 --- a/src/lib/hash/sha1/sha1_x86/info.txt +++ b/src/lib/hash/sha1/sha1_x86/info.txt @@ -4,6 +4,7 @@ SHA1_X86_SHA_NI -> 20170518 <isa> sha +sse2 ssse3 sse41 </isa> diff --git a/src/lib/hash/sha2_32/sha2_32_x86/info.txt b/src/lib/hash/sha2_32/sha2_32_x86/info.txt index bc167ef04..8d9fb4149 100644 --- a/src/lib/hash/sha2_32/sha2_32_x86/info.txt +++ b/src/lib/hash/sha2_32/sha2_32_x86/info.txt @@ -4,6 +4,7 @@ SHA2_32_X86 -> 20170518 <isa> sha +sse2 ssse3 sse41 </isa> diff --git a/src/lib/modes/aead/gcm/clmul/info.txt b/src/lib/modes/aead/gcm/clmul/info.txt index b8d45cda4..d4b6a1c1f 100644 --- a/src/lib/modes/aead/gcm/clmul/info.txt +++ b/src/lib/modes/aead/gcm/clmul/info.txt @@ -3,6 +3,8 @@ GCM_CLMUL -> 20131227 </defines> <isa> +sse2 +ssse3 aesni </isa> diff --git a/src/lib/modes/aead/gcm/clmul_ssse3/info.txt b/src/lib/modes/aead/gcm/clmul_ssse3/info.txt index 8e4e143bb..47fc290cf 100644 --- a/src/lib/modes/aead/gcm/clmul_ssse3/info.txt +++ b/src/lib/modes/aead/gcm/clmul_ssse3/info.txt @@ -3,6 +3,7 @@ GCM_CLMUL_SSSE3 -> 20171020 </defines> <isa> +sse2 ssse3 </isa> diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h index 256c6cc57..f50f40f1d 100644 --- a/src/lib/utils/cpuid/cpuid.h +++ b/src/lib/utils/cpuid/cpuid.h @@ -303,6 +303,21 @@ class BOTAN_PUBLIC_API(2,1) CPUID final { return has_cpuid_bit(CPUID_RDSEED_BIT); } #endif + /** + * Check if the processor supports byte-level vector permutes + * (SSSE3, NEON, Altivec) + */ + static bool has_vperm() + { +#if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY) + return has_ssse3(); +#elif defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) + return has_neon(); +#else + return false; +#endif + } + /* * Clear a CPUID bit * Call CPUID::initialize to reset diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h index 7b6929c6d..6f3134bce 100644 --- a/src/lib/utils/simd/simd_32.h +++ b/src/lib/utils/simd/simd_32.h @@ -167,7 +167,15 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in))); + +#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + return l.bswap(); +#elif defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + return l; +#else return CPUID::is_big_endian() ? l.bswap() : l; +#endif + #else SIMD_4x32 out; Botan::load_le(out.m_simd.val, static_cast<const uint8_t*>(in), 4); @@ -181,11 +189,9 @@ class SIMD_4x32 final static SIMD_4x32 load_be(const void* in) { #if defined(BOTAN_SIMD_USE_SSE2) - return load_le(in).bswap(); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - uint32_t R[4]; Botan::load_be(R, static_cast<const uint8_t*>(in), 4); return SIMD_4x32(R); @@ -193,7 +199,14 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in))); + +#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + return l.bswap(); +#elif defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + return l; +#else return CPUID::is_little_endian() ? 
l.bswap() : l; +#endif #else SIMD_4x32 out; @@ -214,7 +227,7 @@ class SIMD_4x32 final { #if defined(BOTAN_SIMD_USE_SSE2) - _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_simd); + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), raw()); #elif defined(BOTAN_SIMD_USE_ALTIVEC) @@ -222,19 +235,26 @@ class SIMD_4x32 final __vector unsigned int V; uint32_t R[4]; } vec; - vec.V = m_simd; + vec.V = raw(); Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); #elif defined(BOTAN_SIMD_USE_NEON) - if(CPUID::is_big_endian()) +#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); +#elif defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); +#else + if(CPUID::is_little_endian()) { - bswap().store_le(out); + vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); } else { - vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); } +#endif + #else Botan::store_le(out, m_simd.val[0], m_simd.val[1], m_simd.val[2], m_simd.val[3]); #endif @@ -260,14 +280,20 @@ class SIMD_4x32 final #elif defined(BOTAN_SIMD_USE_NEON) +#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(m_simd); +#elif defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); +#else if(CPUID::is_little_endian()) { - bswap().store_le(out); + vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd)); } else { vst1q_u8(out, vreinterpretq_u8_u32(m_simd)); } +#endif #else Botan::store_be(out, m_simd.val[0], m_simd.val[1], m_simd.val[2], m_simd.val[3]); |