diff options
author | Jack Lloyd <[email protected]> | 2019-09-01 07:16:58 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2019-09-01 09:03:54 -0400 |
commit | 1456e825379a8621fbb8cd8c5c7e7e3201ddd2fe (patch) | |
tree | 24349d15adbd75466557d4a929703e0633cbf2f7 /src/lib/block/aes | |
parent | 3a3a7b38160dbfd76fc0e073b23e7f35e480cbd9 (diff) |
Support NEON for AES vector permutes
Rename aes_ssse3 -> aes_vperm
Diffstat (limited to 'src/lib/block/aes')
-rw-r--r-- | src/lib/block/aes/aes.cpp | 158 | ||||
-rw-r--r-- | src/lib/block/aes/aes.h | 24 | ||||
-rw-r--r-- | src/lib/block/aes/aes_vperm/aes_vperm.cpp (renamed from src/lib/block/aes/aes_ssse3/aes_ssse3.cpp) | 78 | ||||
-rw-r--r-- | src/lib/block/aes/aes_vperm/info.txt (renamed from src/lib/block/aes/aes_ssse3/info.txt) | 4 |
4 files changed, 154 insertions, 110 deletions
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 568dfb1b3..2813a5f5a 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -456,13 +456,6 @@ const char* aes_provider() } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return "ssse3"; - } -#endif - #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_ppc_crypto()) { @@ -477,6 +470,13 @@ const char* aes_provider() } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return "vperm"; + } +#endif + return "base"; } @@ -501,13 +501,6 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -522,6 +515,13 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -536,13 +536,6 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -557,6 +550,13 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -569,10 +569,24 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -598,13 +612,6 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -619,6 +626,13 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -633,13 +647,6 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -654,6 +661,13 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -666,10 +680,24 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) { - return ssse3_key_schedule(key, length); + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_key_schedule(key, length); } #endif @@ -695,13 +723,6 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_encrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -716,6 +737,13 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_encrypt_n(in, out, blocks); + } +#endif + aes_encrypt_n(in, out, blocks, m_EK, m_ME); } @@ -730,13 +758,6 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) - { - return ssse3_decrypt_n(in, out, blocks); - } -#endif - #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { @@ -751,6 +772,13 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return vperm_decrypt_n(in, out, blocks); + } +#endif + aes_decrypt_n(in, out, blocks, m_DK, m_MD); } @@ -763,10 +791,24 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_SSSE3) - if(CPUID::has_ssse3()) +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } +#endif + +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) { - return ssse3_key_schedule(key, length); + return vperm_key_schedule(key, length); } #endif diff --git a/src/lib/block/aes/aes.h b/src/lib/block/aes/aes.h index 294cdcad3..6083467b6 100644 --- a/src/lib/block/aes/aes.h +++ b/src/lib/block/aes/aes.h @@ -31,10 +31,10 @@ class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, private: void key_schedule(const uint8_t key[], size_t length) override; -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -74,10 +74,10 @@ class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) @@ -120,10 +120,10 @@ class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override; private: -#if defined(BOTAN_HAS_AES_SSSE3) - void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void ssse3_key_schedule(const uint8_t key[], size_t length); +#if defined(BOTAN_HAS_AES_VPERM) + void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void vperm_key_schedule(const uint8_t key[], size_t length); #endif #if defined(BOTAN_HAS_AES_NI) diff --git a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp index fa8bf4faa..a36118cbd 100644 --- a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp +++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp @@ -1,5 +1,5 @@ /* -* AES using SSSE3 +* AES using vector permutes (SSSE3, NEON) * (C) 2010,2016,2019 Jack Lloyd * * This is more or less a direct translation of public domain x86-64 @@ -28,39 +28,33 @@ inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b) { #if defined(BOTAN_SIMD_USE_SSE2) return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw())); -#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64) +#elif defined(BOTAN_SIMD_USE_NEON) + const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw()); + const uint8x16_t idx = vreinterpretq_u8_u32(b.raw()); - const int8x16_t tbl = vreinterpretq_s8_m128i(a.raw()); - const uint8x16_t idx = vreinterpretq_u8_m128i(b.raw()); +#if defined(BOTAN_TARGET_ARCH_IS_ARM32) + uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) }; - // fixme use vdupq_n_s8 - const uint8_t alignas(16) mask[16] = { - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, - 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F, 0x8F - }; + return SIMD_4x32(vreinterpretq_u32_u8( + vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)), + vtbl2_u8(tbl2, vget_high_u8(idx))))); - const uint8x16_t idx_masked = - vandq_u8(idx, vld1q_u8(mask)); // avoid using meaningless bits +#else + return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx))); +#endif - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #else #error "No shuffle implementation available" #endif } -template<size_t I1, size_t I2, size_t I3, size_t I4> -inline SIMD_4x32 shuffle32(SIMD_4x32 x) - { - return SIMD_4x32(_mm_shuffle_epi32(x.raw(), _MM_SHUFFLE(I1, I2, I3, I4))); - } - template<size_t I> inline SIMD_4x32 slli(SIMD_4x32 x) { #if defined(BOTAN_SIMD_USE_SSE2) return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I)); -#else - #error "No ssli implementation available" +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I))); #endif } @@ -68,8 +62,10 @@ inline SIMD_4x32 zero_top_half(SIMD_4x32 x) { #if defined(BOTAN_SIMD_USE_SSE2) return SIMD_4x32(_mm_slli_si128(_mm_srli_si128(x.raw(), 8), 8)); -#else - #error "No zero_top_half implementation available" +#elif defined(BOTAN_SIMD_USE_NEON) + // fixme do better ? + SIMD_4x32 mask(0, 0, ~0, ~0); + return x & mask; #endif } @@ -78,8 +74,8 @@ inline SIMD_4x32 alignr(SIMD_4x32 a, SIMD_4x32 b) { #if defined(BOTAN_SIMD_USE_SSE2) return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), C)); -#else - #error "No alignr implementation available" +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), C))); #endif } @@ -109,6 +105,8 @@ const SIMD_4x32 sr[4] = { const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F); const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); +const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C); + inline SIMD_4x32 low_nibs(SIMD_4x32 x) { return lo_nibs_mask & x; @@ -257,32 +255,32 @@ void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, } -void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 10); } -void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 10); } -void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 12); } -void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 12); } -void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 14); } -void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 14); } @@ -335,7 +333,7 @@ SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) t = aes_schedule_transform(t, dsk[6], dsk[7]); output = shuffle(t ^ output, mc_forward0); - return shuffle(output, SIMD_4x32(sr[round_no % 4])); + return shuffle(output, sr[round_no % 4]); } SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) @@ -343,7 +341,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1); - k = shuffle(k, SIMD_4x32(sr[round_no % 4])); + k = shuffle(k, sr[round_no % 4]); k ^= SIMD_4x32::splat_u8(0x5B); return aes_schedule_transform(k, out_tr1, out_tr2); } @@ -383,7 +381,7 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32& rcon, SIMD_4x32 input1, SIMD_4x32 input2 { input2 ^= alignr<15>(SIMD_4x32(), rcon); rcon = alignr<15>(rcon, rcon); - input1 = shuffle32<3,3,3,3>(input1); + input1 = shuffle(input1, shuffle3333); input1 = alignr<1>(input1, input1); return aes_schedule_round(input1, input2); @@ -391,12 +389,16 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32& rcon, SIMD_4x32 input1, SIMD_4x32 input2 SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) { - return y ^ shuffle32<3,3,3,2>(x) ^ shuffle32<2,0,0,0>(y); + const SIMD_4x32 shuffle3332 = + SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C); + const SIMD_4x32 shuffle2000 = + SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908); + return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000); } } -void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t) +void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t) { m_EK.resize(11*4); m_DK.resize(11*4); @@ -424,7 +426,7 @@ void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t) aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]); } -void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t) +void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t) { m_EK.resize(13*4); m_DK.resize(13*4); @@ -474,7 +476,7 @@ void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t) } } -void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t) +void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t) { m_EK.resize(15*4); m_DK.resize(15*4); @@ -502,7 +504,7 @@ void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t) aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]); aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]); - key2 = aes_schedule_round(shuffle32<3,3,3,3>(key2), k_t); + key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t); aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]); aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]); diff --git a/src/lib/block/aes/aes_ssse3/info.txt b/src/lib/block/aes/aes_vperm/info.txt index 49d9a9214..5ff0c2aa2 100644 --- a/src/lib/block/aes/aes_ssse3/info.txt +++ b/src/lib/block/aes/aes_vperm/info.txt @@ -1,9 +1,9 @@ <defines> -AES_SSSE3 -> 20131128 +AES_VPERM -> 20190901 </defines> <isa> -ssse3 +neon </isa> <requires> |