From 7e2356173973a1c9c3040ab5f59141f98430d8e4 Mon Sep 17 00:00:00 2001
From: Jack Lloyd
Date: Wed, 4 Sep 2019 10:00:12 -0400
Subject: Avoid dynamic endian dispatch if we don't need it

---
 src/lib/utils/cpuid/cpuid.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'src')

diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h
index 256c6cc57..bb6af55a2 100644
--- a/src/lib/utils/cpuid/cpuid.h
+++ b/src/lib/utils/cpuid/cpuid.h
@@ -70,12 +70,24 @@ class BOTAN_PUBLIC_API(2,1) CPUID final
 
       static bool is_little_endian()
          {
+#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN)
+         return true;
+#elif defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN)
+         return false;
+#else
          return state().endian_status() == Endian_Status::Little;
+#endif
          }
 
       static bool is_big_endian()
          {
+#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN)
+         return true;
+#elif defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN)
+         return false;
+#else
         return state().endian_status() == Endian_Status::Big;
+#endif
         }
 
      enum CPUID_bits : uint64_t {
--
cgit v1.2.3

From 05e4367ef0ed92730bf46048479ebab8b42d9971 Mon Sep 17 00:00:00 2001
From: Jack Lloyd
Date: Wed, 4 Sep 2019 11:52:45 -0400
Subject: Fix gcc warnings in Altivec SIMD_4x32 code

---
 src/lib/utils/simd/simd_32.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h
index 7b6929c6d..7934e0c93 100644
--- a/src/lib/utils/simd/simd_32.h
+++ b/src/lib/utils/simd/simd_32.h
@@ -90,7 +90,8 @@ class SIMD_4x32 final
 #if defined(BOTAN_SIMD_USE_SSE2)
       m_simd = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
-      m_simd = (__vector unsigned int){B[0], B[1], B[2], B[3]};
+      __vector unsigned int val = { B[0], B[1], B[2], B[3]};
+      m_simd = val;
 #elif defined(BOTAN_SIMD_USE_NEON)
       m_simd = vld1q_u32(B);
 #else
@@ -109,7 +110,8 @@ class SIMD_4x32 final
 #if defined(BOTAN_SIMD_USE_SSE2)
       m_simd = _mm_set_epi32(B3, B2, B1, B0);
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
-      m_simd = (__vector unsigned int){B0, B1, B2, B3};
+      __vector unsigned int val = {B0, B1, B2, B3};
+      m_simd = val;
 #elif defined(BOTAN_SIMD_USE_NEON)
       // Better way to do this?
       const uint32_t B[4] = { B0, B1, B2, B3 };
@@ -303,7 +305,8 @@ class SIMD_4x32 final
 
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
 
       const unsigned int r = static_cast<unsigned int>(ROT);
-      return SIMD_4x32(vec_rl(m_simd, (__vector unsigned int){r, r, r, r}));
+      __vector unsigned int rot = {r, r, r, r};
+      return SIMD_4x32(vec_rl(m_simd, rot));
 
 #elif defined(BOTAN_SIMD_USE_NEON)
@@ -488,7 +491,8 @@ class SIMD_4x32 final
 
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
       const unsigned int s = static_cast<unsigned int>(SHIFT);
-      return SIMD_4x32(vec_sl(m_simd, (__vector unsigned int){s, s, s, s}));
+      const __vector unsigned int shifts = {s, s, s, s};
+      return SIMD_4x32(vec_sl(m_simd, shifts));
 #elif defined(BOTAN_SIMD_USE_NEON)
       return SIMD_4x32(vshlq_n_u32(m_simd, SHIFT));
 #else
@@ -506,7 +510,8 @@ class SIMD_4x32 final
 
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
       const unsigned int s = static_cast<unsigned int>(SHIFT);
-      return SIMD_4x32(vec_sr(m_simd, (__vector unsigned int){s, s, s, s}));
+      const __vector unsigned int shifts = {s, s, s, s};
+      return SIMD_4x32(vec_sr(m_simd, shifts));
 #elif defined(BOTAN_SIMD_USE_NEON)
       return SIMD_4x32(vshrq_n_u32(m_simd, SHIFT));
 #else
--
cgit v1.2.3

From ef6af5e062551f6b8b6423ddca5a51d86df2800e Mon Sep 17 00:00:00 2001
From: Jack Lloyd
Date: Wed, 4 Sep 2019 11:53:00 -0400
Subject: Unroll POWER8 AES instructions by 4x

Improves performance by 20-30% on POWER9
---
 src/lib/block/aes/aes_power8/aes_power8.cpp | 433 +++++++++++++++++++++-------
 1 file changed, 328 insertions(+), 105 deletions(-)

(limited to 'src')

diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp
index e90a51131..b9af23ef2 100644
--- a/src/lib/block/aes/aes_power8/aes_power8.cpp
+++ b/src/lib/block/aes/aes_power8/aes_power8.cpp
@@ -1,10 +1,10 @@
 /*
-* AES using POWER8 crypto extensions
+* AES using POWER8/POWER9 crypto extensions
 *
 * Contributed by Jeffrey Walton
 *
 * Further changes
-* (C) 2018 Jack Lloyd
+* (C) 2018,2019 Jack Lloyd
 *
 * Botan is released under the Simplified BSD License (see license.txt)
 */
@@ -18,30 +18,34 @@
 
 namespace Botan {
 
+typedef __vector unsigned long long Altivec64x2;
+typedef __vector unsigned int Altivec32x4;
+typedef __vector unsigned char Altivec8x16;
+
 namespace {
 
-__vector unsigned long long LoadKey(const uint32_t* src)
+inline Altivec64x2 load_key(const uint32_t key[])
    {
-   __vector unsigned int vec = vec_vsx_ld(0, src);
+   Altivec32x4 vec = vec_vsx_ld(0, key);
 
    if(CPUID::is_little_endian())
       {
-      const __vector unsigned char mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
-      const __vector unsigned char zero = {0};
-      return (__vector unsigned long long)vec_perm((__vector unsigned char)vec, zero, mask);
+      const Altivec8x16 mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+      const Altivec8x16 zero = {0};
+      return (Altivec64x2)vec_perm((Altivec8x16)vec, zero, mask);
       }
    else
      {
-      return (__vector unsigned long long)vec;
+      return (Altivec64x2)vec;
      }
   }
 
-__vector unsigned char Reverse8x16(const __vector unsigned char src)
+inline Altivec8x16 reverse_vec(Altivec8x16 src)
    {
    if(CPUID::is_little_endian())
       {
-      const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-      const __vector unsigned char zero = {0};
+      const Altivec8x16 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+      const Altivec8x16 zero = {0};
       return vec_perm(src, zero, mask);
       }
    else
@@ -50,36 +54,107 @@ __vector unsigned char Reverse8x16(const __vector unsigned char src)
       }
    }
 
-__vector unsigned long long LoadBlock(const uint8_t* src)
+inline Altivec64x2 load_block(const uint8_t src[])
    {
-   return (__vector unsigned long long)Reverse8x16(vec_vsx_ld(0, src));
+   return (Altivec64x2)reverse_vec(vec_vsx_ld(0, src));
    }
 
-void StoreBlock(const __vector unsigned long long src, uint8_t* dest)
+inline void store_block(Altivec64x2 src, uint8_t dest[])
    {
-   vec_vsx_st(Reverse8x16((__vector unsigned char)src), 0, dest);
+   vec_vsx_st(reverse_vec((Altivec8x16)src), 0, dest);
    }
 
+inline void store_blocks(Altivec64x2 B0, Altivec64x2 B1,
+                         Altivec64x2 B2, Altivec64x2 B3,
+                         uint8_t out[])
+   {
+   store_block(B0, out);
+   store_block(B1, out+16);
+   store_block(B2, out+16*2);
+   store_block(B3, out+16*3);
+   }
+
+#define AES_XOR_4(B0, B1, B2, B3, K) do {      \
+   B0 = vec_xor(B0, K);                        \
+   B1 = vec_xor(B1, K);                        \
+   B2 = vec_xor(B2, K);                        \
+   B3 = vec_xor(B3, K);                        \
+   } while(0)
+
+#define AES_ENCRYPT_4(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vcipher(B0, K);       \
+   B1 = __builtin_crypto_vcipher(B1, K);       \
+   B2 = __builtin_crypto_vcipher(B2, K);       \
+   B3 = __builtin_crypto_vcipher(B3, K);       \
+   } while(0)
+
+#define AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+   B0 = __builtin_crypto_vcipherlast(B0, K);       \
+   B1 = __builtin_crypto_vcipherlast(B1, K);       \
+   B2 = __builtin_crypto_vcipherlast(B2, K);       \
+   B3 = __builtin_crypto_vcipherlast(B3, K);       \
+   } while(0)
+
+#define AES_DECRYPT_4(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vncipher(B0, K);      \
+   B1 = __builtin_crypto_vncipher(B1, K);      \
+   B2 = __builtin_crypto_vncipher(B2, K);      \
+   B3 = __builtin_crypto_vncipher(B3, K);      \
+   } while(0)
+
+#define AES_DECRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+   B0 = __builtin_crypto_vncipherlast(B0, K);      \
+   B1 = __builtin_crypto_vncipherlast(B1, K);      \
+   B2 = __builtin_crypto_vncipherlast(B2, K);      \
+   B3 = __builtin_crypto_vncipherlast(B3, K);      \
+   } while(0)
+
 }
 
 BOTAN_FUNC_ISA("crypto")
 void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -93,7 +168,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K9);
       B = __builtin_crypto_vcipherlast(B, K10);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
@@ -103,21 +178,47 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[36]);
+   const Altivec64x2 K2 = load_key(&m_EK[32]);
+   const Altivec64x2 K3 = load_key(&m_EK[28]);
+   const Altivec64x2 K4 = load_key(&m_EK[24]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[16]);
+   const Altivec64x2 K7 = load_key(&m_EK[12]);
+   const Altivec64x2 K8 = load_key(&m_EK[8]);
+   const Altivec64x2 K9 = load_key(&m_EK[4]);
+   const Altivec64x2 K10 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
    for(size_t i = 0; i != blocks; ++i)
      {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -131,7 +232,7 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K9);
       B = __builtin_crypto_vncipherlast(B, K10);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
@@ -141,23 +242,51 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K12 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_key(&m_EK[40]);
+   const Altivec64x2 K11 = load_key(&m_EK[44]);
+   const Altivec64x2 K12 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
    for(size_t i = 0; i != blocks; ++i)
      {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -173,7 +302,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K11);
       B = __builtin_crypto_vcipherlast(B, K12);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
@@ -183,23 +312,51 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[44]);
+   const Altivec64x2 K2 = load_key(&m_EK[40]);
+   const Altivec64x2 K3 = load_key(&m_EK[36]);
+   const Altivec64x2 K4 = load_key(&m_EK[32]);
+   const Altivec64x2 K5 = load_key(&m_EK[28]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[20]);
+   const Altivec64x2 K8 = load_key(&m_EK[16]);
+   const Altivec64x2 K9 = load_key(&m_EK[12]);
+   const Altivec64x2 K10 = load_key(&m_EK[8]);
+   const Altivec64x2 K11 = load_key(&m_EK[4]);
+   const Altivec64x2 K12 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4(B0, B1, B2, B3, K10);
+      AES_DECRYPT_4(B0, B1, B2, B3, K11);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
   for(size_t i = 0; i != blocks; ++i)
     {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -215,7 +372,7 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K11);
       B = __builtin_crypto_vncipherlast(B, K12);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
@@ -225,25 +382,55 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[48]);
-   const __vector unsigned long long K13 = LoadKey(&m_EK[52]);
-   const __vector unsigned long long K14 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_key(&m_EK[40]);
+   const Altivec64x2 K11 = load_key(&m_EK[44]);
+   const Altivec64x2 K12 = load_key(&m_EK[48]);
+   const Altivec64x2 K13 = load_key(&m_EK[52]);
+   const Altivec64x2 K14 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K12);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K13);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
   for(size_t i = 0; i != blocks; ++i)
     {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -261,7 +448,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K13);
       B = __builtin_crypto_vcipherlast(B, K14);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
@@ -271,25 +458,55 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[52]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[48]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K13 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K14 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[52]);
+   const Altivec64x2 K2 = load_key(&m_EK[48]);
+   const Altivec64x2 K3 = load_key(&m_EK[44]);
+   const Altivec64x2 K4 = load_key(&m_EK[40]);
+   const Altivec64x2 K5 = load_key(&m_EK[36]);
+   const Altivec64x2 K6 = load_key(&m_EK[32]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[24]);
+   const Altivec64x2 K9 = load_key(&m_EK[20]);
+   const Altivec64x2 K10 = load_key(&m_EK[16]);
+   const Altivec64x2 K11 = load_key(&m_EK[12]);
+   const Altivec64x2 K12 = load_key(&m_EK[8]);
+   const Altivec64x2 K13 = load_key(&m_EK[4]);
+   const Altivec64x2 K14 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4(B0, B1, B2, B3, K10);
+      AES_DECRYPT_4(B0, B1, B2, B3, K11);
+      AES_DECRYPT_4(B0, B1, B2, B3, K12);
+      AES_DECRYPT_4(B0, B1, B2, B3, K13);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }
 
   for(size_t i = 0; i != blocks; ++i)
     {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);
 
       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -307,11 +524,17 @@ void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K13);
       B = __builtin_crypto_vncipherlast(B, K14);
 
-      StoreBlock(B, out);
+      store_block(B, out);
 
       out += 16;
       in += 16;
       }
    }
 
+#undef AES_XOR_4
+#undef AES_ENCRYPT_4
+#undef AES_ENCRYPT_4_LAST
+#undef AES_DECRYPT_4
+#undef AES_DECRYPT_4_LAST
+
 }
--
cgit v1.2.3
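
A note on the first commit (7e23561): the pattern is to answer a question at compile time whenever the build configuration already fixes the answer, and to consult runtime state only when it does not. CPUID::is_little_endian() previously always read the runtime-initialized state(); after the change, on targets where BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN or BOTAN_TARGET_CPU_IS_BIG_ENDIAN is defined, the function folds to a constant and callers' dead branches (such as the byte-reversal paths in load_key/reverse_vec above) can be eliminated entirely. Below is a minimal standalone sketch of the same pattern; the MYLIB_* macros and helper names are hypothetical stand-ins for Botan's build-time defines, not its actual API.

#include <cstdint>
#include <cstring>

inline bool runtime_is_little_endian()
   {
   // Fallback probe, only reached when the target byte order is unknown
   // at build time.
   const uint32_t x = 1;
   uint8_t b;
   std::memcpy(&b, &x, 1);
   return b == 1;
   }

inline bool is_little_endian()
   {
#if defined(MYLIB_TARGET_IS_LITTLE_ENDIAN)
   return true;    // constant-folded; no runtime dispatch emitted
#elif defined(MYLIB_TARGET_IS_BIG_ENDIAN)
   return false;
#else
   return runtime_is_little_endian();
#endif
   }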
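
A note on the second commit (05e4367): the fix is purely syntactic. A cast-style compound literal of vector type, e.g. (__vector unsigned int){r, r, r, r}, is a C construct that GCC's C++ front end accepts only as an extension and can warn about; binding the initializer to a named vector variable first expresses the same value without the compound literal. A skeleton of the transformation follows; it is POWER-only (requires Altivec support, e.g. GCC with -maltivec), and the exact diagnostic being silenced is an inference on my part, since the commit message does not quote it.

#include <altivec.h>

// Splat one scalar into all four lanes of a vector.
__vector unsigned int splat4(unsigned int r)
   {
   // Before (compound literal of vector type, a C-ism in C++):
   //   return (__vector unsigned int){r, r, r, r};

   // After (same value, bound to a named variable first):
   __vector unsigned int val = {r, r, r, r};
   return val;
   }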
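
A note on the third commit (ef6af5e): ECB-style bulk processing has no dependence between blocks, so four blocks can be kept in flight at once. The vcipher/vncipher instructions have multi-cycle latency but are pipelined, and interleaving four independent dependency chains hides that latency, which is presumably where the quoted 20-30% POWER9 improvement comes from. The generic part of the technique is the control-flow shape: a 4-wide main loop plus a one-at-a-time tail for the remaining 0-3 blocks. The sketch below uses a scalar placeholder round function and hypothetical names so it runs anywhere; the real kernel uses the POWER8 builtins shown in the patch.

#include <cstddef>
#include <cstdint>

// Scalar placeholder for one cipher round on a 16-byte block; the real
// code applies __builtin_crypto_vcipher to a 128-bit vector register.
inline void round_fn(uint8_t block[16], const uint8_t rk[16])
   {
   for(size_t i = 0; i != 16; ++i)
      block[i] ^= rk[i];
   }

// Apply `rounds` round keys to `blocks` 16-byte blocks, four at a time.
void process_blocks(const uint8_t rk[][16], size_t rounds,
                    uint8_t buf[], size_t blocks)
   {
   // Main loop: each round is applied to four independent blocks back to
   // back, giving the CPU four dependency chains to overlap.
   while(blocks >= 4)
      {
      for(size_t r = 0; r != rounds; ++r)
         {
         round_fn(buf + 16*0, rk[r]);
         round_fn(buf + 16*1, rk[r]);
         round_fn(buf + 16*2, rk[r]);
         round_fn(buf + 16*3, rk[r]);
         }
      buf += 4*16;
      blocks -= 4;
      }

   // Tail: the remaining 0-3 blocks, one at a time, as in the patch.
   for(size_t i = 0; i != blocks; ++i)
      {
      for(size_t r = 0; r != rounds; ++r)
         round_fn(buf, rk[r]);
      buf += 16;
      }
   }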