author    | Jack Lloyd <[email protected]> | 2019-09-04 11:53:00 -0400
committer | Jack Lloyd <[email protected]> | 2019-09-04 11:53:55 -0400
commit    | ef6af5e062551f6b8b6423ddca5a51d86df2800e (patch)
tree      | 3caa77dd8ae8ff7a18dc683252464080c43014f8
parent    | 05e4367ef0ed92730bf46048479ebab8b42d9971 (diff)
Unroll POWER8 AES instructions by 4x
Improves performance by 20-30% on POWER9
-rw-r--r-- | src/lib/block/aes/aes_power8/aes_power8.cpp | 433 |
1 file changed, 328 insertions(+), 105 deletions(-)
diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp
index e90a51131..b9af23ef2 100644
--- a/src/lib/block/aes/aes_power8/aes_power8.cpp
+++ b/src/lib/block/aes/aes_power8/aes_power8.cpp
@@ -1,10 +1,10 @@
 /*
-* AES using POWER8 crypto extensions
+* AES using POWER8/POWER9 crypto extensions
 *
 * Contributed by Jeffrey Walton
 *
 * Further changes
-* (C) 2018 Jack Lloyd
+* (C) 2018,2019 Jack Lloyd
 *
 * Botan is released under the Simplified BSD License (see license.txt)
 */
@@ -18,30 +18,34 @@

 namespace Botan {

+typedef __vector unsigned long long Altivec64x2;
+typedef __vector unsigned int Altivec32x4;
+typedef __vector unsigned char Altivec8x16;
+
 namespace {

-__vector unsigned long long LoadKey(const uint32_t* src)
+inline Altivec64x2 load_key(const uint32_t key[])
    {
-   __vector unsigned int vec = vec_vsx_ld(0, src);
+   Altivec32x4 vec = vec_vsx_ld(0, key);

    if(CPUID::is_little_endian())
       {
-      const __vector unsigned char mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
-      const __vector unsigned char zero = {0};
-      return (__vector unsigned long long)vec_perm((__vector unsigned char)vec, zero, mask);
+      const Altivec8x16 mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+      const Altivec8x16 zero = {0};
+      return (Altivec64x2)vec_perm((Altivec8x16)vec, zero, mask);
       }
    else
       {
-      return (__vector unsigned long long)vec;
+      return (Altivec64x2)vec;
       }
    }

-__vector unsigned char Reverse8x16(const __vector unsigned char src)
+inline Altivec8x16 reverse_vec(Altivec8x16 src)
    {
    if(CPUID::is_little_endian())
       {
-      const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-      const __vector unsigned char zero = {0};
+      const Altivec8x16 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+      const Altivec8x16 zero = {0};
       return vec_perm(src, zero, mask);
       }
    else
@@ -50,36 +54,107 @@ __vector unsigned char Reverse8x16(const __vector unsigned char src)
       }
    }

-__vector unsigned long long LoadBlock(const uint8_t* src)
+inline Altivec64x2 load_block(const uint8_t src[])
    {
-   return (__vector unsigned long long)Reverse8x16(vec_vsx_ld(0, src));
+   return (Altivec64x2)reverse_vec(vec_vsx_ld(0, src));
    }

-void StoreBlock(const __vector unsigned long long src, uint8_t* dest)
+inline void store_block(Altivec64x2 src, uint8_t dest[])
    {
-   vec_vsx_st(Reverse8x16((__vector unsigned char)src), 0, dest);
+   vec_vsx_st(reverse_vec((Altivec8x16)src), 0, dest);
    }

+inline void store_blocks(Altivec64x2 B0, Altivec64x2 B1,
+                         Altivec64x2 B2, Altivec64x2 B3,
+                         uint8_t out[])
+   {
+   store_block(B0, out);
+   store_block(B1, out+16);
+   store_block(B2, out+16*2);
+   store_block(B3, out+16*3);
+   }
+
+#define AES_XOR_4(B0, B1, B2, B3, K) do {      \
+   B0 = vec_xor(B0, K);                        \
+   B1 = vec_xor(B1, K);                        \
+   B2 = vec_xor(B2, K);                        \
+   B3 = vec_xor(B3, K);                        \
+   } while(0)
+
+#define AES_ENCRYPT_4(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vcipher(B0, K);       \
+   B1 = __builtin_crypto_vcipher(B1, K);       \
+   B2 = __builtin_crypto_vcipher(B2, K);       \
+   B3 = __builtin_crypto_vcipher(B3, K);       \
+   } while(0)
+
+#define AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vcipherlast(B0, K);        \
+   B1 = __builtin_crypto_vcipherlast(B1, K);        \
+   B2 = __builtin_crypto_vcipherlast(B2, K);        \
+   B3 = __builtin_crypto_vcipherlast(B3, K);        \
+   } while(0)
+
+#define AES_DECRYPT_4(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vncipher(B0, K);      \
+   B1 = __builtin_crypto_vncipher(B1, K);      \
+   B2 = __builtin_crypto_vncipher(B2, K);      \
+   B3 = __builtin_crypto_vncipher(B3, K);      \
+   } while(0)
+
+#define AES_DECRYPT_4_LAST(B0, B1, B2, B3, K) do {  \
+   B0 = __builtin_crypto_vncipherlast(B0, K);       \
+   B1 = __builtin_crypto_vncipherlast(B1, K);       \
+   B2 = __builtin_crypto_vncipherlast(B2, K);       \
+   B3 = __builtin_crypto_vncipherlast(B3, K);       \
+   } while(0)
+
 }

 BOTAN_FUNC_ISA("crypto")
 void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -93,7 +168,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K9);
       B = __builtin_crypto_vcipherlast(B, K10);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
@@ -103,21 +178,47 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[36]);
+   const Altivec64x2 K2 = load_key(&m_EK[32]);
+   const Altivec64x2 K3 = load_key(&m_EK[28]);
+   const Altivec64x2 K4 = load_key(&m_EK[24]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[16]);
+   const Altivec64x2 K7 = load_key(&m_EK[12]);
+   const Altivec64x2 K8 = load_key(&m_EK[8]);
+   const Altivec64x2 K9 = load_key(&m_EK[4]);
+   const Altivec64x2 K10 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -131,7 +232,7 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K9);
       B = __builtin_crypto_vncipherlast(B, K10);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
@@ -141,23 +242,51 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K12 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_key(&m_EK[40]);
+   const Altivec64x2 K11 = load_key(&m_EK[44]);
+   const Altivec64x2 K12 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -173,7 +302,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K11);
       B = __builtin_crypto_vcipherlast(B, K12);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
@@ -183,23 +312,51 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[44]);
+   const Altivec64x2 K2 = load_key(&m_EK[40]);
+   const Altivec64x2 K3 = load_key(&m_EK[36]);
+   const Altivec64x2 K4 = load_key(&m_EK[32]);
+   const Altivec64x2 K5 = load_key(&m_EK[28]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[20]);
+   const Altivec64x2 K8 = load_key(&m_EK[16]);
+   const Altivec64x2 K9 = load_key(&m_EK[12]);
+   const Altivec64x2 K10 = load_key(&m_EK[8]);
+   const Altivec64x2 K11 = load_key(&m_EK[4]);
+   const Altivec64x2 K12 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4(B0, B1, B2, B3, K10);
+      AES_DECRYPT_4(B0, B1, B2, B3, K11);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -215,7 +372,7 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K11);
       B = __builtin_crypto_vncipherlast(B, K12);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
@@ -225,25 +382,55 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
-   const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[48]);
-   const __vector unsigned long long K13 = LoadKey(&m_EK[52]);
-   const __vector unsigned long long K14 = LoadBlock(m_ME.data());
+   const Altivec64x2 K0 = load_key(&m_EK[0]);
+   const Altivec64x2 K1 = load_key(&m_EK[4]);
+   const Altivec64x2 K2 = load_key(&m_EK[8]);
+   const Altivec64x2 K3 = load_key(&m_EK[12]);
+   const Altivec64x2 K4 = load_key(&m_EK[16]);
+   const Altivec64x2 K5 = load_key(&m_EK[20]);
+   const Altivec64x2 K6 = load_key(&m_EK[24]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[32]);
+   const Altivec64x2 K9 = load_key(&m_EK[36]);
+   const Altivec64x2 K10 = load_key(&m_EK[40]);
+   const Altivec64x2 K11 = load_key(&m_EK[44]);
+   const Altivec64x2 K12 = load_key(&m_EK[48]);
+   const Altivec64x2 K13 = load_key(&m_EK[52]);
+   const Altivec64x2 K14 = load_block(m_ME.data());
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K12);
+      AES_ENCRYPT_4(B0, B1, B2, B3, K13);
+      AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vcipher(B, K1);
@@ -261,7 +448,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vcipher(B, K13);
       B = __builtin_crypto_vcipherlast(B, K14);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
@@ -271,25 +458,55 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
 BOTAN_FUNC_ISA("crypto")
 void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   const __vector unsigned long long K0 = LoadBlock(m_ME.data());
-   const __vector unsigned long long K1 = LoadKey(&m_EK[52]);
-   const __vector unsigned long long K2 = LoadKey(&m_EK[48]);
-   const __vector unsigned long long K3 = LoadKey(&m_EK[44]);
-   const __vector unsigned long long K4 = LoadKey(&m_EK[40]);
-   const __vector unsigned long long K5 = LoadKey(&m_EK[36]);
-   const __vector unsigned long long K6 = LoadKey(&m_EK[32]);
-   const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
-   const __vector unsigned long long K8 = LoadKey(&m_EK[24]);
-   const __vector unsigned long long K9 = LoadKey(&m_EK[20]);
-   const __vector unsigned long long K10 = LoadKey(&m_EK[16]);
-   const __vector unsigned long long K11 = LoadKey(&m_EK[12]);
-   const __vector unsigned long long K12 = LoadKey(&m_EK[8]);
-   const __vector unsigned long long K13 = LoadKey(&m_EK[4]);
-   const __vector unsigned long long K14 = LoadKey(&m_EK[0]);
+   const Altivec64x2 K0 = load_block(m_ME.data());
+   const Altivec64x2 K1 = load_key(&m_EK[52]);
+   const Altivec64x2 K2 = load_key(&m_EK[48]);
+   const Altivec64x2 K3 = load_key(&m_EK[44]);
+   const Altivec64x2 K4 = load_key(&m_EK[40]);
+   const Altivec64x2 K5 = load_key(&m_EK[36]);
+   const Altivec64x2 K6 = load_key(&m_EK[32]);
+   const Altivec64x2 K7 = load_key(&m_EK[28]);
+   const Altivec64x2 K8 = load_key(&m_EK[24]);
+   const Altivec64x2 K9 = load_key(&m_EK[20]);
+   const Altivec64x2 K10 = load_key(&m_EK[16]);
+   const Altivec64x2 K11 = load_key(&m_EK[12]);
+   const Altivec64x2 K12 = load_key(&m_EK[8]);
+   const Altivec64x2 K13 = load_key(&m_EK[4]);
+   const Altivec64x2 K14 = load_key(&m_EK[0]);
+
+   while(blocks >= 4)
+      {
+      Altivec64x2 B0 = load_block(in);
+      Altivec64x2 B1 = load_block(in+16);
+      Altivec64x2 B2 = load_block(in+16*2);
+      Altivec64x2 B3 = load_block(in+16*3);
+
+      AES_XOR_4(B0, B1, B2, B3, K0);
+      AES_DECRYPT_4(B0, B1, B2, B3, K1);
+      AES_DECRYPT_4(B0, B1, B2, B3, K2);
+      AES_DECRYPT_4(B0, B1, B2, B3, K3);
+      AES_DECRYPT_4(B0, B1, B2, B3, K4);
+      AES_DECRYPT_4(B0, B1, B2, B3, K5);
+      AES_DECRYPT_4(B0, B1, B2, B3, K6);
+      AES_DECRYPT_4(B0, B1, B2, B3, K7);
+      AES_DECRYPT_4(B0, B1, B2, B3, K8);
+      AES_DECRYPT_4(B0, B1, B2, B3, K9);
+      AES_DECRYPT_4(B0, B1, B2, B3, K10);
+      AES_DECRYPT_4(B0, B1, B2, B3, K11);
+      AES_DECRYPT_4(B0, B1, B2, B3, K12);
+      AES_DECRYPT_4(B0, B1, B2, B3, K13);
+      AES_DECRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+      store_blocks(B0, B1, B2, B3, out);
+
+      out += 4*16;
+      in += 4*16;
+      blocks -= 4;
+      }

    for(size_t i = 0; i != blocks; ++i)
       {
-      __vector unsigned long long B = LoadBlock(in);
+      Altivec64x2 B = load_block(in);

       B = vec_xor(B, K0);
       B = __builtin_crypto_vncipher(B, K1);
@@ -307,11 +524,17 @@ void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
       B = __builtin_crypto_vncipher(B, K13);
       B = __builtin_crypto_vncipherlast(B, K14);

-      StoreBlock(B, out);
+      store_block(B, out);

       out += 16;
       in += 16;
       }
    }

+#undef AES_XOR_4
+#undef AES_ENCRYPT_4
+#undef AES_ENCRYPT_4_LAST
+#undef AES_DECRYPT_4
+#undef AES_DECRYPT_4_LAST
+
 }
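Background on the technique: a single AES round instruction such as vcipher has multi-cycle latency, but a POWER8/POWER9 core can have several of them in flight at once. Processing one block at a time creates a single dependency chain, so each round stalls on the previous one; keeping four independent blocks in flight per round (which ECB-style block processing permits, since blocks do not depend on each other) lets the out-of-order core overlap those latencies. The sketch below is illustrative only and not part of the commit: it mirrors the patch's shape (a four-block main loop plus the original single-block tail) using an invented toy_round stand-in in portable C++, since __builtin_crypto_vcipher compiles only for POWER8+ targets.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for one AES round on one block. In the patch this is the
// vcipher instruction (__builtin_crypto_vcipher) acting on a 16-byte
// vector; a 64-bit xor-and-rotate keeps this sketch portable.
static inline uint64_t toy_round(uint64_t b, uint64_t k)
   {
   b ^= k;
   return (b << 13) | (b >> 51);
   }

// Sequential form: one dependency chain, each round waits on the last.
void enc_1x(uint64_t blocks[], size_t n, const uint64_t keys[10])
   {
   for(size_t i = 0; i != n; ++i)
      for(size_t r = 0; r != 10; ++r)
         blocks[i] = toy_round(blocks[i], keys[r]);
   }

// 4x-unrolled form, mirroring the patch: four independent chains per
// round give the superscalar core work to overlap the round latency.
void enc_4x(uint64_t blocks[], size_t n, const uint64_t keys[10])
   {
   size_t i = 0;
   for(; n - i >= 4; i += 4)
      {
      uint64_t b0 = blocks[i+0], b1 = blocks[i+1];
      uint64_t b2 = blocks[i+2], b3 = blocks[i+3];
      for(size_t r = 0; r != 10; ++r)
         {
         b0 = toy_round(b0, keys[r]); // the four updates below are
         b1 = toy_round(b1, keys[r]); // mutually independent, like the
         b2 = toy_round(b2, keys[r]); // AES_ENCRYPT_4 macro bodies
         b3 = toy_round(b3, keys[r]);
         }
      blocks[i+0] = b0; blocks[i+1] = b1;
      blocks[i+2] = b2; blocks[i+3] = b3;
      }
   for(; i != n; ++i) // tail: same as the patch's single-block loop
      for(size_t r = 0; r != 10; ++r)
         blocks[i] = toy_round(blocks[i], keys[r]);
   }

int main()
   {
   const uint64_t keys[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
   uint64_t a[7], b[7];
   for(size_t i = 0; i != 7; ++i)
      a[i] = b[i] = 0x0123456789ABCDEFULL + i;
   enc_1x(a, 7, keys);
   enc_4x(b, 7, keys); // 4-block loop runs once, tail handles 3 blocks
   for(size_t i = 0; i != 7; ++i)
      if(a[i] != b[i])
         { std::printf("mismatch at %zu\n", i); return 1; }
   std::printf("both forms agree\n");
   return 0;
   }

A width of four is presumably a register-pressure compromise: four block vectors plus up to fifteen round-key vectors still fit comfortably in the 32 VSX registers, while wider unrolling would start spilling.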