aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2019-09-04 11:53:00 -0400
committerJack Lloyd <[email protected]>2019-09-04 11:53:55 -0400
commitef6af5e062551f6b8b6423ddca5a51d86df2800e (patch)
tree3caa77dd8ae8ff7a18dc683252464080c43014f8
parent05e4367ef0ed92730bf46048479ebab8b42d9971 (diff)
Unroll POWER8 AES instructions by 4x
Improves performance by 20-30% on POWER9
-rw-r--r--src/lib/block/aes/aes_power8/aes_power8.cpp433
1 files changed, 328 insertions, 105 deletions
diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp
index e90a51131..b9af23ef2 100644
--- a/src/lib/block/aes/aes_power8/aes_power8.cpp
+++ b/src/lib/block/aes/aes_power8/aes_power8.cpp
@@ -1,10 +1,10 @@
/*
-* AES using POWER8 crypto extensions
+* AES using POWER8/POWER9 crypto extensions
*
* Contributed by Jeffrey Walton
*
* Further changes
-* (C) 2018 Jack Lloyd
+* (C) 2018,2019 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
@@ -18,30 +18,34 @@
namespace Botan {
+typedef __vector unsigned long long Altivec64x2;
+typedef __vector unsigned int Altivec32x4;
+typedef __vector unsigned char Altivec8x16;
+
namespace {
-__vector unsigned long long LoadKey(const uint32_t* src)
+inline Altivec64x2 load_key(const uint32_t key[])
{
- __vector unsigned int vec = vec_vsx_ld(0, src);
+ Altivec32x4 vec = vec_vsx_ld(0, key);
if(CPUID::is_little_endian())
{
- const __vector unsigned char mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
- const __vector unsigned char zero = {0};
- return (__vector unsigned long long)vec_perm((__vector unsigned char)vec, zero, mask);
+ const Altivec8x16 mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+ const Altivec8x16 zero = {0};
+ return (Altivec64x2)vec_perm((Altivec8x16)vec, zero, mask);
}
else
{
- return (__vector unsigned long long)vec;
+ return (Altivec64x2)vec;
}
}
-__vector unsigned char Reverse8x16(const __vector unsigned char src)
+inline Altivec8x16 reverse_vec(Altivec8x16 src)
{
if(CPUID::is_little_endian())
{
- const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
- const __vector unsigned char zero = {0};
+ const Altivec8x16 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+ const Altivec8x16 zero = {0};
return vec_perm(src, zero, mask);
}
else
@@ -50,36 +54,107 @@ __vector unsigned char Reverse8x16(const __vector unsigned char src)
}
}
-__vector unsigned long long LoadBlock(const uint8_t* src)
+inline Altivec64x2 load_block(const uint8_t src[])
{
- return (__vector unsigned long long)Reverse8x16(vec_vsx_ld(0, src));
+ return (Altivec64x2)reverse_vec(vec_vsx_ld(0, src));
}
-void StoreBlock(const __vector unsigned long long src, uint8_t* dest)
+inline void store_block(Altivec64x2 src, uint8_t dest[])
{
- vec_vsx_st(Reverse8x16((__vector unsigned char)src), 0, dest);
+ vec_vsx_st(reverse_vec((Altivec8x16)src), 0, dest);
}
+inline void store_blocks(Altivec64x2 B0, Altivec64x2 B1,
+ Altivec64x2 B2, Altivec64x2 B3,
+ uint8_t out[])
+ {
+ store_block(B0, out);
+ store_block(B1, out+16);
+ store_block(B2, out+16*2);
+ store_block(B3, out+16*3);
+ }
+
+#define AES_XOR_4(B0, B1, B2, B3, K) do { \
+ B0 = vec_xor(B0, K); \
+ B1 = vec_xor(B1, K); \
+ B2 = vec_xor(B2, K); \
+ B3 = vec_xor(B3, K); \
+ } while(0)
+
+#define AES_ENCRYPT_4(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vcipher(B0, K); \
+ B1 = __builtin_crypto_vcipher(B1, K); \
+ B2 = __builtin_crypto_vcipher(B2, K); \
+ B3 = __builtin_crypto_vcipher(B3, K); \
+ } while(0)
+
+#define AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vcipherlast(B0, K); \
+ B1 = __builtin_crypto_vcipherlast(B1, K); \
+ B2 = __builtin_crypto_vcipherlast(B2, K); \
+ B3 = __builtin_crypto_vcipherlast(B3, K); \
+ } while(0)
+
+#define AES_DECRYPT_4(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vncipher(B0, K); \
+ B1 = __builtin_crypto_vncipher(B1, K); \
+ B2 = __builtin_crypto_vncipher(B2, K); \
+ B3 = __builtin_crypto_vncipher(B3, K); \
+ } while(0)
+
+#define AES_DECRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vncipherlast(B0, K); \
+ B1 = __builtin_crypto_vncipherlast(B1, K); \
+ B2 = __builtin_crypto_vncipherlast(B2, K); \
+ B3 = __builtin_crypto_vncipherlast(B3, K); \
+ } while(0)
+
}
BOTAN_FUNC_ISA("crypto")
void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
- const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K10 = LoadBlock(m_ME.data());
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_block(m_ME.data());
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vcipher(B, K1);
@@ -93,7 +168,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vcipher(B, K9);
B = __builtin_crypto_vcipherlast(B, K10);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
@@ -103,21 +178,47 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
BOTAN_FUNC_ISA("crypto")
void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadBlock(m_ME.data());
- const __vector unsigned long long K1 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K10 = LoadKey(&m_EK[0]);
+ const Altivec64x2 K0 = load_block(m_ME.data());
+ const Altivec64x2 K1 = load_key(&m_EK[36]);
+ const Altivec64x2 K2 = load_key(&m_EK[32]);
+ const Altivec64x2 K3 = load_key(&m_EK[28]);
+ const Altivec64x2 K4 = load_key(&m_EK[24]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[16]);
+ const Altivec64x2 K7 = load_key(&m_EK[12]);
+ const Altivec64x2 K8 = load_key(&m_EK[8]);
+ const Altivec64x2 K9 = load_key(&m_EK[4]);
+ const Altivec64x2 K10 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vncipher(B, K1);
@@ -131,7 +232,7 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vncipher(B, K9);
B = __builtin_crypto_vncipherlast(B, K10);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
@@ -141,23 +242,51 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
BOTAN_FUNC_ISA("crypto")
void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
- const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
- const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
- const __vector unsigned long long K12 = LoadBlock(m_ME.data());
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_key(&m_EK[40]);
+ const Altivec64x2 K11 = load_key(&m_EK[44]);
+ const Altivec64x2 K12 = load_block(m_ME.data());
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vcipher(B, K1);
@@ -173,7 +302,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vcipher(B, K11);
B = __builtin_crypto_vcipherlast(B, K12);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
@@ -183,23 +312,51 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
BOTAN_FUNC_ISA("crypto")
void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadBlock(m_ME.data());
- const __vector unsigned long long K1 = LoadKey(&m_EK[44]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[40]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K10 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K11 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K12 = LoadKey(&m_EK[0]);
+ const Altivec64x2 K0 = load_block(m_ME.data());
+ const Altivec64x2 K1 = load_key(&m_EK[44]);
+ const Altivec64x2 K2 = load_key(&m_EK[40]);
+ const Altivec64x2 K3 = load_key(&m_EK[36]);
+ const Altivec64x2 K4 = load_key(&m_EK[32]);
+ const Altivec64x2 K5 = load_key(&m_EK[28]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[20]);
+ const Altivec64x2 K8 = load_key(&m_EK[16]);
+ const Altivec64x2 K9 = load_key(&m_EK[12]);
+ const Altivec64x2 K10 = load_key(&m_EK[8]);
+ const Altivec64x2 K11 = load_key(&m_EK[4]);
+ const Altivec64x2 K12 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4(B0, B1, B2, B3, K10);
+ AES_DECRYPT_4(B0, B1, B2, B3, K11);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vncipher(B, K1);
@@ -215,7 +372,7 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vncipher(B, K11);
B = __builtin_crypto_vncipherlast(B, K12);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
@@ -225,25 +382,55 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
BOTAN_FUNC_ISA("crypto")
void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadKey(&m_EK[0]);
- const __vector unsigned long long K1 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K10 = LoadKey(&m_EK[40]);
- const __vector unsigned long long K11 = LoadKey(&m_EK[44]);
- const __vector unsigned long long K12 = LoadKey(&m_EK[48]);
- const __vector unsigned long long K13 = LoadKey(&m_EK[52]);
- const __vector unsigned long long K14 = LoadBlock(m_ME.data());
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_key(&m_EK[40]);
+ const Altivec64x2 K11 = load_key(&m_EK[44]);
+ const Altivec64x2 K12 = load_key(&m_EK[48]);
+ const Altivec64x2 K13 = load_key(&m_EK[52]);
+ const Altivec64x2 K14 = load_block(m_ME.data());
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K12);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K13);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vcipher(B, K1);
@@ -261,7 +448,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vcipher(B, K13);
B = __builtin_crypto_vcipherlast(B, K14);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
@@ -271,25 +458,55 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
BOTAN_FUNC_ISA("crypto")
void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __vector unsigned long long K0 = LoadBlock(m_ME.data());
- const __vector unsigned long long K1 = LoadKey(&m_EK[52]);
- const __vector unsigned long long K2 = LoadKey(&m_EK[48]);
- const __vector unsigned long long K3 = LoadKey(&m_EK[44]);
- const __vector unsigned long long K4 = LoadKey(&m_EK[40]);
- const __vector unsigned long long K5 = LoadKey(&m_EK[36]);
- const __vector unsigned long long K6 = LoadKey(&m_EK[32]);
- const __vector unsigned long long K7 = LoadKey(&m_EK[28]);
- const __vector unsigned long long K8 = LoadKey(&m_EK[24]);
- const __vector unsigned long long K9 = LoadKey(&m_EK[20]);
- const __vector unsigned long long K10 = LoadKey(&m_EK[16]);
- const __vector unsigned long long K11 = LoadKey(&m_EK[12]);
- const __vector unsigned long long K12 = LoadKey(&m_EK[8]);
- const __vector unsigned long long K13 = LoadKey(&m_EK[4]);
- const __vector unsigned long long K14 = LoadKey(&m_EK[0]);
+ const Altivec64x2 K0 = load_block(m_ME.data());
+ const Altivec64x2 K1 = load_key(&m_EK[52]);
+ const Altivec64x2 K2 = load_key(&m_EK[48]);
+ const Altivec64x2 K3 = load_key(&m_EK[44]);
+ const Altivec64x2 K4 = load_key(&m_EK[40]);
+ const Altivec64x2 K5 = load_key(&m_EK[36]);
+ const Altivec64x2 K6 = load_key(&m_EK[32]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[24]);
+ const Altivec64x2 K9 = load_key(&m_EK[20]);
+ const Altivec64x2 K10 = load_key(&m_EK[16]);
+ const Altivec64x2 K11 = load_key(&m_EK[12]);
+ const Altivec64x2 K12 = load_key(&m_EK[8]);
+ const Altivec64x2 K13 = load_key(&m_EK[4]);
+ const Altivec64x2 K14 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4(B0, B1, B2, B3, K10);
+ AES_DECRYPT_4(B0, B1, B2, B3, K11);
+ AES_DECRYPT_4(B0, B1, B2, B3, K12);
+ AES_DECRYPT_4(B0, B1, B2, B3, K13);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
for(size_t i = 0; i != blocks; ++i)
{
- __vector unsigned long long B = LoadBlock(in);
+ Altivec64x2 B = load_block(in);
B = vec_xor(B, K0);
B = __builtin_crypto_vncipher(B, K1);
@@ -307,11 +524,17 @@ void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
B = __builtin_crypto_vncipher(B, K13);
B = __builtin_crypto_vncipherlast(B, K14);
- StoreBlock(B, out);
+ store_block(B, out);
out += 16;
in += 16;
}
}
+#undef AES_XOR_4
+#undef AES_ENCRYPT_4
+#undef AES_ENCRYPT_4_LAST
+#undef AES_DECRYPT_4
+#undef AES_DECRYPT_4_LAST
+
}