author    Jack Lloyd <[email protected]>    2018-02-25 10:52:17 -0500
committer Jack Lloyd <[email protected]>    2018-02-25 11:50:36 -0500
commit    974899425d1da0a6c09f1fc85e6acbbf7f7d6c46 (patch)
tree      da9947a18ff193339e2c255e4b87511eb060bbfa
parent    7e6aea7a9bff0714756aff5c7470c5c55cb31e0c (diff)
Unroll ARMv8 AES instructions by 4 to allow pipelining
Runs as much as 50% faster for bulk operations. Improves GCM by 10%
-rw-r--r--  src/lib/block/aes/aes_armv8/aes_armv8.cpp  |  391
1 file changed, 307 insertions, 84 deletions
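
Note on the change: each AES round instruction depends on the result of the previous one, so processing a single block leaves the pipeline largely idle. Interleaving four independent blocks per round lets their vaese/vaesmc dependency chains issue back to back, which is where the bulk speedup comes from. Below is a minimal standalone sketch of that 4-way unroll pattern, not the Botan code itself; the names enc4_round and aes128_enc4 are illustrative, and it assumes an AArch64 compiler with the crypto extension enabled (e.g. -march=armv8-a+crypto).

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// One AES encryption round applied to four independent blocks
// (illustrative helper, not part of Botan)
static inline void enc4_round(uint8x16_t& B0, uint8x16_t& B1,
                              uint8x16_t& B2, uint8x16_t& B3,
                              uint8x16_t K)
   {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
   }

// AES-128 on exactly four blocks; rk holds the 11 expanded round keys
// (illustrative driver, not part of Botan)
void aes128_enc4(const uint8_t in[64], uint8_t out[64], const uint8_t rk[11][16])
   {
   uint8x16_t B0 = vld1q_u8(in);
   uint8x16_t B1 = vld1q_u8(in + 16);
   uint8x16_t B2 = vld1q_u8(in + 32);
   uint8x16_t B3 = vld1q_u8(in + 48);

   // Rounds 0..8: AddRoundKey+SubBytes+ShiftRows (vaeseq_u8), then MixColumns (vaesmcq_u8)
   for(size_t r = 0; r != 9; ++r)
      enc4_round(B0, B1, B2, B3, vld1q_u8(rk[r]));

   // Final round: no MixColumns; the last AddRoundKey is a plain XOR with rk[10]
   const uint8x16_t K9 = vld1q_u8(rk[9]);
   const uint8x16_t K10 = vld1q_u8(rk[10]);
   B0 = veorq_u8(vaeseq_u8(B0, K9), K10);
   B1 = veorq_u8(vaeseq_u8(B1, K9), K10);
   B2 = veorq_u8(vaeseq_u8(B2, K9), K10);
   B3 = veorq_u8(vaeseq_u8(B3, K9), K10);

   vst1q_u8(out, B0);
   vst1q_u8(out + 16, B1);
   vst1q_u8(out + 32, B2);
   vst1q_u8(out + 48, B3);
   }
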
diff --git a/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/src/lib/block/aes/aes_armv8/aes_armv8.cpp
index 417854bfb..8a332ceaf 100644
--- a/src/lib/block/aes/aes_armv8/aes_armv8.cpp
+++ b/src/lib/block/aes/aes_armv8/aes_armv8.cpp
@@ -2,6 +2,9 @@
* AES using ARMv8
* Contributed by Jeffrey Walton
*
+* Further changes
+* (C) 2017,2018 Jack Lloyd
+*
* Botan is released under the Simplified BSD License (see license.txt)
*/
@@ -11,6 +14,42 @@
namespace Botan {
+#define AES_ENC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = vaesmcq_u8(vaeseq_u8(B0, K)); \
+ B1 = vaesmcq_u8(vaeseq_u8(B1, K)); \
+ B2 = vaesmcq_u8(vaeseq_u8(B2, K)); \
+ B3 = vaesmcq_u8(vaeseq_u8(B3, K)); \
+ } while(0)
+
+#define AES_ENC_4_LAST_ROUNDS(K, K2) \
+ do \
+ { \
+ B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
+ B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
+ B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
+ B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
+ } while(0)
+
+#define AES_DEC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = vaesimcq_u8(vaesdq_u8(B0, K)); \
+ B1 = vaesimcq_u8(vaesdq_u8(B1, K)); \
+ B2 = vaesimcq_u8(vaesdq_u8(B2, K)); \
+ B3 = vaesimcq_u8(vaesdq_u8(B3, K)); \
+ } while(0)
+
+#define AES_DEC_4_LAST_ROUNDS(K, K2) \
+ do \
+ { \
+ B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
+ B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
+ B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
+ B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
+ } while(0)
+
/*
* AES-128 Encryption
*/
@@ -34,20 +73,48 @@ void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K9 = vld1q_u8(skey + 144);
const uint8x16_t K10 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_LAST_ROUNDS(K9, K10);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesmcq_u8(vaeseq_u8(data, K0));
- data = vaesmcq_u8(vaeseq_u8(data, K1));
- data = vaesmcq_u8(vaeseq_u8(data, K2));
- data = vaesmcq_u8(vaeseq_u8(data, K3));
- data = vaesmcq_u8(vaeseq_u8(data, K4));
- data = vaesmcq_u8(vaeseq_u8(data, K5));
- data = vaesmcq_u8(vaeseq_u8(data, K6));
- data = vaesmcq_u8(vaeseq_u8(data, K7));
- data = vaesmcq_u8(vaeseq_u8(data, K8));
- data = veorq_u8(vaeseq_u8(data, K9), K10);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = veorq_u8(vaeseq_u8(B, K9), K10);
+ vst1q_u8(out+16*i, B);
}
}
@@ -74,20 +141,48 @@ void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K9 = vld1q_u8(skey + 144);
const uint8x16_t K10 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_LAST_ROUNDS(K9, K10);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesimcq_u8(vaesdq_u8(data, K0));
- data = vaesimcq_u8(vaesdq_u8(data, K1));
- data = vaesimcq_u8(vaesdq_u8(data, K2));
- data = vaesimcq_u8(vaesdq_u8(data, K3));
- data = vaesimcq_u8(vaesdq_u8(data, K4));
- data = vaesimcq_u8(vaesdq_u8(data, K5));
- data = vaesimcq_u8(vaesdq_u8(data, K6));
- data = vaesimcq_u8(vaesdq_u8(data, K7));
- data = vaesimcq_u8(vaesdq_u8(data, K8));
- data = veorq_u8(vaesdq_u8(data, K9), K10);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = veorq_u8(vaesdq_u8(B, K9), K10);
+ vst1q_u8(out+16*i, B);
}
}
@@ -116,22 +211,52 @@ void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K11 = vld1q_u8(skey + 176);
const uint8x16_t K12 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_LAST_ROUNDS(K11, K12);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesmcq_u8(vaeseq_u8(data, K0));
- data = vaesmcq_u8(vaeseq_u8(data, K1));
- data = vaesmcq_u8(vaeseq_u8(data, K2));
- data = vaesmcq_u8(vaeseq_u8(data, K3));
- data = vaesmcq_u8(vaeseq_u8(data, K4));
- data = vaesmcq_u8(vaeseq_u8(data, K5));
- data = vaesmcq_u8(vaeseq_u8(data, K6));
- data = vaesmcq_u8(vaeseq_u8(data, K7));
- data = vaesmcq_u8(vaeseq_u8(data, K8));
- data = vaesmcq_u8(vaeseq_u8(data, K9));
- data = vaesmcq_u8(vaeseq_u8(data, K10));
- data = veorq_u8(vaeseq_u8(data, K11), K12);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = vaesmcq_u8(vaeseq_u8(B, K9));
+ B = vaesmcq_u8(vaeseq_u8(B, K10));
+ B = veorq_u8(vaeseq_u8(B, K11), K12);
+ vst1q_u8(out+16*i, B);
}
}
@@ -159,22 +284,52 @@ void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K11 = vld1q_u8(skey + 176);
const uint8x16_t K12 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_LAST_ROUNDS(K11, K12);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesimcq_u8(vaesdq_u8(data, K0));
- data = vaesimcq_u8(vaesdq_u8(data, K1));
- data = vaesimcq_u8(vaesdq_u8(data, K2));
- data = vaesimcq_u8(vaesdq_u8(data, K3));
- data = vaesimcq_u8(vaesdq_u8(data, K4));
- data = vaesimcq_u8(vaesdq_u8(data, K5));
- data = vaesimcq_u8(vaesdq_u8(data, K6));
- data = vaesimcq_u8(vaesdq_u8(data, K7));
- data = vaesimcq_u8(vaesdq_u8(data, K8));
- data = vaesimcq_u8(vaesdq_u8(data, K9));
- data = vaesimcq_u8(vaesdq_u8(data, K10));
- data = veorq_u8(vaesdq_u8(data, K11), K12);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = vaesimcq_u8(vaesdq_u8(B, K9));
+ B = vaesimcq_u8(vaesdq_u8(B, K10));
+ B = veorq_u8(vaesdq_u8(B, K11), K12);
+ vst1q_u8(out+16*i, B);
}
}
@@ -205,24 +360,56 @@ void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K13 = vld1q_u8(skey + 208);
const uint8x16_t K14 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_ROUNDS(K11);
+ AES_ENC_4_ROUNDS(K12);
+ AES_ENC_4_LAST_ROUNDS(K13, K14);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesmcq_u8(vaeseq_u8(data, K0));
- data = vaesmcq_u8(vaeseq_u8(data, K1));
- data = vaesmcq_u8(vaeseq_u8(data, K2));
- data = vaesmcq_u8(vaeseq_u8(data, K3));
- data = vaesmcq_u8(vaeseq_u8(data, K4));
- data = vaesmcq_u8(vaeseq_u8(data, K5));
- data = vaesmcq_u8(vaeseq_u8(data, K6));
- data = vaesmcq_u8(vaeseq_u8(data, K7));
- data = vaesmcq_u8(vaeseq_u8(data, K8));
- data = vaesmcq_u8(vaeseq_u8(data, K9));
- data = vaesmcq_u8(vaeseq_u8(data, K10));
- data = vaesmcq_u8(vaeseq_u8(data, K11));
- data = vaesmcq_u8(vaeseq_u8(data, K12));
- data = veorq_u8(vaeseq_u8(data, K13), K14);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = vaesmcq_u8(vaeseq_u8(B, K9));
+ B = vaesmcq_u8(vaeseq_u8(B, K10));
+ B = vaesmcq_u8(vaeseq_u8(B, K11));
+ B = vaesmcq_u8(vaeseq_u8(B, K12));
+ B = veorq_u8(vaeseq_u8(B, K13), K14);
+ vst1q_u8(out+16*i, B);
}
}
@@ -253,26 +440,62 @@ void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
const uint8x16_t K13 = vld1q_u8(skey + 208);
const uint8x16_t K14 = vld1q_u8(mkey);
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_ROUNDS(K11);
+ AES_DEC_4_ROUNDS(K12);
+ AES_DEC_4_LAST_ROUNDS(K13, K14);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
for(size_t i = 0; i != blocks; ++i)
{
- uint8x16_t data = vld1q_u8(in+16*i);
- data = vaesimcq_u8(vaesdq_u8(data, K0));
- data = vaesimcq_u8(vaesdq_u8(data, K1));
- data = vaesimcq_u8(vaesdq_u8(data, K2));
- data = vaesimcq_u8(vaesdq_u8(data, K3));
- data = vaesimcq_u8(vaesdq_u8(data, K4));
- data = vaesimcq_u8(vaesdq_u8(data, K5));
- data = vaesimcq_u8(vaesdq_u8(data, K6));
- data = vaesimcq_u8(vaesdq_u8(data, K7));
- data = vaesimcq_u8(vaesdq_u8(data, K8));
- data = vaesimcq_u8(vaesdq_u8(data, K9));
- data = vaesimcq_u8(vaesdq_u8(data, K10));
- data = vaesimcq_u8(vaesdq_u8(data, K11));
- data = vaesimcq_u8(vaesdq_u8(data, K12));
- data = veorq_u8(vaesdq_u8(data, K13), K14);
- vst1q_u8(out+16*i, data);
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = vaesimcq_u8(vaesdq_u8(B, K9));
+ B = vaesimcq_u8(vaesdq_u8(B, K10));
+ B = vaesimcq_u8(vaesdq_u8(B, K11));
+ B = vaesimcq_u8(vaesdq_u8(B, K12));
+ B = veorq_u8(vaesdq_u8(B, K13), K14);
+ vst1q_u8(out+16*i, B);
}
}
+#undef AES_ENC_4_ROUNDS
+#undef AES_ENC_4_LAST_ROUNDS
+#undef AES_DEC_4_ROUNDS
+#undef AES_DEC_4_LAST_ROUNDS
}
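
For reference, the unrolled path above is only reached when a caller hands armv8_encrypt_n/armv8_decrypt_n at least four blocks per call. A hedged usage sketch via Botan's public BlockCipher interface follows; it assumes a build with the aes_armv8 module enabled and a CPU whose AES extension is detected at runtime, and the key and buffer contents are arbitrary demo values.

#include <botan/block_cipher.h>
#include <cstdint>
#include <vector>

int main()
   {
   auto aes = Botan::BlockCipher::create_or_throw("AES-128");

   const std::vector<uint8_t> key(16, 0xAB); // demo key only
   aes->set_key(key);

   // Nine blocks in one call: two 4-block iterations plus one single-block tail
   std::vector<uint8_t> buf(16 * 9, 0);
   aes->encrypt_n(buf.data(), buf.data(), buf.size() / 16);

   return 0;
   }
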