author     Jack Lloyd <[email protected]>   2018-02-25 10:52:17 -0500
committer  Jack Lloyd <[email protected]>   2018-02-25 11:50:36 -0500
commit     974899425d1da0a6c09f1fc85e6acbbf7f7d6c46 (patch)
tree       da9947a18ff193339e2c255e4b87511eb060bbfa
parent     7e6aea7a9bff0714756aff5c7470c5c55cb31e0c (diff)
Unroll ARMv8 AES instructions by 4 to allow pipelining
Runs as much as 50% faster for bulk operations. Improves GCM by 10%.
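
The win comes from instruction-level parallelism: the AESE/AESMC (and AESD/AESIMC) results for one block are not needed by the other three, so issuing the rounds of four blocks back to back keeps the AES unit's pipeline full instead of stalling on each block's round-to-round dependency. As a condensed illustration of the pattern this patch applies (not the patched Botan code itself; the helper name aes128_enc4_blocks and the caller-provided expanded key schedule K[0..10] are assumptions for the sketch), a 4-way unrolled AES-128 encryption step with the ARMv8 intrinsics looks roughly like this:

// Condensed sketch only: assumes a caller-provided AES-128 key schedule
// K[0..10] (11 round keys) and exactly four input blocks.  Compile for
// AArch64 with the crypto extension enabled, e.g. -march=armv8-a+crypto.
#include <arm_neon.h>
#include <stdint.h>

static void aes128_enc4_blocks(const uint8_t in[64], uint8_t out[64],
                               const uint8x16_t K[11])
   {
   uint8x16_t B0 = vld1q_u8(in);
   uint8x16_t B1 = vld1q_u8(in + 16);
   uint8x16_t B2 = vld1q_u8(in + 32);
   uint8x16_t B3 = vld1q_u8(in + 48);

   // First nine rounds: AESE (AddRoundKey + SubBytes + ShiftRows) followed
   // by AESMC (MixColumns).  The four blocks are independent, so their
   // rounds can overlap in the pipeline instead of waiting on each other.
   for(int r = 0; r != 9; ++r)
      {
      B0 = vaesmcq_u8(vaeseq_u8(B0, K[r]));
      B1 = vaesmcq_u8(vaeseq_u8(B1, K[r]));
      B2 = vaesmcq_u8(vaeseq_u8(B2, K[r]));
      B3 = vaesmcq_u8(vaeseq_u8(B3, K[r]));
      }

   // Last round: AESE with K[9], then XOR of the final round key K[10]
   B0 = veorq_u8(vaeseq_u8(B0, K[9]), K[10]);
   B1 = veorq_u8(vaeseq_u8(B1, K[9]), K[10]);
   B2 = veorq_u8(vaeseq_u8(B2, K[9]), K[10]);
   B3 = veorq_u8(vaeseq_u8(B3, K[9]), K[10]);

   vst1q_u8(out,      B0);
   vst1q_u8(out + 16, B1);
   vst1q_u8(out + 32, B2);
   vst1q_u8(out + 48, B3);
   }

The patch wraps exactly this per-round pattern in the AES_ENC_4_ROUNDS / AES_ENC_4_LAST_ROUNDS macros (and their AES_DEC_* counterparts) and falls back to the original one-block loop for any remaining blocks.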
-rw-r--r--  src/lib/block/aes/aes_armv8/aes_armv8.cpp  391
1 file changed, 307 insertions(+), 84 deletions(-)
diff --git a/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/src/lib/block/aes/aes_armv8/aes_armv8.cpp
index 417854bfb..8a332ceaf 100644
--- a/src/lib/block/aes/aes_armv8/aes_armv8.cpp
+++ b/src/lib/block/aes/aes_armv8/aes_armv8.cpp
@@ -2,6 +2,9 @@
 * AES using ARMv8
 * Contributed by Jeffrey Walton
 *
+* Further changes
+* (C) 2017,2018 Jack Lloyd
+*
 * Botan is released under the Simplified BSD License (see license.txt)
 */
 
@@ -11,6 +14,42 @@
 
 namespace Botan {
 
+#define AES_ENC_4_ROUNDS(K)                \
+   do                                      \
+      {                                    \
+      B0 = vaesmcq_u8(vaeseq_u8(B0, K));   \
+      B1 = vaesmcq_u8(vaeseq_u8(B1, K));   \
+      B2 = vaesmcq_u8(vaeseq_u8(B2, K));   \
+      B3 = vaesmcq_u8(vaeseq_u8(B3, K));   \
+      } while(0)
+
+#define AES_ENC_4_LAST_ROUNDS(K, K2)       \
+   do                                      \
+      {                                    \
+      B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
+      B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
+      B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
+      B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
+      } while(0)
+
+#define AES_DEC_4_ROUNDS(K)                \
+   do                                      \
+      {                                    \
+      B0 = vaesimcq_u8(vaesdq_u8(B0, K));  \
+      B1 = vaesimcq_u8(vaesdq_u8(B1, K));  \
+      B2 = vaesimcq_u8(vaesdq_u8(B2, K));  \
+      B3 = vaesimcq_u8(vaesdq_u8(B3, K));  \
+      } while(0)
+
+#define AES_DEC_4_LAST_ROUNDS(K, K2)       \
+   do                                      \
+      {                                    \
+      B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
+      B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
+      B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
+      B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
+      } while(0)
+
 /*
 * AES-128 Encryption
 */
@@ -34,20 +73,48 @@ void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K9 = vld1q_u8(skey + 144);
    const uint8x16_t K10 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_ENC_4_ROUNDS(K0);
+      AES_ENC_4_ROUNDS(K1);
+      AES_ENC_4_ROUNDS(K2);
+      AES_ENC_4_ROUNDS(K3);
+      AES_ENC_4_ROUNDS(K4);
+      AES_ENC_4_ROUNDS(K5);
+      AES_ENC_4_ROUNDS(K6);
+      AES_ENC_4_ROUNDS(K7);
+      AES_ENC_4_ROUNDS(K8);
+      AES_ENC_4_LAST_ROUNDS(K9, K10);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
       {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesmcq_u8(vaeseq_u8(data, K0));
-      data = vaesmcq_u8(vaeseq_u8(data, K1));
-      data = vaesmcq_u8(vaeseq_u8(data, K2));
-      data = vaesmcq_u8(vaeseq_u8(data, K3));
-      data = vaesmcq_u8(vaeseq_u8(data, K4));
-      data = vaesmcq_u8(vaeseq_u8(data, K5));
-      data = vaesmcq_u8(vaeseq_u8(data, K6));
-      data = vaesmcq_u8(vaeseq_u8(data, K7));
-      data = vaesmcq_u8(vaeseq_u8(data, K8));
-      data = veorq_u8(vaeseq_u8(data, K9), K10);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesmcq_u8(vaeseq_u8(B, K0));
+      B = vaesmcq_u8(vaeseq_u8(B, K1));
+      B = vaesmcq_u8(vaeseq_u8(B, K2));
+      B = vaesmcq_u8(vaeseq_u8(B, K3));
+      B = vaesmcq_u8(vaeseq_u8(B, K4));
+      B = vaesmcq_u8(vaeseq_u8(B, K5));
+      B = vaesmcq_u8(vaeseq_u8(B, K6));
+      B = vaesmcq_u8(vaeseq_u8(B, K7));
+      B = vaesmcq_u8(vaeseq_u8(B, K8));
+      B = veorq_u8(vaeseq_u8(B, K9), K10);
+      vst1q_u8(out+16*i, B);
       }
    }
 
@@ -74,20 +141,48 @@ void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K9 = vld1q_u8(skey + 144);
    const uint8x16_t K10 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_DEC_4_ROUNDS(K0);
+      AES_DEC_4_ROUNDS(K1);
+      AES_DEC_4_ROUNDS(K2);
+      AES_DEC_4_ROUNDS(K3);
+      AES_DEC_4_ROUNDS(K4);
+      AES_DEC_4_ROUNDS(K5);
+      AES_DEC_4_ROUNDS(K6);
+      AES_DEC_4_ROUNDS(K7);
+      AES_DEC_4_ROUNDS(K8);
+      AES_DEC_4_LAST_ROUNDS(K9, K10);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
      {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesimcq_u8(vaesdq_u8(data, K0));
-      data = vaesimcq_u8(vaesdq_u8(data, K1));
-      data = vaesimcq_u8(vaesdq_u8(data, K2));
-      data = vaesimcq_u8(vaesdq_u8(data, K3));
-      data = vaesimcq_u8(vaesdq_u8(data, K4));
-      data = vaesimcq_u8(vaesdq_u8(data, K5));
-      data = vaesimcq_u8(vaesdq_u8(data, K6));
-      data = vaesimcq_u8(vaesdq_u8(data, K7));
-      data = vaesimcq_u8(vaesdq_u8(data, K8));
-      data = veorq_u8(vaesdq_u8(data, K9), K10);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesimcq_u8(vaesdq_u8(B, K0));
+      B = vaesimcq_u8(vaesdq_u8(B, K1));
+      B = vaesimcq_u8(vaesdq_u8(B, K2));
+      B = vaesimcq_u8(vaesdq_u8(B, K3));
+      B = vaesimcq_u8(vaesdq_u8(B, K4));
+      B = vaesimcq_u8(vaesdq_u8(B, K5));
+      B = vaesimcq_u8(vaesdq_u8(B, K6));
+      B = vaesimcq_u8(vaesdq_u8(B, K7));
+      B = vaesimcq_u8(vaesdq_u8(B, K8));
+      B = veorq_u8(vaesdq_u8(B, K9), K10);
+      vst1q_u8(out+16*i, B);
       }
    }
 
@@ -116,22 +211,52 @@ void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K11 = vld1q_u8(skey + 176);
    const uint8x16_t K12 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_ENC_4_ROUNDS(K0);
+      AES_ENC_4_ROUNDS(K1);
+      AES_ENC_4_ROUNDS(K2);
+      AES_ENC_4_ROUNDS(K3);
+      AES_ENC_4_ROUNDS(K4);
+      AES_ENC_4_ROUNDS(K5);
+      AES_ENC_4_ROUNDS(K6);
+      AES_ENC_4_ROUNDS(K7);
+      AES_ENC_4_ROUNDS(K8);
+      AES_ENC_4_ROUNDS(K9);
+      AES_ENC_4_ROUNDS(K10);
+      AES_ENC_4_LAST_ROUNDS(K11, K12);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
      {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesmcq_u8(vaeseq_u8(data, K0));
-      data = vaesmcq_u8(vaeseq_u8(data, K1));
-      data = vaesmcq_u8(vaeseq_u8(data, K2));
-      data = vaesmcq_u8(vaeseq_u8(data, K3));
-      data = vaesmcq_u8(vaeseq_u8(data, K4));
-      data = vaesmcq_u8(vaeseq_u8(data, K5));
-      data = vaesmcq_u8(vaeseq_u8(data, K6));
-      data = vaesmcq_u8(vaeseq_u8(data, K7));
-      data = vaesmcq_u8(vaeseq_u8(data, K8));
-      data = vaesmcq_u8(vaeseq_u8(data, K9));
-      data = vaesmcq_u8(vaeseq_u8(data, K10));
-      data = veorq_u8(vaeseq_u8(data, K11), K12);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesmcq_u8(vaeseq_u8(B, K0));
+      B = vaesmcq_u8(vaeseq_u8(B, K1));
+      B = vaesmcq_u8(vaeseq_u8(B, K2));
+      B = vaesmcq_u8(vaeseq_u8(B, K3));
+      B = vaesmcq_u8(vaeseq_u8(B, K4));
+      B = vaesmcq_u8(vaeseq_u8(B, K5));
+      B = vaesmcq_u8(vaeseq_u8(B, K6));
+      B = vaesmcq_u8(vaeseq_u8(B, K7));
+      B = vaesmcq_u8(vaeseq_u8(B, K8));
+      B = vaesmcq_u8(vaeseq_u8(B, K9));
+      B = vaesmcq_u8(vaeseq_u8(B, K10));
+      B = veorq_u8(vaeseq_u8(B, K11), K12);
+      vst1q_u8(out+16*i, B);
       }
    }
 
@@ -159,22 +284,52 @@ void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K11 = vld1q_u8(skey + 176);
    const uint8x16_t K12 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_DEC_4_ROUNDS(K0);
+      AES_DEC_4_ROUNDS(K1);
+      AES_DEC_4_ROUNDS(K2);
+      AES_DEC_4_ROUNDS(K3);
+      AES_DEC_4_ROUNDS(K4);
+      AES_DEC_4_ROUNDS(K5);
+      AES_DEC_4_ROUNDS(K6);
+      AES_DEC_4_ROUNDS(K7);
+      AES_DEC_4_ROUNDS(K8);
+      AES_DEC_4_ROUNDS(K9);
+      AES_DEC_4_ROUNDS(K10);
+      AES_DEC_4_LAST_ROUNDS(K11, K12);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
      {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesimcq_u8(vaesdq_u8(data, K0));
-      data = vaesimcq_u8(vaesdq_u8(data, K1));
-      data = vaesimcq_u8(vaesdq_u8(data, K2));
-      data = vaesimcq_u8(vaesdq_u8(data, K3));
-      data = vaesimcq_u8(vaesdq_u8(data, K4));
-      data = vaesimcq_u8(vaesdq_u8(data, K5));
-      data = vaesimcq_u8(vaesdq_u8(data, K6));
-      data = vaesimcq_u8(vaesdq_u8(data, K7));
-      data = vaesimcq_u8(vaesdq_u8(data, K8));
-      data = vaesimcq_u8(vaesdq_u8(data, K9));
-      data = vaesimcq_u8(vaesdq_u8(data, K10));
-      data = veorq_u8(vaesdq_u8(data, K11), K12);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesimcq_u8(vaesdq_u8(B, K0));
+      B = vaesimcq_u8(vaesdq_u8(B, K1));
+      B = vaesimcq_u8(vaesdq_u8(B, K2));
+      B = vaesimcq_u8(vaesdq_u8(B, K3));
+      B = vaesimcq_u8(vaesdq_u8(B, K4));
+      B = vaesimcq_u8(vaesdq_u8(B, K5));
+      B = vaesimcq_u8(vaesdq_u8(B, K6));
+      B = vaesimcq_u8(vaesdq_u8(B, K7));
+      B = vaesimcq_u8(vaesdq_u8(B, K8));
+      B = vaesimcq_u8(vaesdq_u8(B, K9));
+      B = vaesimcq_u8(vaesdq_u8(B, K10));
+      B = veorq_u8(vaesdq_u8(B, K11), K12);
+      vst1q_u8(out+16*i, B);
       }
    }
 
@@ -205,24 +360,56 @@ void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K13 = vld1q_u8(skey + 208);
    const uint8x16_t K14 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_ENC_4_ROUNDS(K0);
+      AES_ENC_4_ROUNDS(K1);
+      AES_ENC_4_ROUNDS(K2);
+      AES_ENC_4_ROUNDS(K3);
+      AES_ENC_4_ROUNDS(K4);
+      AES_ENC_4_ROUNDS(K5);
+      AES_ENC_4_ROUNDS(K6);
+      AES_ENC_4_ROUNDS(K7);
+      AES_ENC_4_ROUNDS(K8);
+      AES_ENC_4_ROUNDS(K9);
+      AES_ENC_4_ROUNDS(K10);
+      AES_ENC_4_ROUNDS(K11);
+      AES_ENC_4_ROUNDS(K12);
+      AES_ENC_4_LAST_ROUNDS(K13, K14);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
      {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesmcq_u8(vaeseq_u8(data, K0));
-      data = vaesmcq_u8(vaeseq_u8(data, K1));
-      data = vaesmcq_u8(vaeseq_u8(data, K2));
-      data = vaesmcq_u8(vaeseq_u8(data, K3));
-      data = vaesmcq_u8(vaeseq_u8(data, K4));
-      data = vaesmcq_u8(vaeseq_u8(data, K5));
-      data = vaesmcq_u8(vaeseq_u8(data, K6));
-      data = vaesmcq_u8(vaeseq_u8(data, K7));
-      data = vaesmcq_u8(vaeseq_u8(data, K8));
-      data = vaesmcq_u8(vaeseq_u8(data, K9));
-      data = vaesmcq_u8(vaeseq_u8(data, K10));
-      data = vaesmcq_u8(vaeseq_u8(data, K11));
-      data = vaesmcq_u8(vaeseq_u8(data, K12));
-      data = veorq_u8(vaeseq_u8(data, K13), K14);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesmcq_u8(vaeseq_u8(B, K0));
+      B = vaesmcq_u8(vaeseq_u8(B, K1));
+      B = vaesmcq_u8(vaeseq_u8(B, K2));
+      B = vaesmcq_u8(vaeseq_u8(B, K3));
+      B = vaesmcq_u8(vaeseq_u8(B, K4));
+      B = vaesmcq_u8(vaeseq_u8(B, K5));
+      B = vaesmcq_u8(vaeseq_u8(B, K6));
+      B = vaesmcq_u8(vaeseq_u8(B, K7));
+      B = vaesmcq_u8(vaeseq_u8(B, K8));
+      B = vaesmcq_u8(vaeseq_u8(B, K9));
+      B = vaesmcq_u8(vaeseq_u8(B, K10));
+      B = vaesmcq_u8(vaeseq_u8(B, K11));
+      B = vaesmcq_u8(vaeseq_u8(B, K12));
+      B = veorq_u8(vaeseq_u8(B, K13), K14);
+      vst1q_u8(out+16*i, B);
       }
    }
 
@@ -253,26 +440,62 @@ void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks)
    const uint8x16_t K13 = vld1q_u8(skey + 208);
    const uint8x16_t K14 = vld1q_u8(mkey);
 
+   while(blocks >= 4)
+      {
+      uint8x16_t B0 = vld1q_u8(in);
+      uint8x16_t B1 = vld1q_u8(in+16);
+      uint8x16_t B2 = vld1q_u8(in+32);
+      uint8x16_t B3 = vld1q_u8(in+48);
+
+      AES_DEC_4_ROUNDS(K0);
+      AES_DEC_4_ROUNDS(K1);
+      AES_DEC_4_ROUNDS(K2);
+      AES_DEC_4_ROUNDS(K3);
+      AES_DEC_4_ROUNDS(K4);
+      AES_DEC_4_ROUNDS(K5);
+      AES_DEC_4_ROUNDS(K6);
+      AES_DEC_4_ROUNDS(K7);
+      AES_DEC_4_ROUNDS(K8);
+      AES_DEC_4_ROUNDS(K9);
+      AES_DEC_4_ROUNDS(K10);
+      AES_DEC_4_ROUNDS(K11);
+      AES_DEC_4_ROUNDS(K12);
+      AES_DEC_4_LAST_ROUNDS(K13, K14);
+
+      vst1q_u8(out, B0);
+      vst1q_u8(out+16, B1);
+      vst1q_u8(out+32, B2);
+      vst1q_u8(out+48, B3);
+
+      in += 16*4;
+      out += 16*4;
+      blocks -= 4;
+      }
+
    for(size_t i = 0; i != blocks; ++i)
      {
-      uint8x16_t data = vld1q_u8(in+16*i);
-      data = vaesimcq_u8(vaesdq_u8(data, K0));
-      data = vaesimcq_u8(vaesdq_u8(data, K1));
-      data = vaesimcq_u8(vaesdq_u8(data, K2));
-      data = vaesimcq_u8(vaesdq_u8(data, K3));
-      data = vaesimcq_u8(vaesdq_u8(data, K4));
-      data = vaesimcq_u8(vaesdq_u8(data, K5));
-      data = vaesimcq_u8(vaesdq_u8(data, K6));
-      data = vaesimcq_u8(vaesdq_u8(data, K7));
-      data = vaesimcq_u8(vaesdq_u8(data, K8));
-      data = vaesimcq_u8(vaesdq_u8(data, K9));
-      data = vaesimcq_u8(vaesdq_u8(data, K10));
-      data = vaesimcq_u8(vaesdq_u8(data, K11));
-      data = vaesimcq_u8(vaesdq_u8(data, K12));
-      data = veorq_u8(vaesdq_u8(data, K13), K14);
-      vst1q_u8(out+16*i, data);
+      uint8x16_t B = vld1q_u8(in+16*i);
+      B = vaesimcq_u8(vaesdq_u8(B, K0));
+      B = vaesimcq_u8(vaesdq_u8(B, K1));
+      B = vaesimcq_u8(vaesdq_u8(B, K2));
+      B = vaesimcq_u8(vaesdq_u8(B, K3));
+      B = vaesimcq_u8(vaesdq_u8(B, K4));
+      B = vaesimcq_u8(vaesdq_u8(B, K5));
+      B = vaesimcq_u8(vaesdq_u8(B, K6));
+      B = vaesimcq_u8(vaesdq_u8(B, K7));
+      B = vaesimcq_u8(vaesdq_u8(B, K8));
+      B = vaesimcq_u8(vaesdq_u8(B, K9));
+      B = vaesimcq_u8(vaesdq_u8(B, K10));
+      B = vaesimcq_u8(vaesdq_u8(B, K11));
+      B = vaesimcq_u8(vaesdq_u8(B, K12));
+      B = veorq_u8(vaesdq_u8(B, K13), K14);
+      vst1q_u8(out+16*i, B);
       }
    }
 
+#undef AES_ENC_4_ROUNDS
++#undef AES_ENC_4_LAST_ROUNDS
+#undef AES_DEC_4_ROUNDS
+#undef AES_DEC_4_LAST_ROUNDS
+
 }
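
For reference, a caller reaches the new bulk path simply by handing BlockCipher::encrypt_n() several blocks per call; modes such as CTR and GCM already do this internally, which is where the quoted 50% and 10% figures come from. Below is a hedged usage sketch against Botan's public API (Botan 2.x names; the all-0xAB demo key and the 64-block buffer are illustrative only, and the ARMv8 AES provider is selected automatically at runtime when the CPU reports the crypto extensions):

// Usage sketch, not part of the patch.  Botan 2.x public API; link with -lbotan-2.
#include <botan/block_cipher.h>
#include <vector>
#include <cstdint>

int main()
   {
   auto aes = Botan::BlockCipher::create_or_throw("AES-128");

   const std::vector<uint8_t> key(16, 0xAB); // demo key, not for real use
   aes->set_key(key);

   // 64 blocks in a single call: the unrolled while(blocks >= 4) loop runs
   // 16 times and no single-block tail remains.
   std::vector<uint8_t> buf(64 * aes->block_size());
   aes->encrypt_n(buf.data(), buf.data(), buf.size() / aes->block_size());

   return 0;
   }

Callers that pass fewer than four blocks per invocation still fall through to the original one-block loop, so the speedup only shows up for bulk operations.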