author     lloyd <[email protected]>   2009-11-10 19:27:34 +0000
committer  lloyd <[email protected]>   2009-11-10 19:27:34 +0000
commit     1a4210926dd857eff1a862806b8c05bda919981e (patch)
tree       424d3c39a46c8783998ca40aa8f021f7a5ea2914
parent     c3216ded9086f442f6378639de7bf5afe8c3228a (diff)
Add AES-192 using AES-NI. Tested OK with Intel's simulator.
Currently this requires SSE4.1 for _mm_extract_epi32 in the key schedule; it
would be nice to remove this dependency, though all currently known or
scheduled chips with AES-NI (Intel Westmere and Sandy Bridge, and AMD
Bulldozer) are supposed to include SSE4.1, so this is not a huge problem.
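The SSE4.1 requirement comes entirely from the six _mm_extract_epi32 calls at
the end of aes_192_key_expansion. One way to drop it (a sketch only, with a
hypothetical helper name, not part of this commit) is to spill both vectors
through a scratch buffer with the SSE2 _mm_storeu_si128 and copy the six key
words out:

   #include <emmintrin.h> // SSE2 only; no smmintrin.h needed

   typedef unsigned int u32bit; // from botan/types.h in Botan itself

   // Hypothetical SSE2-only replacement for the _mm_extract_epi32 block:
   // store both vectors to a scratch buffer, then copy out the six 32-bit
   // words of new round-key material.
   void store_192_key_words(__m128i key1, __m128i key2, u32bit out[6])
      {
      u32bit tmp[8];
      _mm_storeu_si128((__m128i*)(tmp + 0), key1);
      _mm_storeu_si128((__m128i*)(tmp + 4), key2);

      // out[0..3] = all of key1; out[4..5] = low half of key2
      for(u32bit i = 0; i != 6; ++i)
         out[i] = tmp[i];
      }

In a real implementation the scratch buffer should be zeroed afterwards, since
it briefly holds round-key material.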
-rw-r--r--   doc/log.txt                                 |   2
-rw-r--r--   src/block/aes_intel/aes_intel.cpp           | 257
-rw-r--r--   src/block/aes_intel/aes_intel.h             |  26
-rw-r--r--   src/engine/aes_isa_eng/aes_isa_engine.cpp   |   2

4 files changed, 277 insertions, 10 deletions
diff --git a/doc/log.txt b/doc/log.txt
index 86c0f0818..3a8733166 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -1,8 +1,8 @@
 * 1.9.3-dev, ????-??-??
+  - Add new AES implementation using Intel's AES instruction intrinsics
   - Allow use of any hash function in X.509 certificate creation
   - Set macros for available SIMD instructions in build.h
-  - Add AES-128 and AES-256 using Intel AES instruction intrinsics
 
 * 1.9.2, 2009-11-03
   - Add SIMD version of XTEA
diff --git a/src/block/aes_intel/aes_intel.cpp b/src/block/aes_intel/aes_intel.cpp
index 5a3d664f7..e413991f8 100644
--- a/src/block/aes_intel/aes_intel.cpp
+++ b/src/block/aes_intel/aes_intel.cpp
@@ -6,7 +6,9 @@
 */
 
 #include <botan/aes_intel.h>
+#include <botan/loadstor.h>
 #include <wmmintrin.h>
+#include <smmintrin.h>
 
 namespace Botan {
 
@@ -21,6 +23,35 @@ __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
    return _mm_xor_si128(key, key_with_rcon);
    }
 
+void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
+                           u32bit out[])
+   {
+   __m128i key1 = *K1;
+   __m128i key2 = *K2;
+
+   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
+   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+   key1 = _mm_xor_si128(key1, key2_with_rcon);
+
+   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
+   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
+
+   *K1 = key1;
+   *K2 = key2;
+
+   out[0] = _mm_extract_epi32(key1, 0);
+   out[1] = _mm_extract_epi32(key1, 1);
+   out[2] = _mm_extract_epi32(key1, 2);
+   out[3] = _mm_extract_epi32(key1, 3);
+   out[4] = _mm_extract_epi32(key2, 0);
+   out[5] = _mm_extract_epi32(key2, 1);
+   }
+
+/*
+* The second half of the AES-256 key expansion (other half same as AES-128)
+*/
 __m128i aes_256_key_expansion(__m128i key, __m128i key2)
    {
    __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
@@ -227,12 +258,10 @@ void AES_128_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
 */
 void AES_128_Intel::key_schedule(const byte key[], u32bit)
    {
-   const __m128i* key_mm = (const __m128i*)key;
-
    #define AES_128_key_exp(K, RCON) \
       aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
 
-   __m128i K0 = _mm_loadu_si128(key_mm);
+   __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
    __m128i K1 = AES_128_key_exp(K0, 0x01);
    __m128i K2 = AES_128_key_exp(K1, 0x02);
    __m128i K3 = AES_128_key_exp(K2, 0x04);
@@ -283,6 +312,222 @@ void AES_128_Intel::clear()
    }
 
 /**
+* AES-192 Encryption
+*/
+void AES_192_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+   {
+   const __m128i* in_mm = (const __m128i*)in;
+   __m128i* out_mm = (__m128i*)out;
+
+   const __m128i* key_mm = (const __m128i*)&EK[0];
+
+   __m128i K0 = _mm_loadu_si128(key_mm);
+   __m128i K1 = _mm_loadu_si128(key_mm + 1);
+   __m128i K2 = _mm_loadu_si128(key_mm + 2);
+   __m128i K3 = _mm_loadu_si128(key_mm + 3);
+   __m128i K4 = _mm_loadu_si128(key_mm + 4);
+   __m128i K5 = _mm_loadu_si128(key_mm + 5);
+   __m128i K6 = _mm_loadu_si128(key_mm + 6);
+   __m128i K7 = _mm_loadu_si128(key_mm + 7);
+   __m128i K8 = _mm_loadu_si128(key_mm + 8);
+   __m128i K9 = _mm_loadu_si128(key_mm + 9);
+   __m128i K10 = _mm_loadu_si128(key_mm + 10);
+   __m128i K11 = _mm_loadu_si128(key_mm + 11);
+   __m128i K12 = _mm_loadu_si128(key_mm + 12);
+
+   while(blocks >= 4)
+      {
+      __m128i B0 = _mm_loadu_si128(in_mm + 0);
+      __m128i B1 = _mm_loadu_si128(in_mm + 1);
+      __m128i B2 = _mm_loadu_si128(in_mm + 2);
+      __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+      B0 = _mm_xor_si128(B0, K0);
+      B1 = _mm_xor_si128(B1, K0);
+      B2 = _mm_xor_si128(B2, K0);
+      B3 = _mm_xor_si128(B3, K0);
+
+      AES_ENC_4_ROUNDS(K1);
+      AES_ENC_4_ROUNDS(K2);
+      AES_ENC_4_ROUNDS(K3);
+      AES_ENC_4_ROUNDS(K4);
+      AES_ENC_4_ROUNDS(K5);
+      AES_ENC_4_ROUNDS(K6);
+      AES_ENC_4_ROUNDS(K7);
+      AES_ENC_4_ROUNDS(K8);
+      AES_ENC_4_ROUNDS(K9);
+      AES_ENC_4_ROUNDS(K10);
+      AES_ENC_4_ROUNDS(K11);
+      AES_ENC_4_LAST_ROUNDS(K12);
+
+      _mm_storeu_si128(out_mm + 0, B0);
+      _mm_storeu_si128(out_mm + 1, B1);
+      _mm_storeu_si128(out_mm + 2, B2);
+      _mm_storeu_si128(out_mm + 3, B3);
+
+      blocks -= 4;
+      in_mm += 4;
+      out_mm += 4;
+      }
+
+   for(u32bit i = 0; i != blocks; ++i)
+      {
+      __m128i B = _mm_loadu_si128(in_mm + i);
+
+      B = _mm_xor_si128(B, K0);
+
+      B = _mm_aesenc_si128(B, K1);
+      B = _mm_aesenc_si128(B, K2);
+      B = _mm_aesenc_si128(B, K3);
+      B = _mm_aesenc_si128(B, K4);
+      B = _mm_aesenc_si128(B, K5);
+      B = _mm_aesenc_si128(B, K6);
+      B = _mm_aesenc_si128(B, K7);
+      B = _mm_aesenc_si128(B, K8);
+      B = _mm_aesenc_si128(B, K9);
+      B = _mm_aesenc_si128(B, K10);
+      B = _mm_aesenc_si128(B, K11);
+      B = _mm_aesenclast_si128(B, K12);
+
+      _mm_storeu_si128(out_mm + i, B);
+      }
+   }
+
+/**
+* AES-192 Decryption
+*/
+void AES_192_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
+   {
+   const __m128i* in_mm = (const __m128i*)in;
+   __m128i* out_mm = (__m128i*)out;
+
+   const __m128i* key_mm = (const __m128i*)&DK[0];
+
+   __m128i K0 = _mm_loadu_si128(key_mm);
+   __m128i K1 = _mm_loadu_si128(key_mm + 1);
+   __m128i K2 = _mm_loadu_si128(key_mm + 2);
+   __m128i K3 = _mm_loadu_si128(key_mm + 3);
+   __m128i K4 = _mm_loadu_si128(key_mm + 4);
+   __m128i K5 = _mm_loadu_si128(key_mm + 5);
+   __m128i K6 = _mm_loadu_si128(key_mm + 6);
+   __m128i K7 = _mm_loadu_si128(key_mm + 7);
+   __m128i K8 = _mm_loadu_si128(key_mm + 8);
+   __m128i K9 = _mm_loadu_si128(key_mm + 9);
+   __m128i K10 = _mm_loadu_si128(key_mm + 10);
+   __m128i K11 = _mm_loadu_si128(key_mm + 11);
+   __m128i K12 = _mm_loadu_si128(key_mm + 12);
+
+   while(blocks >= 4)
+      {
+      __m128i B0 = _mm_loadu_si128(in_mm + 0);
+      __m128i B1 = _mm_loadu_si128(in_mm + 1);
+      __m128i B2 = _mm_loadu_si128(in_mm + 2);
+      __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+      B0 = _mm_xor_si128(B0, K0);
+      B1 = _mm_xor_si128(B1, K0);
+      B2 = _mm_xor_si128(B2, K0);
+      B3 = _mm_xor_si128(B3, K0);
+
+      AES_DEC_4_ROUNDS(K1);
+      AES_DEC_4_ROUNDS(K2);
+      AES_DEC_4_ROUNDS(K3);
+      AES_DEC_4_ROUNDS(K4);
+      AES_DEC_4_ROUNDS(K5);
+      AES_DEC_4_ROUNDS(K6);
+      AES_DEC_4_ROUNDS(K7);
+      AES_DEC_4_ROUNDS(K8);
+      AES_DEC_4_ROUNDS(K9);
+      AES_DEC_4_ROUNDS(K10);
+      AES_DEC_4_ROUNDS(K11);
+      AES_DEC_4_LAST_ROUNDS(K12);
+
+      _mm_storeu_si128(out_mm + 0, B0);
+      _mm_storeu_si128(out_mm + 1, B1);
+      _mm_storeu_si128(out_mm + 2, B2);
+      _mm_storeu_si128(out_mm + 3, B3);
+
+      blocks -= 4;
+      in_mm += 4;
+      out_mm += 4;
+      }
+
+   for(u32bit i = 0; i != blocks; ++i)
+      {
+      __m128i B = _mm_loadu_si128(in_mm + i);
+
+      B = _mm_xor_si128(B, K0);
+
+      B = _mm_aesdec_si128(B, K1);
+      B = _mm_aesdec_si128(B, K2);
+      B = _mm_aesdec_si128(B, K3);
+      B = _mm_aesdec_si128(B, K4);
+      B = _mm_aesdec_si128(B, K5);
+      B = _mm_aesdec_si128(B, K6);
+      B = _mm_aesdec_si128(B, K7);
+      B = _mm_aesdec_si128(B, K8);
+      B = _mm_aesdec_si128(B, K9);
+      B = _mm_aesdec_si128(B, K10);
+      B = _mm_aesdec_si128(B, K11);
+      B = _mm_aesdeclast_si128(B, K12);
+
+      _mm_storeu_si128(out_mm + i, B);
+      }
+   }
+
+/**
+* AES-192 Key Schedule
+*/
+void AES_192_Intel::key_schedule(const byte key[], u32bit)
+   {
+   __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
+   __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 8));
+   K1 = _mm_srli_si128(K1, 8);
+
+   EK[0] = load_le<u32bit>(key, 0);
+   EK[1] = load_le<u32bit>(key, 1);
+   EK[2] = load_le<u32bit>(key, 2);
+   EK[3] = load_le<u32bit>(key, 3);
+   EK[4] = load_le<u32bit>(key, 4);
+   EK[5] = load_le<u32bit>(key, 5);
+
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), EK + 6);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), EK + 12);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), EK + 18);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), EK + 24);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), EK + 30);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), EK + 36);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), EK + 42);
+   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), EK + 48);
+
+   // Now generate decryption keys
+   const __m128i* EK_mm = (const __m128i*)&EK[0];
+   __m128i* DK_mm = (__m128i*)&DK[0];
+   _mm_storeu_si128(DK_mm     , EK_mm[12]);
+   _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(EK_mm[11]));
+   _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(EK_mm[10]));
+   _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(EK_mm[9]));
+   _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(EK_mm[8]));
+   _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(EK_mm[7]));
+   _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(EK_mm[6]));
+   _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(EK_mm[5]));
+   _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(EK_mm[4]));
+   _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(EK_mm[3]));
+   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(EK_mm[2]));
+   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(EK_mm[1]));
+   _mm_storeu_si128(DK_mm + 12, EK_mm[0]);
+   }
+
+/**
+* Clear memory of sensitive data
+*/
+void AES_192_Intel::clear()
+   {
+   EK.clear();
+   DK.clear();
+   }
+
+/**
 * AES-256 Encryption
 */
 void AES_256_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
@@ -463,10 +708,8 @@ void AES_256_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
 */
 void AES_256_Intel::key_schedule(const byte key[], u32bit)
    {
-   const __m128i* key_mm = (const __m128i*)key;
-
-   __m128i K0 = _mm_loadu_si128(key_mm);
-   __m128i K1 = _mm_loadu_si128(key_mm + 1);
+   __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
+   __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 16));
 
    __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
    __m128i K3 = aes_256_key_expansion(K1, K2);
diff --git a/src/block/aes_intel/aes_intel.h b/src/block/aes_intel/aes_intel.h
index 7afd7aaec..373e95b9e 100644
--- a/src/block/aes_intel/aes_intel.h
+++ b/src/block/aes_intel/aes_intel.h
@@ -12,6 +12,9 @@
 
 namespace Botan {
 
+/**
+* AES-128 using AES-NI
+*/
 class BOTAN_DLL AES_128_Intel : public BlockCipher
    {
    public:
@@ -29,6 +32,29 @@ class BOTAN_DLL AES_128_Intel : public BlockCipher
       SecureBuffer<u32bit, 44> EK, DK;
    };
 
+/**
+* AES-192 using AES-NI
+*/
+class BOTAN_DLL AES_192_Intel : public BlockCipher
+   {
+   public:
+      void encrypt_n(const byte in[], byte out[], u32bit blocks) const;
+      void decrypt_n(const byte in[], byte out[], u32bit blocks) const;
+
+      void clear();
+      std::string name() const { return "AES-192"; }
+      BlockCipher* clone() const { return new AES_192_Intel; }
+
+      AES_192_Intel() : BlockCipher(16, 24) { }
+   private:
+      void key_schedule(const byte[], u32bit);
+
+      SecureBuffer<u32bit, 56> EK, DK;
+   };
+
+/**
+* AES-256 using AES-NI
+*/
 class BOTAN_DLL AES_256_Intel : public BlockCipher
    {
    public:
diff --git a/src/engine/aes_isa_eng/aes_isa_engine.cpp b/src/engine/aes_isa_eng/aes_isa_engine.cpp
index bbbdd288e..e57e4278d 100644
--- a/src/engine/aes_isa_eng/aes_isa_engine.cpp
+++ b/src/engine/aes_isa_eng/aes_isa_engine.cpp
@@ -27,10 +27,8 @@ AES_ISA_Engine::find_block_cipher(const SCAN_Name& request,
    {
    if(request.algo_name() == "AES-128")
       return new AES_128_Intel;
-   /*
    if(request.algo_name() == "AES-192")
       return new AES_192_Intel;
-   */
    if(request.algo_name() == "AES-256")
       return new AES_256_Intel;
    }
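The AES_ENC_4_ROUNDS / AES_DEC_4_ROUNDS macros used in the 4-block loops are
defined earlier in aes_intel.cpp and are not part of this diff. Presumably
they expand to one round applied to each of the four in-flight blocks,
something like the sketch below; keeping four blocks in flight lets the
independent aesenc/aesdec instructions overlap in the pipeline:

   // Sketch (assumed, not shown in this diff): one AES round applied to
   // each of the four blocks B0..B3 currently in flight.
   #define AES_ENC_4_ROUNDS(K)           \
      do                                 \
         {                               \
         B0 = _mm_aesenc_si128(B0, K);   \
         B1 = _mm_aesenc_si128(B1, K);   \
         B2 = _mm_aesenc_si128(B2, K);   \
         B3 = _mm_aesenc_si128(B3, K);   \
         } while(0)

   #define AES_ENC_4_LAST_ROUNDS(K)          \
      do                                     \
         {                                   \
         B0 = _mm_aesenclast_si128(B0, K);   \
         B1 = _mm_aesenclast_si128(B1, K);   \
         B2 = _mm_aesenclast_si128(B2, K);   \
         B3 = _mm_aesenclast_si128(B3, K);   \
         } while(0)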
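On the sizing of the AES-192 schedule: each aes_192_key_expansion call emits
6 words, and the word counts work out as

   words needed  = 4 * (rounds + 1) = 4 * 13 = 52
   words written = 6 (raw key) + 8 expansions * 6 = 54

so the final call (writing at EK + 48) produces two words past the 52
actually used. EK and DK are declared as 56 words, presumably to round up to
a whole number of 128-bit blocks and keep that two-word overrun in bounds.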
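Beyond Intel's simulator, a cheap sanity check on real hardware is a
known-answer test against the FIPS-197 Appendix C.2 example vector. A sketch,
assuming the 1.9-era set_key/encrypt interface (the vector is from the
standard; the harness itself is illustrative):

   #include <botan/aes_intel.h>
   #include <cstdio>
   #include <cstring>

   int main()
      {
      using namespace Botan;

      // Key, plaintext and expected ciphertext from FIPS-197 Appendix C.2
      const byte key[24] = {
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };

      const byte plaintext[16] = {
         0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
         0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF };

      const byte expected[16] = {
         0xDD, 0xA9, 0x7C, 0xA4, 0x86, 0x4C, 0xDF, 0xE0,
         0x6E, 0xAF, 0x70, 0xA0, 0xEC, 0x0D, 0x71, 0x91 };

      AES_192_Intel aes;
      aes.set_key(key, sizeof(key));

      byte ciphertext[16];
      aes.encrypt(plaintext, ciphertext);

      std::printf("AES-192 KAT: %s\n",
                  std::memcmp(ciphertext, expected, 16) == 0 ? "ok" : "FAILED");
      return 0;
      }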