diff options
author | lloyd <[email protected]> | 2009-11-06 18:57:58 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2009-11-06 18:57:58 +0000 |
commit | 07412401c927e01da3504f0c2b7e94d4ac13ee33 (patch) | |
tree | 8a0a221d4b962db645d4a34ef9a97ccfd1cfcf3d /src/block/aes_intel/aes_intel.cpp | |
parent | 89da502ff80a9c63038b8b02a5062e460dff4649 (diff) |
Add a complete but untested AES-128 using the AES-NI intrinsics.
Judging in particular from how key generation works, it seems easiest to provide
only AES-128, AES-192, and AES-256, rather than a general AES class that can
accept any key length. This also has the bonus of allowing full loop unrolling,
which may be a win (how much so will depend on the latency/throughput of
the AES instructions, which is currently unknown).
There is no block interleaving yet, though of course it would work very nicely
here; this is simply due to the desire to keep things simple until what is
currently here can actually be tested. (Intel has an emulator that is supposed
to work, but it just crashes on my machine...)
I'm not entirely sure if byte swapping is required. Intel has a white paper
out that suggests it isn't (and really it would have been stupid of them
not to build this into the AES instructions), but who knows. If it turns
out to be necessary, there is a pretty fast bswap instruction for SSE anyway.
Diffstat (limited to 'src/block/aes_intel/aes_intel.cpp')
-rw-r--r-- | src/block/aes_intel/aes_intel.cpp | 147 |
1 files changed, 134 insertions, 13 deletions
diff --git a/src/block/aes_intel/aes_intel.cpp b/src/block/aes_intel/aes_intel.cpp index bd814e6c8..8a8d0331a 100644 --- a/src/block/aes_intel/aes_intel.cpp +++ b/src/block/aes_intel/aes_intel.cpp @@ -6,16 +6,70 @@ */ #include <botan/aes_intel.h> +#include <wmmintrin.h> namespace Botan { +namespace { + +__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon) + { + key_with_rcon = _mm_shuffle_epi32(key_with_rcon, 0xff); + + __m128i T = _mm_slli_si128 (key, 0x4); + key = _mm_xor_si128 (key, T); + T = _mm_slli_si128 (T, 0x4); + key = _mm_xor_si128 (key, T); + T = _mm_slli_si128 (T, 0x4); + + key = _mm_xor_si128 (key, T); + key = _mm_xor_si128 (key, key_with_rcon); + return key; + } + +} + /** * AES Encryption */ -void AES_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const +void AES_128_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const { + const __m128i* in_mm = (const __m128i*)in; + __m128i* out_mm = (__m128i*)out; + + const __m128i* key_mm = (const __m128i*)&EK[0]; + + __m128i K0 = _mm_loadu_si128(key_mm); + __m128i K1 = _mm_loadu_si128(key_mm + 1); + __m128i K2 = _mm_loadu_si128(key_mm + 2); + __m128i K3 = _mm_loadu_si128(key_mm + 3); + __m128i K4 = _mm_loadu_si128(key_mm + 4); + __m128i K5 = _mm_loadu_si128(key_mm + 5); + __m128i K6 = _mm_loadu_si128(key_mm + 6); + __m128i K7 = _mm_loadu_si128(key_mm + 7); + __m128i K8 = _mm_loadu_si128(key_mm + 8); + __m128i K9 = _mm_loadu_si128(key_mm + 9); + __m128i K10 = _mm_loadu_si128(key_mm + 10); + for(u32bit i = 0; i != blocks; ++i) { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesenc_si128(B, K1); + B = _mm_aesenc_si128(B, K2); + B = _mm_aesenc_si128(B, K3); + B = _mm_aesenc_si128(B, K4); + B = _mm_aesenc_si128(B, K5); + B = _mm_aesenc_si128(B, K6); + B = _mm_aesenc_si128(B, K7); + B = _mm_aesenc_si128(B, K8); + B = _mm_aesenc_si128(B, K9); + B = _mm_aesenclast_si128(B, K10); + + _mm_storeu_si128(out_mm + i, B); + in += BLOCK_SIZE; out += 
BLOCK_SIZE; } @@ -24,11 +78,43 @@ void AES_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const /** * AES Decryption */ -void AES_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const +void AES_128_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const { + const __m128i* in_mm = (const __m128i*)in; + __m128i* out_mm = (__m128i*)out; + + const __m128i* key_mm = (const __m128i*)&DK[0]; + + __m128i K0 = _mm_loadu_si128(key_mm); + __m128i K1 = _mm_loadu_si128(key_mm + 1); + __m128i K2 = _mm_loadu_si128(key_mm + 2); + __m128i K3 = _mm_loadu_si128(key_mm + 3); + __m128i K4 = _mm_loadu_si128(key_mm + 4); + __m128i K5 = _mm_loadu_si128(key_mm + 5); + __m128i K6 = _mm_loadu_si128(key_mm + 6); + __m128i K7 = _mm_loadu_si128(key_mm + 7); + __m128i K8 = _mm_loadu_si128(key_mm + 8); + __m128i K9 = _mm_loadu_si128(key_mm + 9); + __m128i K10 = _mm_loadu_si128(key_mm + 10); for(u32bit i = 0; i != blocks; ++i) { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesdec_si128(B, K1); + B = _mm_aesdec_si128(B, K2); + B = _mm_aesdec_si128(B, K3); + B = _mm_aesdec_si128(B, K4); + B = _mm_aesdec_si128(B, K5); + B = _mm_aesdec_si128(B, K6); + B = _mm_aesdec_si128(B, K7); + B = _mm_aesdec_si128(B, K8); + B = _mm_aesdec_si128(B, K9); + B = _mm_aesdeclast_si128(B, K10); + + _mm_storeu_si128(out_mm + i, B); in += BLOCK_SIZE; out += BLOCK_SIZE; @@ -38,25 +124,60 @@ void AES_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const /** * AES Key Schedule */ -void AES_Intel::key_schedule(const byte key[], u32bit length) +void AES_128_Intel::key_schedule(const byte key[], u32bit length) { - } -/** -* AES Constructor -*/ -AES_Intel::AES_Intel(u32bit key_size) : BlockCipher(16, key_size) - { - if(key_size != 16 && key_size != 24 && key_size != 32) - throw Invalid_Key_Length(name(), key_size); - ROUNDS = (key_size / 4) + 6; +#define AES_128_key_exp_with_rcon(K, RCON) \ + aes_128_key_expansion(K, 
_mm_aeskeygenassist_si128(K, RCON)); + + __m128i K0 = _mm_loadu_si128((const __m128i*)key); + __m128i K1 = AES_128_key_exp_with_rcon(K0, 0x01); + __m128i K2 = AES_128_key_exp_with_rcon(K1, 0x02); + __m128i K3 = AES_128_key_exp_with_rcon(K2, 0x04); + __m128i K4 = AES_128_key_exp_with_rcon(K3, 0x08); + __m128i K5 = AES_128_key_exp_with_rcon(K4, 0x10); + __m128i K6 = AES_128_key_exp_with_rcon(K5, 0x20); + __m128i K7 = AES_128_key_exp_with_rcon(K6, 0x40); + __m128i K8 = AES_128_key_exp_with_rcon(K7, 0x80); + __m128i K9 = AES_128_key_exp_with_rcon(K8, 0x1B); + __m128i K10 = AES_128_key_exp_with_rcon(K9, 0x36); + + __m128i* EK_mm = (__m128i*)&EK[0]; + _mm_storeu_si128(EK_mm , K0); + _mm_storeu_si128(EK_mm + 1, K1); + _mm_storeu_si128(EK_mm + 2, K2); + _mm_storeu_si128(EK_mm + 3, K3); + _mm_storeu_si128(EK_mm + 4, K4); + _mm_storeu_si128(EK_mm + 5, K5); + _mm_storeu_si128(EK_mm + 6, K6); + _mm_storeu_si128(EK_mm + 7, K7); + _mm_storeu_si128(EK_mm + 8, K8); + _mm_storeu_si128(EK_mm + 9, K9); + _mm_storeu_si128(EK_mm + 10, K10); + + // Now generate decryption keys + + __m128i* DK_mm = (__m128i*)&DK[0]; + _mm_storeu_si128(DK_mm , K10); + _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9)); + _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8)); + _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7)); + _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6)); + _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5)); + _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4)); + _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3)); + _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2)); + _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1)); + _mm_storeu_si128(DK_mm + 10, K0); } /** * Clear memory of sensitive data */ -void AES_Intel::clear() +void AES_128_Intel::clear() { + EK.clear(); + DK.clear(); } } |