aboutsummaryrefslogtreecommitdiffstats
path: root/src/block/aes_intel/aes_intel.cpp
diff options
context:
space:
mode:
author: lloyd <[email protected]> 2009-11-06 18:57:58 +0000
committer: lloyd <[email protected]> 2009-11-06 18:57:58 +0000
commit: 07412401c927e01da3504f0c2b7e94d4ac13ee33 (patch)
tree: 8a0a221d4b962db645d4a34ef9a97ccfd1cfcf3d /src/block/aes_intel/aes_intel.cpp
parent: 89da502ff80a9c63038b8b02a5062e460dff4649 (diff)
Add a complete but untested AES-128 using the AES-NI intrinsics.
Judging in particular from how key generation works, it seems easiest to provide only AES-128, AES-192, and AES-256, and not a general AES class that can accept any key length. This also has the bonus of allowing full loop unrolling, which may be a win (how much will depend on the latency/throughput of the AES instructions, which is currently unknown). Block interleaving is not implemented, even though it would work very nicely here, simply to keep things simple until what is currently here can actually be tested. (Intel has an emulator that is supposed to work, but it just crashes on my machine...) I'm not entirely sure whether byte swapping is required. Intel has published a white paper that suggests it isn't (and really it would have been stupid of them not to build this into the AES instructions), but who knows. If it turns out to be necessary, there is a pretty fast bswap instruction for SSE anyway.
Diffstat (limited to 'src/block/aes_intel/aes_intel.cpp')
-rw-r--r--src/block/aes_intel/aes_intel.cpp147
1 files changed, 134 insertions, 13 deletions
diff --git a/src/block/aes_intel/aes_intel.cpp b/src/block/aes_intel/aes_intel.cpp
index bd814e6c8..8a8d0331a 100644
--- a/src/block/aes_intel/aes_intel.cpp
+++ b/src/block/aes_intel/aes_intel.cpp
@@ -6,16 +6,70 @@
*/
#include <botan/aes_intel.h>
+#include <wmmintrin.h>
namespace Botan {
+namespace {
+// Derives the next AES-128 round key from the previous round key (key) and the AESKEYGENASSIST result computed from it (key_with_rcon, see the macro in key_schedule).
+__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
+ {
+ key_with_rcon = _mm_shuffle_epi32(key_with_rcon, 0xff); // 0xff picks 32-bit lane 3 for every output lane, i.e. broadcasts the top word
+// Three shift-left-by-4-bytes/XOR steps: afterwards each 32-bit lane of key holds the XOR of that lane and all lower lanes of the input key.
+ __m128i T = _mm_slli_si128 (key, 0x4);
+ key = _mm_xor_si128 (key, T);
+ T = _mm_slli_si128 (T, 0x4);
+ key = _mm_xor_si128 (key, T);
+ T = _mm_slli_si128 (T, 0x4);
+// Fold in the last shifted term, then mix the broadcast key-assist word into all four lanes.
+ key = _mm_xor_si128 (key, T);
+ key = _mm_xor_si128 (key, key_with_rcon);
+ return key;
+ }
+
+}
+
/**
* AES Encryption
*/
-void AES_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+void AES_128_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
{
+ const __m128i* in_mm = (const __m128i*)in;
+ __m128i* out_mm = (__m128i*)out;
+
+ const __m128i* key_mm = (const __m128i*)&EK[0];
+
+ __m128i K0 = _mm_loadu_si128(key_mm);
+ __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ __m128i K10 = _mm_loadu_si128(key_mm + 10);
+
for(u32bit i = 0; i != blocks; ++i)
{
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesenc_si128(B, K1);
+ B = _mm_aesenc_si128(B, K2);
+ B = _mm_aesenc_si128(B, K3);
+ B = _mm_aesenc_si128(B, K4);
+ B = _mm_aesenc_si128(B, K5);
+ B = _mm_aesenc_si128(B, K6);
+ B = _mm_aesenc_si128(B, K7);
+ B = _mm_aesenc_si128(B, K8);
+ B = _mm_aesenc_si128(B, K9);
+ B = _mm_aesenclast_si128(B, K10);
+
+ _mm_storeu_si128(out_mm + i, B);
+
in += BLOCK_SIZE;
out += BLOCK_SIZE;
}
@@ -24,11 +78,43 @@ void AES_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
/**
* AES Decryption
*/
-void AES_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
+void AES_128_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
{
+ const __m128i* in_mm = (const __m128i*)in;
+ __m128i* out_mm = (__m128i*)out;
+
+ const __m128i* key_mm = (const __m128i*)&DK[0];
+
+ __m128i K0 = _mm_loadu_si128(key_mm);
+ __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ __m128i K10 = _mm_loadu_si128(key_mm + 10);
for(u32bit i = 0; i != blocks; ++i)
{
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesdec_si128(B, K1);
+ B = _mm_aesdec_si128(B, K2);
+ B = _mm_aesdec_si128(B, K3);
+ B = _mm_aesdec_si128(B, K4);
+ B = _mm_aesdec_si128(B, K5);
+ B = _mm_aesdec_si128(B, K6);
+ B = _mm_aesdec_si128(B, K7);
+ B = _mm_aesdec_si128(B, K8);
+ B = _mm_aesdec_si128(B, K9);
+ B = _mm_aesdeclast_si128(B, K10);
+
+ _mm_storeu_si128(out_mm + i, B);
in += BLOCK_SIZE;
out += BLOCK_SIZE;
@@ -38,25 +124,60 @@ void AES_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
/**
* AES-128 key schedule: expands the 16 key bytes into 11 encryption round keys (stored in EK) and the matching decryption round keys (stored in DK)
*/
-void AES_Intel::key_schedule(const byte key[], u32bit length)
+void AES_128_Intel::key_schedule(const byte key[], u32bit length)
{
- }
-/**
-* AES Constructor
-*/
-AES_Intel::AES_Intel(u32bit key_size) : BlockCipher(16, key_size)
- {
- if(key_size != 16 && key_size != 24 && key_size != 32)
- throw Invalid_Key_Length(name(), key_size);
- ROUNDS = (key_size / 4) + 6;
+#define AES_128_key_exp_with_rcon(K, RCON) \
+ aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON));
+// NOTE(review): the macro body ends in ';', so every use below expands to ';;' (harmless here), and the macro is never #undef'd in the visible code. The length parameter is not read; exactly 16 bytes are loaded from key.
+ __m128i K0 = _mm_loadu_si128((const __m128i*)key); // unaligned 128-bit load; the raw key is round key 0
+ __m128i K1 = AES_128_key_exp_with_rcon(K0, 0x01); // each round key is derived from the previous one; the second argument is the round constant passed to AESKEYGENASSIST
+ __m128i K2 = AES_128_key_exp_with_rcon(K1, 0x02);
+ __m128i K3 = AES_128_key_exp_with_rcon(K2, 0x04);
+ __m128i K4 = AES_128_key_exp_with_rcon(K3, 0x08);
+ __m128i K5 = AES_128_key_exp_with_rcon(K4, 0x10);
+ __m128i K6 = AES_128_key_exp_with_rcon(K5, 0x20);
+ __m128i K7 = AES_128_key_exp_with_rcon(K6, 0x40);
+ __m128i K8 = AES_128_key_exp_with_rcon(K7, 0x80);
+ __m128i K9 = AES_128_key_exp_with_rcon(K8, 0x1B);
+ __m128i K10 = AES_128_key_exp_with_rcon(K9, 0x36);
+// Encryption schedule: K0..K10 are written to EK in order, 16 bytes apiece, using unaligned stores.
+ __m128i* EK_mm = (__m128i*)&EK[0];
+ _mm_storeu_si128(EK_mm , K0);
+ _mm_storeu_si128(EK_mm + 1, K1);
+ _mm_storeu_si128(EK_mm + 2, K2);
+ _mm_storeu_si128(EK_mm + 3, K3);
+ _mm_storeu_si128(EK_mm + 4, K4);
+ _mm_storeu_si128(EK_mm + 5, K5);
+ _mm_storeu_si128(EK_mm + 6, K6);
+ _mm_storeu_si128(EK_mm + 7, K7);
+ _mm_storeu_si128(EK_mm + 8, K8);
+ _mm_storeu_si128(EK_mm + 9, K9);
+ _mm_storeu_si128(EK_mm + 10, K10);
+
+ // Now generate decryption keys: the encryption keys in reverse order, with AESIMC (InvMixColumns per Intel's documentation) applied to every key except the first and last
+
+ __m128i* DK_mm = (__m128i*)&DK[0];
+ _mm_storeu_si128(DK_mm , K10); // stored unmodified; decrypt_n XORs this key into the block before the aesdec rounds
+ _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
+ _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
+ _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
+ _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
+ _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
+ _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
+ _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
+ _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
+ _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
+ _mm_storeu_si128(DK_mm + 10, K0); // stored unmodified; decrypt_n passes this key to aesdeclast
}
/**
* Clear memory of sensitive data: calls clear() on both round-key buffers (EK and DK)
*/
-void AES_Intel::clear()
+void AES_128_Intel::clear()
{
+ EK.clear(); // encryption round keys written by key_schedule
+ DK.clear(); // decryption round keys written by key_schedule
}
}