author    lloyd <[email protected]>  2010-08-18 23:11:15 +0000
committer lloyd <[email protected]>  2010-08-18 23:11:15 +0000
commit    34fe34b27e0b6a2799078879b4566707a8622e81
tree      969e70f0d6ffa1c92c9db7a824ed8b3ed623ded7
parent    87c938f6289782cacfcb5170322716363bd31122
In the first round of AES, use a 256-element table and do the
rotations in the code. This reduces the number of cache lines
potentially accessed in the first round from 64 to 16 (assuming
64-byte cache lines). On average about 10 cache lines will actually be
accessed, assuming a uniform distribution of the inputs, so there is
definitely still a timing channel here, just a somewhat smaller one. I
experimented with using the 256-element table for all rounds, but it
reduced performance significantly and I'm not sure the benefit is
worth the cost.
-rw-r--r--  doc/log.txt            1
-rw-r--r--  src/block/aes/aes.cpp  37
2 files changed, 29 insertions, 9 deletions
diff --git a/doc/log.txt b/doc/log.txt
index f9092da21..7f7076a4b 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -1,6 +1,7 @@
 * 1.9.11-dev, ????-??-??
   - Switch default PKCS #8 encryption algorithm from AES-128 to AES-256
+  - Use smaller tables in the first round of AES
 * 1.9.10, 2010-08-12
   - Add a constant time AES implementation using SSSE3
diff --git a/src/block/aes/aes.cpp b/src/block/aes/aes.cpp
index bf9a4198b..1177a1461 100644
--- a/src/block/aes/aes.cpp
+++ b/src/block/aes/aes.cpp
@@ -8,6 +8,7 @@
#include <botan/aes.h>
#include <botan/loadstor.h>
#include <botan/rotate.h>
+#include <botan/internal/prefetch.h>
namespace Botan {
@@ -426,15 +427,33 @@ void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const
       u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2];
       u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3];
-      u32bit B0, B1, B2, B3;
-      B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
-           TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4];
-      B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^
-           TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5];
-      B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^
-           TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6];
-      B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^
-           TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7];
+      /* Use only the first 256 entries of the TE table and do the
+       * rotations directly in the code. This reduces the number of
+       * cache lines potentially used in the first round from 64 to 16
+       * (assuming a typical 64-byte cache line), which makes timing
+       * attacks a little harder; the first round is particularly
+       * vulnerable.
+       */
+
+      u32bit B0 = TE[get_byte(0, T0)] ^
+                  rotate_right(TE[get_byte(1, T1)], 8) ^
+                  rotate_right(TE[get_byte(2, T2)], 16) ^
+                  rotate_right(TE[get_byte(3, T3)], 24) ^ EK[4];
+
+      u32bit B1 = TE[get_byte(0, T1)] ^
+                  rotate_right(TE[get_byte(1, T2)], 8) ^
+                  rotate_right(TE[get_byte(2, T3)], 16) ^
+                  rotate_right(TE[get_byte(3, T0)], 24) ^ EK[5];
+
+      u32bit B2 = TE[get_byte(0, T2)] ^
+                  rotate_right(TE[get_byte(1, T3)], 8) ^
+                  rotate_right(TE[get_byte(2, T0)], 16) ^
+                  rotate_right(TE[get_byte(3, T1)], 24) ^ EK[6];
+
+      u32bit B3 = TE[get_byte(0, T3)] ^
+                  rotate_right(TE[get_byte(1, T0)], 8) ^
+                  rotate_right(TE[get_byte(2, T1)], 16) ^
+                  rotate_right(TE[get_byte(3, T2)], 24) ^ EK[7];
       for(u32bit j = 2; j != ROUNDS; j += 2)
          {
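The rewrite relies on the identity that, in the classic four-table
layout, each of TE1..TE3 is a byte-rotated copy of TE0, so rotating the
result of a TE0 lookup reproduces a lookup into the rotated table. A
minimal standalone sketch of that equivalence (not Botan code: the
table contents are arbitrary stand-ins, and rotr32/get_byte here are
local helpers rather than the library's):

    // Sketch: the single-table first round equals the four-table
    // version, given the invariant TEn[x] == rotr(TE0[x], 8*n).
    // Table values are fillers, not the real AES constants.
    #include <cassert>
    #include <cstdint>

    static uint32_t rotr32(uint32_t v, unsigned n)
       { return (v >> n) | (v << (32 - n)); }

    static uint8_t get_byte(unsigned n, uint32_t v) // byte n, MSB first
       { return static_cast<uint8_t>(v >> (24 - 8 * n)); }

    int main()
       {
       uint32_t TE0[256], TE1[256], TE2[256], TE3[256];
       for(unsigned x = 0; x != 256; ++x)
          {
          TE0[x] = 0x9E3779B9u * (x + 1); // arbitrary filler entries
          TE1[x] = rotr32(TE0[x], 8);     // rotated copies, as in the
          TE2[x] = rotr32(TE0[x], 16);    // classic four-table layout
          TE3[x] = rotr32(TE0[x], 24);
          }

       const uint32_t T0 = 0x01234567, T1 = 0x89ABCDEF,
                      T2 = 0xFEDCBA98, T3 = 0x76543210;

       // Old form: four tables, 64 potential cache lines.
       uint32_t four = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
                       TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)];

       // New form: one table plus in-code rotations, 16 potential lines.
       uint32_t one = TE0[get_byte(0, T0)] ^
                      rotr32(TE0[get_byte(1, T1)], 8) ^
                      rotr32(TE0[get_byte(2, T2)], 16) ^
                      rotr32(TE0[get_byte(3, T3)], 24);

       assert(four == one);
       return 0;
       }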