1 files changed, 55 insertions, 22 deletions
diff --git a/src/block/aes/aes.cpp b/src/block/aes/aes.cpp
index bf9a4198b..8783f13a0 100644
--- a/src/block/aes/aes.cpp
+++ b/src/block/aes/aes.cpp
@@ -426,15 +426,33 @@ void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const
       u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2];
       u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3];
 
-      u32bit B0, B1, B2, B3;
-      B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
-           TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4];
-      B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^
-           TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5];
-      B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^
-           TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6];
-      B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^
-           TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7];
+      /* Use only the first 256 entries of the TE table and do the
+      * rotations directly in the code. This reduces the number of
+      * cache lines potentially used in the first round from 64 to 16
+      * (assuming a typical 64-byte cache line), which makes timing
+      * attacks a little harder; the first round is particularly
+      * vulnerable.
+      */
+
+      u32bit B0 = TE[get_byte(0, T0)] ^
+                  rotate_right(TE[get_byte(1, T1)],  8) ^
+                  rotate_right(TE[get_byte(2, T2)], 16) ^
+                  rotate_right(TE[get_byte(3, T3)], 24) ^ EK[4];
+
+      u32bit B1 = TE[get_byte(0, T1)] ^
+                  rotate_right(TE[get_byte(1, T2)],  8) ^
+                  rotate_right(TE[get_byte(2, T3)], 16) ^
+                  rotate_right(TE[get_byte(3, T0)], 24) ^ EK[5];
+
+      u32bit B2 = TE[get_byte(0, T2)] ^
+                  rotate_right(TE[get_byte(1, T3)],  8) ^
+                  rotate_right(TE[get_byte(2, T0)], 16) ^
+                  rotate_right(TE[get_byte(3, T1)], 24) ^ EK[6];
+
+      u32bit B3 = TE[get_byte(0, T3)] ^
+                  rotate_right(TE[get_byte(1, T0)],  8) ^
+                  rotate_right(TE[get_byte(2, T1)], 16) ^
+                  rotate_right(TE[get_byte(3, T2)], 24) ^ EK[7];
 
       for(u32bit j = 2; j != ROUNDS; j += 2)
          {
@@ -468,10 +486,14 @@ void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const
          }
 
       /*
-      Joseph Bonneau and Ilya Mironov's paper
-      <a href = "http://icme2007.org/users/mironov/papers/aes-timing.pdf">
-      Cache-Collision Timing Attacks Against AES</a> describes an attack
-      that can recover AES keys with as few as 2<sup>13</sup> samples.
+      Joseph Bonneau and Ilya Mironov's paper "Cache-Collision Timing
+      Attacks Against AES" describes an attack that can recover AES
+      keys with as few as 2**13 samples.
+
+      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.88.4753
+
+      They recommend using a byte-wide table, which still allows an attack
+      but increases the samples required from 2**13 to 2**25:
 
       """In addition to OpenSSL v. 0.9.8.(a), which was used in our
       experiments, the AES implementations of Crypto++ 5.2.1 and
@@ -481,6 +503,7 @@ void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const
       use a smaller byte-wide final table which lessens the effectiveness
       of the attacks."""
       */
+
       out[ 0] = SE[get_byte(0, B0)] ^ ME[0];
       out[ 1] = SE[get_byte(1, B1)] ^ ME[1];
       out[ 2] = SE[get_byte(2, B2)] ^ ME[2];
@@ -520,15 +543,25 @@ void AES::decrypt_n(const byte in[], byte out[], u32bit blocks) const
       u32bit T2 = load_be<u32bit>(in, 2) ^ DK[2];
       u32bit T3 = load_be<u32bit>(in, 3) ^ DK[3];
 
-      u32bit B0, B1, B2, B3;
-      B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^
-           TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ DK[4];
-      B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^
-           TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ DK[5];
-      B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^
-           TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ DK[6];
-      B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^
-           TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ DK[7];
+      u32bit B0 = TD[get_byte(0, T0)] ^
+                  rotate_right(TD[get_byte(1, T3)],  8) ^
+                  rotate_right(TD[get_byte(2, T2)], 16) ^
+                  rotate_right(TD[get_byte(3, T1)], 24) ^ DK[4];
+
+      u32bit B1 = TD[get_byte(0, T1)] ^
+                  rotate_right(TD[get_byte(1, T0)],  8) ^
+                  rotate_right(TD[get_byte(2, T3)], 16) ^
+                  rotate_right(TD[get_byte(3, T2)], 24) ^ DK[5];
+
+      u32bit B2 = TD[get_byte(0, T2)] ^
+                  rotate_right(TD[get_byte(1, T1)],  8) ^
+                  rotate_right(TD[get_byte(2, T0)], 16) ^
+                  rotate_right(TD[get_byte(3, T3)], 24) ^ DK[6];
+
+      u32bit B3 = TD[get_byte(0, T3)] ^
+                  rotate_right(TD[get_byte(1, T2)],  8) ^
+                  rotate_right(TD[get_byte(2, T1)], 16) ^
+                  rotate_right(TD[get_byte(3, T0)], 24) ^ DK[7];
 
       for(u32bit j = 2; j != ROUNDS; j += 2)
          {