aboutsummaryrefslogtreecommitdiffstats
path: root/src/block/aes/aes.cpp
diff options
context:
space:
mode:
authorlloyd <[email protected]>2009-08-11 02:31:17 +0000
committerlloyd <[email protected]>2009-08-11 02:31:17 +0000
commitf51841ba5237952dda3e76df643d3ae13bed3df5 (patch)
tree7fd004a107bae55a5f87c4e8bc35b0012334b29b /src/block/aes/aes.cpp
parent34eb8de4ed014ab8913bdb34b096d60880b1c14a (diff)
Change the BlockCipher interface to support multi-block encryption and
decryption. Currently only used for counter mode. Doesn't offer much advantage as-is (though might help slightly, in terms of cache effects), but allows for SIMD implementations to process multiple blocks in parallel when possible. Particularly thinking here of Serpent; TEA/XTEA also seem promising in this sense, as is Threefish once that is implemented as a standalone block cipher.
Diffstat (limited to 'src/block/aes/aes.cpp')
-rw-r--r--src/block/aes/aes.cpp266
1 files changed, 139 insertions, 127 deletions
diff --git a/src/block/aes/aes.cpp b/src/block/aes/aes.cpp
index 9072b507b..34698ae7f 100644
--- a/src/block/aes/aes.cpp
+++ b/src/block/aes/aes.cpp
@@ -1,6 +1,6 @@
/**
* AES
-* (C) 1999-2007 Jack Lloyd
+* (C) 1999-2009 Jack Lloyd
*
* Distributed under the terms of the Botan license
*/
@@ -13,163 +13,175 @@ namespace Botan {
/**
* AES Encryption
*/
-void AES::enc(const byte in[], byte out[]) const
+void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const
{
const u32bit* TE0 = TE;
const u32bit* TE1 = TE + 256;
const u32bit* TE2 = TE + 512;
const u32bit* TE3 = TE + 768;
- u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0];
- u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1];
- u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2];
- u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3];
-
- u32bit B0, B1, B2, B3;
- B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
- TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4];
- B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^
- TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5];
- B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^
- TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6];
- B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^
- TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7];
-
- for(u32bit j = 2; j != ROUNDS; j += 2)
+ for(u32bit i = 0; i != blocks; ++i)
{
- const u32bit K0 = EK[4*j];
- const u32bit K1 = EK[4*j+1];
- const u32bit K2 = EK[4*j+2];
- const u32bit K3 = EK[4*j+3];
-
- T0 = TE0[get_byte(0, B0)] ^ TE1[get_byte(1, B1)] ^
- TE2[get_byte(2, B2)] ^ TE3[get_byte(3, B3)] ^ K0;
- T1 = TE0[get_byte(0, B1)] ^ TE1[get_byte(1, B2)] ^
- TE2[get_byte(2, B3)] ^ TE3[get_byte(3, B0)] ^ K1;
- T2 = TE0[get_byte(0, B2)] ^ TE1[get_byte(1, B3)] ^
- TE2[get_byte(2, B0)] ^ TE3[get_byte(3, B1)] ^ K2;
- T3 = TE0[get_byte(0, B3)] ^ TE1[get_byte(1, B0)] ^
- TE2[get_byte(2, B1)] ^ TE3[get_byte(3, B2)] ^ K3;
-
- const u32bit K4 = EK[4*(j+1)+0];
- const u32bit K5 = EK[4*(j+1)+1];
- const u32bit K6 = EK[4*(j+1)+2];
- const u32bit K7 = EK[4*(j+1)+3];
+ u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0];
+ u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1];
+ u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2];
+ u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3];
+ u32bit B0, B1, B2, B3;
B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
- TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ K4;
+ TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4];
B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^
- TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ K5;
+ TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5];
B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^
- TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ K6;
+ TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6];
B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^
- TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ K7;
- }
+ TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7];
+
+ for(u32bit j = 2; j != ROUNDS; j += 2)
+ {
+ const u32bit K0 = EK[4*j];
+ const u32bit K1 = EK[4*j+1];
+ const u32bit K2 = EK[4*j+2];
+ const u32bit K3 = EK[4*j+3];
+
+ T0 = TE0[get_byte(0, B0)] ^ TE1[get_byte(1, B1)] ^
+ TE2[get_byte(2, B2)] ^ TE3[get_byte(3, B3)] ^ K0;
+ T1 = TE0[get_byte(0, B1)] ^ TE1[get_byte(1, B2)] ^
+ TE2[get_byte(2, B3)] ^ TE3[get_byte(3, B0)] ^ K1;
+ T2 = TE0[get_byte(0, B2)] ^ TE1[get_byte(1, B3)] ^
+ TE2[get_byte(2, B0)] ^ TE3[get_byte(3, B1)] ^ K2;
+ T3 = TE0[get_byte(0, B3)] ^ TE1[get_byte(1, B0)] ^
+ TE2[get_byte(2, B1)] ^ TE3[get_byte(3, B2)] ^ K3;
+
+ const u32bit K4 = EK[4*(j+1)+0];
+ const u32bit K5 = EK[4*(j+1)+1];
+ const u32bit K6 = EK[4*(j+1)+2];
+ const u32bit K7 = EK[4*(j+1)+3];
- /*
- Joseph Bonneau and Ilya Mironov's paper
- <a href = "http://icme2007.org/users/mironov/papers/aes-timing.pdf">
- Cache-Collision Timing Attacks Against AES</a> describes an attack
- that can recover AES keys with as few as 2<sup>13</sup> samples.
-
- """In addition to OpenSSL v. 0.9.8.(a), which was used in our
- experiments, the AES implementations of Crypto++ 5.2.1 and
- LibTomCrypt 1.09 use the original Rijndael C implementation with
- very few changes and are highly vulnerable. The AES implementations
- in libgcrypt v. 1.2.2 and Botan v. 1.4.2 are also vulnerable, but
- use a smaller byte-wide final table which lessens the effectiveness
- of the attacks."""
- */
- out[ 0] = SE[get_byte(0, B0)] ^ ME[0];
- out[ 1] = SE[get_byte(1, B1)] ^ ME[1];
- out[ 2] = SE[get_byte(2, B2)] ^ ME[2];
- out[ 3] = SE[get_byte(3, B3)] ^ ME[3];
- out[ 4] = SE[get_byte(0, B1)] ^ ME[4];
- out[ 5] = SE[get_byte(1, B2)] ^ ME[5];
- out[ 6] = SE[get_byte(2, B3)] ^ ME[6];
- out[ 7] = SE[get_byte(3, B0)] ^ ME[7];
- out[ 8] = SE[get_byte(0, B2)] ^ ME[8];
- out[ 9] = SE[get_byte(1, B3)] ^ ME[9];
- out[10] = SE[get_byte(2, B0)] ^ ME[10];
- out[11] = SE[get_byte(3, B1)] ^ ME[11];
- out[12] = SE[get_byte(0, B3)] ^ ME[12];
- out[13] = SE[get_byte(1, B0)] ^ ME[13];
- out[14] = SE[get_byte(2, B1)] ^ ME[14];
- out[15] = SE[get_byte(3, B2)] ^ ME[15];
+ B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^
+ TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ K4;
+ B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^
+ TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ K5;
+ B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^
+ TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ K6;
+ B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^
+ TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ K7;
+ }
+
+ /*
+ Joseph Bonneau and Ilya Mironov's paper
+ <a href = "http://icme2007.org/users/mironov/papers/aes-timing.pdf">
+ Cache-Collision Timing Attacks Against AES</a> describes an attack
+ that can recover AES keys with as few as 2<sup>13</sup> samples.
+
+ """In addition to OpenSSL v. 0.9.8.(a), which was used in our
+ experiments, the AES implementations of Crypto++ 5.2.1 and
+ LibTomCrypt 1.09 use the original Rijndael C implementation with
+ very few changes and are highly vulnerable. The AES implementations
+ in libgcrypt v. 1.2.2 and Botan v. 1.4.2 are also vulnerable, but
+ use a smaller byte-wide final table which lessens the effectiveness
+ of the attacks."""
+ */
+ out[ 0] = SE[get_byte(0, B0)] ^ ME[0];
+ out[ 1] = SE[get_byte(1, B1)] ^ ME[1];
+ out[ 2] = SE[get_byte(2, B2)] ^ ME[2];
+ out[ 3] = SE[get_byte(3, B3)] ^ ME[3];
+ out[ 4] = SE[get_byte(0, B1)] ^ ME[4];
+ out[ 5] = SE[get_byte(1, B2)] ^ ME[5];
+ out[ 6] = SE[get_byte(2, B3)] ^ ME[6];
+ out[ 7] = SE[get_byte(3, B0)] ^ ME[7];
+ out[ 8] = SE[get_byte(0, B2)] ^ ME[8];
+ out[ 9] = SE[get_byte(1, B3)] ^ ME[9];
+ out[10] = SE[get_byte(2, B0)] ^ ME[10];
+ out[11] = SE[get_byte(3, B1)] ^ ME[11];
+ out[12] = SE[get_byte(0, B3)] ^ ME[12];
+ out[13] = SE[get_byte(1, B0)] ^ ME[13];
+ out[14] = SE[get_byte(2, B1)] ^ ME[14];
+ out[15] = SE[get_byte(3, B2)] ^ ME[15];
+
+ in += BLOCK_SIZE;
+ out += BLOCK_SIZE;
+ }
}
/**
* AES Decryption
*/
-void AES::dec(const byte in[], byte out[]) const
+void AES::decrypt_n(const byte in[], byte out[], u32bit blocks) const
{
const u32bit* TD0 = TD;
const u32bit* TD1 = TD + 256;
const u32bit* TD2 = TD + 512;
const u32bit* TD3 = TD + 768;
- u32bit T0 = load_be<u32bit>(in, 0) ^ DK[0];
- u32bit T1 = load_be<u32bit>(in, 1) ^ DK[1];
- u32bit T2 = load_be<u32bit>(in, 2) ^ DK[2];
- u32bit T3 = load_be<u32bit>(in, 3) ^ DK[3];
-
- u32bit B0, B1, B2, B3;
- B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^
- TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ DK[4];
- B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^
- TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ DK[5];
- B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^
- TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ DK[6];
- B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^
- TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ DK[7];
-
- for(u32bit j = 2; j != ROUNDS; j += 2)
+ for(u32bit i = 0; i != blocks; ++i)
{
- const u32bit K0 = DK[4*j+0];
- const u32bit K1 = DK[4*j+1];
- const u32bit K2 = DK[4*j+2];
- const u32bit K3 = DK[4*j+3];
-
- T0 = TD0[get_byte(0, B0)] ^ TD1[get_byte(1, B3)] ^
- TD2[get_byte(2, B2)] ^ TD3[get_byte(3, B1)] ^ K0;
- T1 = TD0[get_byte(0, B1)] ^ TD1[get_byte(1, B0)] ^
- TD2[get_byte(2, B3)] ^ TD3[get_byte(3, B2)] ^ K1;
- T2 = TD0[get_byte(0, B2)] ^ TD1[get_byte(1, B1)] ^
- TD2[get_byte(2, B0)] ^ TD3[get_byte(3, B3)] ^ K2;
- T3 = TD0[get_byte(0, B3)] ^ TD1[get_byte(1, B2)] ^
- TD2[get_byte(2, B1)] ^ TD3[get_byte(3, B0)] ^ K3;
-
- const u32bit K4 = DK[4*(j+1)+0];
- const u32bit K5 = DK[4*(j+1)+1];
- const u32bit K6 = DK[4*(j+1)+2];
- const u32bit K7 = DK[4*(j+1)+3];
+ u32bit T0 = load_be<u32bit>(in, 0) ^ DK[0];
+ u32bit T1 = load_be<u32bit>(in, 1) ^ DK[1];
+ u32bit T2 = load_be<u32bit>(in, 2) ^ DK[2];
+ u32bit T3 = load_be<u32bit>(in, 3) ^ DK[3];
+ u32bit B0, B1, B2, B3;
B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^
- TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ K4;
+ TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ DK[4];
B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^
- TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ K5;
+ TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ DK[5];
B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^
- TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ K6;
+ TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ DK[6];
B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^
- TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ K7;
- }
+ TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ DK[7];
+
+ for(u32bit j = 2; j != ROUNDS; j += 2)
+ {
+ const u32bit K0 = DK[4*j+0];
+ const u32bit K1 = DK[4*j+1];
+ const u32bit K2 = DK[4*j+2];
+ const u32bit K3 = DK[4*j+3];
+
+ T0 = TD0[get_byte(0, B0)] ^ TD1[get_byte(1, B3)] ^
+ TD2[get_byte(2, B2)] ^ TD3[get_byte(3, B1)] ^ K0;
+ T1 = TD0[get_byte(0, B1)] ^ TD1[get_byte(1, B0)] ^
+ TD2[get_byte(2, B3)] ^ TD3[get_byte(3, B2)] ^ K1;
+ T2 = TD0[get_byte(0, B2)] ^ TD1[get_byte(1, B1)] ^
+ TD2[get_byte(2, B0)] ^ TD3[get_byte(3, B3)] ^ K2;
+ T3 = TD0[get_byte(0, B3)] ^ TD1[get_byte(1, B2)] ^
+ TD2[get_byte(2, B1)] ^ TD3[get_byte(3, B0)] ^ K3;
+
+ const u32bit K4 = DK[4*(j+1)+0];
+ const u32bit K5 = DK[4*(j+1)+1];
+ const u32bit K6 = DK[4*(j+1)+2];
+ const u32bit K7 = DK[4*(j+1)+3];
- out[ 0] = SD[get_byte(0, B0)] ^ MD[0];
- out[ 1] = SD[get_byte(1, B3)] ^ MD[1];
- out[ 2] = SD[get_byte(2, B2)] ^ MD[2];
- out[ 3] = SD[get_byte(3, B1)] ^ MD[3];
- out[ 4] = SD[get_byte(0, B1)] ^ MD[4];
- out[ 5] = SD[get_byte(1, B0)] ^ MD[5];
- out[ 6] = SD[get_byte(2, B3)] ^ MD[6];
- out[ 7] = SD[get_byte(3, B2)] ^ MD[7];
- out[ 8] = SD[get_byte(0, B2)] ^ MD[8];
- out[ 9] = SD[get_byte(1, B1)] ^ MD[9];
- out[10] = SD[get_byte(2, B0)] ^ MD[10];
- out[11] = SD[get_byte(3, B3)] ^ MD[11];
- out[12] = SD[get_byte(0, B3)] ^ MD[12];
- out[13] = SD[get_byte(1, B2)] ^ MD[13];
- out[14] = SD[get_byte(2, B1)] ^ MD[14];
- out[15] = SD[get_byte(3, B0)] ^ MD[15];
+ B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^
+ TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ K4;
+ B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^
+ TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ K5;
+ B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^
+ TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ K6;
+ B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^
+ TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ K7;
+ }
+
+ out[ 0] = SD[get_byte(0, B0)] ^ MD[0];
+ out[ 1] = SD[get_byte(1, B3)] ^ MD[1];
+ out[ 2] = SD[get_byte(2, B2)] ^ MD[2];
+ out[ 3] = SD[get_byte(3, B1)] ^ MD[3];
+ out[ 4] = SD[get_byte(0, B1)] ^ MD[4];
+ out[ 5] = SD[get_byte(1, B0)] ^ MD[5];
+ out[ 6] = SD[get_byte(2, B3)] ^ MD[6];
+ out[ 7] = SD[get_byte(3, B2)] ^ MD[7];
+ out[ 8] = SD[get_byte(0, B2)] ^ MD[8];
+ out[ 9] = SD[get_byte(1, B1)] ^ MD[9];
+ out[10] = SD[get_byte(2, B0)] ^ MD[10];
+ out[11] = SD[get_byte(3, B3)] ^ MD[11];
+ out[12] = SD[get_byte(0, B3)] ^ MD[12];
+ out[13] = SD[get_byte(1, B2)] ^ MD[13];
+ out[14] = SD[get_byte(2, B1)] ^ MD[14];
+ out[15] = SD[get_byte(3, B0)] ^ MD[15];
+
+ in += BLOCK_SIZE;
+ out += BLOCK_SIZE;
+ }
}
/**