diff options
author | lloyd <[email protected]> | 2009-08-11 02:31:17 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2009-08-11 02:31:17 +0000 |
commit | f51841ba5237952dda3e76df643d3ae13bed3df5 (patch) | |
tree | 7fd004a107bae55a5f87c4e8bc35b0012334b29b /src/block/aes | |
parent | 34eb8de4ed014ab8913bdb34b096d60880b1c14a (diff) |
Change the BlockCipher interface to support multi-block encryption and
decryption. Currently only used for counter mode. Doesn't offer much
advantage as-is (though might help slightly, in terms of cache effects),
but allows for SIMD implementations to process multiple blocks in parallel
when possible. Particularly thinking here of Serpent; TEA/XTEA also seem
promising in this sense, as is Threefish once that is implemented as a
standalone block cipher.
Diffstat (limited to 'src/block/aes')
-rw-r--r-- | src/block/aes/aes.cpp | 266 | ||||
-rw-r--r-- | src/block/aes/aes.h | 6 |
2 files changed, 142 insertions, 130 deletions
diff --git a/src/block/aes/aes.cpp b/src/block/aes/aes.cpp index 9072b507b..34698ae7f 100644 --- a/src/block/aes/aes.cpp +++ b/src/block/aes/aes.cpp @@ -1,6 +1,6 @@ /** * AES -* (C) 1999-2007 Jack Lloyd +* (C) 1999-2009 Jack Lloyd * * Distributed under the terms of the Botan license */ @@ -13,163 +13,175 @@ namespace Botan { /** * AES Encryption */ -void AES::enc(const byte in[], byte out[]) const +void AES::encrypt_n(const byte in[], byte out[], u32bit blocks) const { const u32bit* TE0 = TE; const u32bit* TE1 = TE + 256; const u32bit* TE2 = TE + 512; const u32bit* TE3 = TE + 768; - u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0]; - u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1]; - u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2]; - u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3]; - - u32bit B0, B1, B2, B3; - B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^ - TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4]; - B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^ - TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5]; - B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^ - TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6]; - B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^ - TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7]; - - for(u32bit j = 2; j != ROUNDS; j += 2) + for(u32bit i = 0; i != blocks; ++i) { - const u32bit K0 = EK[4*j]; - const u32bit K1 = EK[4*j+1]; - const u32bit K2 = EK[4*j+2]; - const u32bit K3 = EK[4*j+3]; - - T0 = TE0[get_byte(0, B0)] ^ TE1[get_byte(1, B1)] ^ - TE2[get_byte(2, B2)] ^ TE3[get_byte(3, B3)] ^ K0; - T1 = TE0[get_byte(0, B1)] ^ TE1[get_byte(1, B2)] ^ - TE2[get_byte(2, B3)] ^ TE3[get_byte(3, B0)] ^ K1; - T2 = TE0[get_byte(0, B2)] ^ TE1[get_byte(1, B3)] ^ - TE2[get_byte(2, B0)] ^ TE3[get_byte(3, B1)] ^ K2; - T3 = TE0[get_byte(0, B3)] ^ TE1[get_byte(1, B0)] ^ - TE2[get_byte(2, B1)] ^ TE3[get_byte(3, B2)] ^ K3; - - const u32bit K4 = EK[4*(j+1)+0]; - const u32bit K5 = EK[4*(j+1)+1]; - const u32bit K6 = EK[4*(j+1)+2]; - const u32bit K7 = EK[4*(j+1)+3]; + u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0]; + u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1]; + u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2]; + u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3]; + u32bit B0, B1, B2, B3; B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^ - TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ K4; + TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ EK[4]; B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^ - TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ K5; + TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ EK[5]; B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^ - TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ K6; + TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ EK[6]; B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^ - TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ K7; - } + TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ EK[7]; + + for(u32bit j = 2; j != ROUNDS; j += 2) + { + const u32bit K0 = EK[4*j]; + const u32bit K1 = EK[4*j+1]; + const u32bit K2 = EK[4*j+2]; + const u32bit K3 = EK[4*j+3]; + + T0 = TE0[get_byte(0, B0)] ^ TE1[get_byte(1, B1)] ^ + TE2[get_byte(2, B2)] ^ TE3[get_byte(3, B3)] ^ K0; + T1 = TE0[get_byte(0, B1)] ^ TE1[get_byte(1, B2)] ^ + TE2[get_byte(2, B3)] ^ TE3[get_byte(3, B0)] ^ K1; + T2 = TE0[get_byte(0, B2)] ^ TE1[get_byte(1, B3)] ^ + TE2[get_byte(2, B0)] ^ TE3[get_byte(3, B1)] ^ K2; + T3 = TE0[get_byte(0, B3)] ^ TE1[get_byte(1, B0)] ^ + TE2[get_byte(2, B1)] ^ TE3[get_byte(3, B2)] ^ K3; + + const u32bit K4 = EK[4*(j+1)+0]; + const u32bit K5 = EK[4*(j+1)+1]; + const u32bit K6 = EK[4*(j+1)+2]; + const u32bit K7 = EK[4*(j+1)+3]; - /* - Joseph Bonneau and Ilya Mironov's paper - <a href = "http://icme2007.org/users/mironov/papers/aes-timing.pdf"> - Cache-Collision Timing Attacks Against AES</a> describes an attack - that can recover AES keys with as few as 2<sup>13</sup> samples. - - """In addition to OpenSSL v. 0.9.8.(a), which was used in our - experiments, the AES implementations of Crypto++ 5.2.1 and - LibTomCrypt 1.09 use the original Rijndael C implementation with - very few changes and are highly vulnerable. The AES implementations - in libgcrypt v. 1.2.2 and Botan v. 1.4.2 are also vulnerable, but - use a smaller byte-wide final table which lessens the effectiveness - of the attacks.""" - */ - out[ 0] = SE[get_byte(0, B0)] ^ ME[0]; - out[ 1] = SE[get_byte(1, B1)] ^ ME[1]; - out[ 2] = SE[get_byte(2, B2)] ^ ME[2]; - out[ 3] = SE[get_byte(3, B3)] ^ ME[3]; - out[ 4] = SE[get_byte(0, B1)] ^ ME[4]; - out[ 5] = SE[get_byte(1, B2)] ^ ME[5]; - out[ 6] = SE[get_byte(2, B3)] ^ ME[6]; - out[ 7] = SE[get_byte(3, B0)] ^ ME[7]; - out[ 8] = SE[get_byte(0, B2)] ^ ME[8]; - out[ 9] = SE[get_byte(1, B3)] ^ ME[9]; - out[10] = SE[get_byte(2, B0)] ^ ME[10]; - out[11] = SE[get_byte(3, B1)] ^ ME[11]; - out[12] = SE[get_byte(0, B3)] ^ ME[12]; - out[13] = SE[get_byte(1, B0)] ^ ME[13]; - out[14] = SE[get_byte(2, B1)] ^ ME[14]; - out[15] = SE[get_byte(3, B2)] ^ ME[15]; + B0 = TE0[get_byte(0, T0)] ^ TE1[get_byte(1, T1)] ^ + TE2[get_byte(2, T2)] ^ TE3[get_byte(3, T3)] ^ K4; + B1 = TE0[get_byte(0, T1)] ^ TE1[get_byte(1, T2)] ^ + TE2[get_byte(2, T3)] ^ TE3[get_byte(3, T0)] ^ K5; + B2 = TE0[get_byte(0, T2)] ^ TE1[get_byte(1, T3)] ^ + TE2[get_byte(2, T0)] ^ TE3[get_byte(3, T1)] ^ K6; + B3 = TE0[get_byte(0, T3)] ^ TE1[get_byte(1, T0)] ^ + TE2[get_byte(2, T1)] ^ TE3[get_byte(3, T2)] ^ K7; + } + + /* + Joseph Bonneau and Ilya Mironov's paper + <a href = "http://icme2007.org/users/mironov/papers/aes-timing.pdf"> + Cache-Collision Timing Attacks Against AES</a> describes an attack + that can recover AES keys with as few as 2<sup>13</sup> samples. + + """In addition to OpenSSL v. 0.9.8.(a), which was used in our + experiments, the AES implementations of Crypto++ 5.2.1 and + LibTomCrypt 1.09 use the original Rijndael C implementation with + very few changes and are highly vulnerable. The AES implementations + in libgcrypt v. 1.2.2 and Botan v. 1.4.2 are also vulnerable, but + use a smaller byte-wide final table which lessens the effectiveness + of the attacks.""" + */ + out[ 0] = SE[get_byte(0, B0)] ^ ME[0]; + out[ 1] = SE[get_byte(1, B1)] ^ ME[1]; + out[ 2] = SE[get_byte(2, B2)] ^ ME[2]; + out[ 3] = SE[get_byte(3, B3)] ^ ME[3]; + out[ 4] = SE[get_byte(0, B1)] ^ ME[4]; + out[ 5] = SE[get_byte(1, B2)] ^ ME[5]; + out[ 6] = SE[get_byte(2, B3)] ^ ME[6]; + out[ 7] = SE[get_byte(3, B0)] ^ ME[7]; + out[ 8] = SE[get_byte(0, B2)] ^ ME[8]; + out[ 9] = SE[get_byte(1, B3)] ^ ME[9]; + out[10] = SE[get_byte(2, B0)] ^ ME[10]; + out[11] = SE[get_byte(3, B1)] ^ ME[11]; + out[12] = SE[get_byte(0, B3)] ^ ME[12]; + out[13] = SE[get_byte(1, B0)] ^ ME[13]; + out[14] = SE[get_byte(2, B1)] ^ ME[14]; + out[15] = SE[get_byte(3, B2)] ^ ME[15]; + + in += BLOCK_SIZE; + out += BLOCK_SIZE; + } } /** * AES Decryption */ -void AES::dec(const byte in[], byte out[]) const +void AES::decrypt_n(const byte in[], byte out[], u32bit blocks) const { const u32bit* TD0 = TD; const u32bit* TD1 = TD + 256; const u32bit* TD2 = TD + 512; const u32bit* TD3 = TD + 768; - u32bit T0 = load_be<u32bit>(in, 0) ^ DK[0]; - u32bit T1 = load_be<u32bit>(in, 1) ^ DK[1]; - u32bit T2 = load_be<u32bit>(in, 2) ^ DK[2]; - u32bit T3 = load_be<u32bit>(in, 3) ^ DK[3]; - - u32bit B0, B1, B2, B3; - B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^ - TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ DK[4]; - B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^ - TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ DK[5]; - B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^ - TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ DK[6]; - B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^ - TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ DK[7]; - - for(u32bit j = 2; j != ROUNDS; j += 2) + for(u32bit i = 0; i != blocks; ++i) { - const u32bit K0 = DK[4*j+0]; - const u32bit K1 = DK[4*j+1]; - const u32bit K2 = DK[4*j+2]; - const u32bit K3 = DK[4*j+3]; - - T0 = TD0[get_byte(0, B0)] ^ TD1[get_byte(1, B3)] ^ - TD2[get_byte(2, B2)] ^ TD3[get_byte(3, B1)] ^ K0; - T1 = TD0[get_byte(0, B1)] ^ TD1[get_byte(1, B0)] ^ - TD2[get_byte(2, B3)] ^ TD3[get_byte(3, B2)] ^ K1; - T2 = TD0[get_byte(0, B2)] ^ TD1[get_byte(1, B1)] ^ - TD2[get_byte(2, B0)] ^ TD3[get_byte(3, B3)] ^ K2; - T3 = TD0[get_byte(0, B3)] ^ TD1[get_byte(1, B2)] ^ - TD2[get_byte(2, B1)] ^ TD3[get_byte(3, B0)] ^ K3; - - const u32bit K4 = DK[4*(j+1)+0]; - const u32bit K5 = DK[4*(j+1)+1]; - const u32bit K6 = DK[4*(j+1)+2]; - const u32bit K7 = DK[4*(j+1)+3]; + u32bit T0 = load_be<u32bit>(in, 0) ^ DK[0]; + u32bit T1 = load_be<u32bit>(in, 1) ^ DK[1]; + u32bit T2 = load_be<u32bit>(in, 2) ^ DK[2]; + u32bit T3 = load_be<u32bit>(in, 3) ^ DK[3]; + u32bit B0, B1, B2, B3; B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^ - TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ K4; + TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ DK[4]; B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^ - TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ K5; + TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ DK[5]; B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^ - TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ K6; + TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ DK[6]; B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^ - TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ K7; - } + TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ DK[7]; + + for(u32bit j = 2; j != ROUNDS; j += 2) + { + const u32bit K0 = DK[4*j+0]; + const u32bit K1 = DK[4*j+1]; + const u32bit K2 = DK[4*j+2]; + const u32bit K3 = DK[4*j+3]; + + T0 = TD0[get_byte(0, B0)] ^ TD1[get_byte(1, B3)] ^ + TD2[get_byte(2, B2)] ^ TD3[get_byte(3, B1)] ^ K0; + T1 = TD0[get_byte(0, B1)] ^ TD1[get_byte(1, B0)] ^ + TD2[get_byte(2, B3)] ^ TD3[get_byte(3, B2)] ^ K1; + T2 = TD0[get_byte(0, B2)] ^ TD1[get_byte(1, B1)] ^ + TD2[get_byte(2, B0)] ^ TD3[get_byte(3, B3)] ^ K2; + T3 = TD0[get_byte(0, B3)] ^ TD1[get_byte(1, B2)] ^ + TD2[get_byte(2, B1)] ^ TD3[get_byte(3, B0)] ^ K3; + + const u32bit K4 = DK[4*(j+1)+0]; + const u32bit K5 = DK[4*(j+1)+1]; + const u32bit K6 = DK[4*(j+1)+2]; + const u32bit K7 = DK[4*(j+1)+3]; - out[ 0] = SD[get_byte(0, B0)] ^ MD[0]; - out[ 1] = SD[get_byte(1, B3)] ^ MD[1]; - out[ 2] = SD[get_byte(2, B2)] ^ MD[2]; - out[ 3] = SD[get_byte(3, B1)] ^ MD[3]; - out[ 4] = SD[get_byte(0, B1)] ^ MD[4]; - out[ 5] = SD[get_byte(1, B0)] ^ MD[5]; - out[ 6] = SD[get_byte(2, B3)] ^ MD[6]; - out[ 7] = SD[get_byte(3, B2)] ^ MD[7]; - out[ 8] = SD[get_byte(0, B2)] ^ MD[8]; - out[ 9] = SD[get_byte(1, B1)] ^ MD[9]; - out[10] = SD[get_byte(2, B0)] ^ MD[10]; - out[11] = SD[get_byte(3, B3)] ^ MD[11]; - out[12] = SD[get_byte(0, B3)] ^ MD[12]; - out[13] = SD[get_byte(1, B2)] ^ MD[13]; - out[14] = SD[get_byte(2, B1)] ^ MD[14]; - out[15] = SD[get_byte(3, B0)] ^ MD[15]; + B0 = TD0[get_byte(0, T0)] ^ TD1[get_byte(1, T3)] ^ + TD2[get_byte(2, T2)] ^ TD3[get_byte(3, T1)] ^ K4; + B1 = TD0[get_byte(0, T1)] ^ TD1[get_byte(1, T0)] ^ + TD2[get_byte(2, T3)] ^ TD3[get_byte(3, T2)] ^ K5; + B2 = TD0[get_byte(0, T2)] ^ TD1[get_byte(1, T1)] ^ + TD2[get_byte(2, T0)] ^ TD3[get_byte(3, T3)] ^ K6; + B3 = TD0[get_byte(0, T3)] ^ TD1[get_byte(1, T2)] ^ + TD2[get_byte(2, T1)] ^ TD3[get_byte(3, T0)] ^ K7; + } + + out[ 0] = SD[get_byte(0, B0)] ^ MD[0]; + out[ 1] = SD[get_byte(1, B3)] ^ MD[1]; + out[ 2] = SD[get_byte(2, B2)] ^ MD[2]; + out[ 3] = SD[get_byte(3, B1)] ^ MD[3]; + out[ 4] = SD[get_byte(0, B1)] ^ MD[4]; + out[ 5] = SD[get_byte(1, B0)] ^ MD[5]; + out[ 6] = SD[get_byte(2, B3)] ^ MD[6]; + out[ 7] = SD[get_byte(3, B2)] ^ MD[7]; + out[ 8] = SD[get_byte(0, B2)] ^ MD[8]; + out[ 9] = SD[get_byte(1, B1)] ^ MD[9]; + out[10] = SD[get_byte(2, B0)] ^ MD[10]; + out[11] = SD[get_byte(3, B3)] ^ MD[11]; + out[12] = SD[get_byte(0, B3)] ^ MD[12]; + out[13] = SD[get_byte(1, B2)] ^ MD[13]; + out[14] = SD[get_byte(2, B1)] ^ MD[14]; + out[15] = SD[get_byte(3, B0)] ^ MD[15]; + + in += BLOCK_SIZE; + out += BLOCK_SIZE; + } } /** diff --git a/src/block/aes/aes.h b/src/block/aes/aes.h index 05e2e3123..940e11a48 100644 --- a/src/block/aes/aes.h +++ b/src/block/aes/aes.h @@ -1,6 +1,6 @@ /** * AES -* (C) 1999-2007 Jack Lloyd +* (C) 1999-2009 Jack Lloyd * * Distributed under the terms of the Botan license */ @@ -24,8 +24,8 @@ class BOTAN_DLL AES : public BlockCipher AES() : BlockCipher(16, 16, 32, 8) { ROUNDS = 14; } AES(u32bit); private: - void enc(const byte[], byte[]) const; - void dec(const byte[], byte[]) const; + void encrypt_n(const byte in[], byte out[], u32bit blocks) const; + void decrypt_n(const byte in[], byte out[], u32bit blocks) const; void key_schedule(const byte[], u32bit); static u32bit S(u32bit); |