author | Jack Lloyd <[email protected]> | 2017-10-13 21:08:31 -0400
committer | Jack Lloyd <[email protected]> | 2017-10-13 21:08:31 -0400
commit | 93a902e65466957bdb4210cf84d7f704658ed16f (patch)
tree | 0a0e114b2d3d95fcd9c83b5e84fd82cde4327deb /src/lib/block/aes
parent | 69706d9a4cbca0f502d6a810e1e1cbda280367a7 (diff)
Reduce AES to using a single T-table
Should have significantly better cache characteristics, though it
would be nice to verify this.
It reduces performance somewhat but less than I expected, at least
on Skylake. I need to check this across more platforms to make sure
it won't hurt too badly.
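
The change in miniature: the classic four T-tables TE0..TE3 are byte-rotations of one another, so a single 256-entry table plus in-code rotations computes the same round function. A standalone sketch of the single-table round column (rotr32 and byte_n are illustrative stand-ins for Botan's rotr<N>() and get_byte(), not the library's API; byte 0 is taken as the most significant, matching Botan's convention):

    #include <cstdint>

    // Rotate right; n must be in 1..31 here
    static inline uint32_t rotr32(uint32_t x, unsigned n)
       { return (x >> n) | (x << (32 - n)); }

    // Extract byte n of x, byte 0 being the most significant
    static inline uint8_t byte_n(unsigned n, uint32_t x)
       { return static_cast<uint8_t>(x >> (24 - 8*n)); }

    // One 32-bit column of an AES round from a single 256-word T-table.
    // Equivalent to TE0[a] ^ TE1[b] ^ TE2[c] ^ TE3[d] ^ rk, because
    // TEn is just TE0 rotated right by 8*n bits.
    static inline uint32_t aes_round_col(const uint32_t TE[256], uint32_t rk,
                                         uint32_t a, uint32_t b,
                                         uint32_t c, uint32_t d)
       {
       return rk ^ TE[byte_n(0, a)]
                 ^ rotr32(TE[byte_n(1, b)],  8)
                 ^ rotr32(TE[byte_n(2, c)], 16)
                 ^ rotr32(TE[byte_n(3, d)], 24);
       }

A 256-word table is 1 KiB, or 16 lines at a typical 64-byte cache line, versus 4 KiB and 64 lines for the four-table layout; the extra rotations trade a few cycles for that smaller key-dependent footprint.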
Diffstat (limited to 'src/lib/block/aes')
-rw-r--r-- | src/lib/block/aes/aes.cpp | 205
1 file changed, 78 insertions, 127 deletions
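
One small but important detail in the diff below: the loop that touches every cache line of the T-table before any key-dependent lookup now accumulates into a volatile variable; without that, a compiler may delete the apparently-dead loads and silently disable the countermeasure. A minimal sketch of the prefetch idiom (the 64-byte default is an assumption for illustration; the real code queries CPUID::cache_line_size() at runtime):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Load one word from every cache line of T so the whole table is
    // cache-resident before secret-dependent indexing begins.
    uint32_t prefetch_table(const std::vector<uint32_t>& T,
                            size_t cache_line_size = 64)
       {
       // volatile: the loads must not be optimized away as dead code
       volatile uint32_t Z = 0;
       for(size_t i = 0; i < T.size(); i += cache_line_size / sizeof(uint32_t))
          Z |= T[i];
       // The caller XORs the result into the state; code elided from the
       // hunks below ensures the value is zero in a data-independent way,
       // so the cipher output is unaffected.
       return Z;
       }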
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp
index 1893ab4a0..0c78852d4 100644
--- a/src/lib/block/aes/aes.cpp
+++ b/src/lib/block/aes/aes.cpp
@@ -1,6 +1,6 @@
 /*
 * AES
-* (C) 1999-2010,2015 Jack Lloyd
+* (C) 1999-2010,2015,2017 Jack Lloyd
 *
 * Based on the public domain reference implementation by Paulo Baretto
 *
@@ -16,7 +16,9 @@
 * vulnerable to timing and cache based side channel attacks. Some
 * countermeasures are used which may be helpful in some situations:
 *
-* - Small tables are used in the first and last rounds.
+* - Only a single 256-word T-table is used, with rotations applied.
+*   Most implementations use 4 T-tables which leaks much more
+*   information via cache usage.
 *
 * - The TE and TD tables are computed at runtime to avoid flush+reload
 *   attacks using clflush. As different processes will not share the
@@ -104,19 +106,22 @@ inline uint8_t xtime11(uint8_t s) { return xtime8(s) ^ xtime(s) ^ s; }
 inline uint8_t xtime13(uint8_t s) { return xtime8(s) ^ xtime4(s) ^ s; }
 inline uint8_t xtime14(uint8_t s) { return xtime8(s) ^ xtime4(s) ^ xtime(s); }
 
+inline uint32_t SE_word(uint32_t x)
+   {
+   return make_uint32(SE[get_byte(0, x)],
+                      SE[get_byte(1, x)],
+                      SE[get_byte(2, x)],
+                      SE[get_byte(3, x)]);
+   }
+
 const std::vector<uint32_t>& AES_TE()
    {
    auto compute_TE = []() -> std::vector<uint32_t> {
-      std::vector<uint32_t> TE(1024);
+      std::vector<uint32_t> TE(256);
       for(size_t i = 0; i != 256; ++i)
          {
         const uint8_t s = SE[i];
-         const uint32_t x = make_uint32(xtime(s), s, s, xtime3(s));
-
-         TE[i] = x;
-         TE[i+256] = rotr< 8>(x);
-         TE[i+512] = rotr<16>(x);
-         TE[i+768] = rotr<24>(x);
+         TE[i] = make_uint32(xtime(s), s, s, xtime3(s));
         }
      return TE;
      };
@@ -128,16 +133,11 @@
 const std::vector<uint32_t>& AES_TD()
    {
    auto compute_TD = []() -> std::vector<uint32_t> {
-      std::vector<uint32_t> TD(1024);
+      std::vector<uint32_t> TD(256);
       for(size_t i = 0; i != 256; ++i)
          {
         const uint8_t s = SD[i];
-         const uint32_t x = make_uint32(xtime14(s), xtime9(s), xtime13(s), xtime11(s));
-
-         TD[i] = x;
-         TD[i+256] = rotr< 8>(x);
-         TD[i+512] = rotr<16>(x);
-         TD[i+768] = rotr<24>(x);
+         TD[i] = make_uint32(xtime14(s), xtime9(s), xtime13(s), xtime11(s));
         }
      return TD;
      };
@@ -145,6 +145,12 @@ const std::vector<uint32_t>& AES_TD()
    return TD;
    }
 
+#define AES_T(T, K, V0, V1, V2, V3)        \
+   (K ^ T[get_byte(0, V0)] ^               \
+    rotr< 8>(T[get_byte(1, V1)]) ^         \
+    rotr<16>(T[get_byte(2, V2)]) ^         \
+    rotr<24>(T[get_byte(3, V3)]))
+
 /*
 * AES Encryption
 */
@@ -160,7 +166,7 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[],
    const std::vector<uint32_t>& TE = AES_TE();
 
    // Hit every cache line of TE
-   uint32_t Z = 0;
+   volatile uint32_t Z = 0;
    for(size_t i = 0; i < TE.size(); i += cache_line_size / sizeof(uint32_t))
       {
       Z |= TE[i];
@@ -179,71 +185,44 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[],
 
       T0 ^= Z;
 
-      /* Use only the first 256 entries of the TE table and do the
-      * rotations directly in the code. This reduces the number of
-      * cache lines potentially used in the first round from 64 to 16
-      * (assuming a typical 64 byte cache line), which makes timing
-      * attacks a little harder; the first round is particularly
-      * vulnerable.
-      */
-
-      uint32_t B0 = TE[get_byte(0, T0)] ^
-                    rotr< 8>(TE[get_byte(1, T1)]) ^
-                    rotr<16>(TE[get_byte(2, T2)]) ^
-                    rotr<24>(TE[get_byte(3, T3)]) ^ EK[4];
-
-      uint32_t B1 = TE[get_byte(0, T1)] ^
-                    rotr< 8>(TE[get_byte(1, T2)]) ^
-                    rotr<16>(TE[get_byte(2, T3)]) ^
-                    rotr<24>(TE[get_byte(3, T0)]) ^ EK[5];
-
-      uint32_t B2 = TE[get_byte(0, T2)] ^
-                    rotr< 8>(TE[get_byte(1, T3)]) ^
-                    rotr<16>(TE[get_byte(2, T0)]) ^
-                    rotr<24>(TE[get_byte(3, T1)]) ^ EK[6];
-
-      uint32_t B3 = TE[get_byte(0, T3)] ^
-                    rotr< 8>(TE[get_byte(1, T0)]) ^
-                    rotr<16>(TE[get_byte(2, T1)]) ^
-                    rotr<24>(TE[get_byte(3, T2)]) ^ EK[7];
+      uint32_t B0 = AES_T(TE, EK[4], T0, T1, T2, T3);
+      uint32_t B1 = AES_T(TE, EK[5], T1, T2, T3, T0);
+      uint32_t B2 = AES_T(TE, EK[6], T2, T3, T0, T1);
+      uint32_t B3 = AES_T(TE, EK[7], T3, T0, T1, T2);
 
       for(size_t r = 2*4; r < EK.size(); r += 2*4)
          {
-         T0 = EK[r  ] ^ TE[get_byte(0, B0)      ] ^ TE[get_byte(1, B1) + 256] ^
-                        TE[get_byte(2, B2) + 512] ^ TE[get_byte(3, B3) + 768];
-         T1 = EK[r+1] ^ TE[get_byte(0, B1)      ] ^ TE[get_byte(1, B2) + 256] ^
-                        TE[get_byte(2, B3) + 512] ^ TE[get_byte(3, B0) + 768];
-         T2 = EK[r+2] ^ TE[get_byte(0, B2)      ] ^ TE[get_byte(1, B3) + 256] ^
-                        TE[get_byte(2, B0) + 512] ^ TE[get_byte(3, B1) + 768];
-         T3 = EK[r+3] ^ TE[get_byte(0, B3)      ] ^ TE[get_byte(1, B0) + 256] ^
-                        TE[get_byte(2, B1) + 512] ^ TE[get_byte(3, B2) + 768];
-
-         B0 = EK[r+4] ^ TE[get_byte(0, T0)      ] ^ TE[get_byte(1, T1) + 256] ^
-                        TE[get_byte(2, T2) + 512] ^ TE[get_byte(3, T3) + 768];
-         B1 = EK[r+5] ^ TE[get_byte(0, T1)      ] ^ TE[get_byte(1, T2) + 256] ^
-                        TE[get_byte(2, T3) + 512] ^ TE[get_byte(3, T0) + 768];
-         B2 = EK[r+6] ^ TE[get_byte(0, T2)      ] ^ TE[get_byte(1, T3) + 256] ^
-                        TE[get_byte(2, T0) + 512] ^ TE[get_byte(3, T1) + 768];
-         B3 = EK[r+7] ^ TE[get_byte(0, T3)      ] ^ TE[get_byte(1, T0) + 256] ^
-                        TE[get_byte(2, T1) + 512] ^ TE[get_byte(3, T2) + 768];
+         T0 = AES_T(TE, EK[r  ], B0, B1, B2, B3);
+         T1 = AES_T(TE, EK[r+1], B1, B2, B3, B0);
+         T2 = AES_T(TE, EK[r+2], B2, B3, B0, B1);
+         T3 = AES_T(TE, EK[r+3], B3, B0, B1, B2);
+
+         B0 = AES_T(TE, EK[r+4], T0, T1, T2, T3);
+         B1 = AES_T(TE, EK[r+5], T1, T2, T3, T0);
+         B2 = AES_T(TE, EK[r+6], T2, T3, T0, T1);
+         B3 = AES_T(TE, EK[r+7], T3, T0, T1, T2);
         }
 
-      out[16*i+ 0] = SE[get_byte(0, B0)] ^ ME[0];
-      out[16*i+ 1] = SE[get_byte(1, B1)] ^ ME[1];
-      out[16*i+ 2] = SE[get_byte(2, B2)] ^ ME[2];
-      out[16*i+ 3] = SE[get_byte(3, B3)] ^ ME[3];
-      out[16*i+ 4] = SE[get_byte(0, B1)] ^ ME[4];
-      out[16*i+ 5] = SE[get_byte(1, B2)] ^ ME[5];
-      out[16*i+ 6] = SE[get_byte(2, B3)] ^ ME[6];
-      out[16*i+ 7] = SE[get_byte(3, B0)] ^ ME[7];
-      out[16*i+ 8] = SE[get_byte(0, B2)] ^ ME[8];
-      out[16*i+ 9] = SE[get_byte(1, B3)] ^ ME[9];
-      out[16*i+10] = SE[get_byte(2, B0)] ^ ME[10];
-      out[16*i+11] = SE[get_byte(3, B1)] ^ ME[11];
-      out[16*i+12] = SE[get_byte(0, B3)] ^ ME[12];
-      out[16*i+13] = SE[get_byte(1, B0)] ^ ME[13];
-      out[16*i+14] = SE[get_byte(2, B1)] ^ ME[14];
-      out[16*i+15] = SE[get_byte(3, B2)] ^ ME[15];
+      /*
+      * Use TE[x] >> 8 instead of SE[] so encryption only references a single
+      * lookup table.
+      */
+      out[16*i+ 0] = static_cast<uint8_t>(TE[get_byte(0, B0)] >> 8) ^ ME[0];
+      out[16*i+ 1] = static_cast<uint8_t>(TE[get_byte(1, B1)] >> 8) ^ ME[1];
+      out[16*i+ 2] = static_cast<uint8_t>(TE[get_byte(2, B2)] >> 8) ^ ME[2];
+      out[16*i+ 3] = static_cast<uint8_t>(TE[get_byte(3, B3)] >> 8) ^ ME[3];
+      out[16*i+ 4] = static_cast<uint8_t>(TE[get_byte(0, B1)] >> 8) ^ ME[4];
+      out[16*i+ 5] = static_cast<uint8_t>(TE[get_byte(1, B2)] >> 8) ^ ME[5];
+      out[16*i+ 6] = static_cast<uint8_t>(TE[get_byte(2, B3)] >> 8) ^ ME[6];
+      out[16*i+ 7] = static_cast<uint8_t>(TE[get_byte(3, B0)] >> 8) ^ ME[7];
+      out[16*i+ 8] = static_cast<uint8_t>(TE[get_byte(0, B2)] >> 8) ^ ME[8];
+      out[16*i+ 9] = static_cast<uint8_t>(TE[get_byte(1, B3)] >> 8) ^ ME[9];
+      out[16*i+10] = static_cast<uint8_t>(TE[get_byte(2, B0)] >> 8) ^ ME[10];
+      out[16*i+11] = static_cast<uint8_t>(TE[get_byte(3, B1)] >> 8) ^ ME[11];
+      out[16*i+12] = static_cast<uint8_t>(TE[get_byte(0, B3)] >> 8) ^ ME[12];
+      out[16*i+13] = static_cast<uint8_t>(TE[get_byte(1, B0)] >> 8) ^ ME[13];
+      out[16*i+14] = static_cast<uint8_t>(TE[get_byte(2, B1)] >> 8) ^ ME[14];
+      out[16*i+15] = static_cast<uint8_t>(TE[get_byte(3, B2)] >> 8) ^ ME[15];
      }
   }
@@ -259,7 +238,7 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks,
    const size_t cache_line_size = CPUID::cache_line_size();
    const std::vector<uint32_t>& TD = AES_TD();
 
-   uint32_t Z = 0;
+   volatile uint32_t Z = 0;
    for(size_t i = 0; i < TD.size(); i += cache_line_size / sizeof(uint32_t))
       {
       Z |= TD[i];
@@ -275,45 +254,22 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks,
 
       T0 ^= Z;
 
-      uint32_t B0 = TD[get_byte(0, T0)] ^
-                    rotr< 8>(TD[get_byte(1, T3)]) ^
-                    rotr<16>(TD[get_byte(2, T2)]) ^
-                    rotr<24>(TD[get_byte(3, T1)]) ^ DK[4];
-
-      uint32_t B1 = TD[get_byte(0, T1)] ^
-                    rotr< 8>(TD[get_byte(1, T0)]) ^
-                    rotr<16>(TD[get_byte(2, T3)]) ^
-                    rotr<24>(TD[get_byte(3, T2)]) ^ DK[5];
-
-      uint32_t B2 = TD[get_byte(0, T2)] ^
-                    rotr< 8>(TD[get_byte(1, T1)]) ^
-                    rotr<16>(TD[get_byte(2, T0)]) ^
-                    rotr<24>(TD[get_byte(3, T3)]) ^ DK[6];
-
-      uint32_t B3 = TD[get_byte(0, T3)] ^
-                    rotr< 8>(TD[get_byte(1, T2)]) ^
-                    rotr<16>(TD[get_byte(2, T1)]) ^
-                    rotr<24>(TD[get_byte(3, T0)]) ^ DK[7];
+      uint32_t B0 = AES_T(TD, DK[4], T0, T3, T2, T1);
+      uint32_t B1 = AES_T(TD, DK[5], T1, T0, T3, T2);
+      uint32_t B2 = AES_T(TD, DK[6], T2, T1, T0, T3);
+      uint32_t B3 = AES_T(TD, DK[7], T3, T2, T1, T0);
 
       for(size_t r = 2*4; r < DK.size(); r += 2*4)
          {
-         T0 = DK[r  ] ^ TD[get_byte(0, B0)      ] ^ TD[get_byte(1, B3) + 256] ^
-                        TD[get_byte(2, B2) + 512] ^ TD[get_byte(3, B1) + 768];
-         T1 = DK[r+1] ^ TD[get_byte(0, B1)      ] ^ TD[get_byte(1, B0) + 256] ^
-                        TD[get_byte(2, B3) + 512] ^ TD[get_byte(3, B2) + 768];
-         T2 = DK[r+2] ^ TD[get_byte(0, B2)      ] ^ TD[get_byte(1, B1) + 256] ^
-                        TD[get_byte(2, B0) + 512] ^ TD[get_byte(3, B3) + 768];
-         T3 = DK[r+3] ^ TD[get_byte(0, B3)      ] ^ TD[get_byte(1, B2) + 256] ^
-                        TD[get_byte(2, B1) + 512] ^ TD[get_byte(3, B0) + 768];
-
-         B0 = DK[r+4] ^ TD[get_byte(0, T0)      ] ^ TD[get_byte(1, T3) + 256] ^
-                        TD[get_byte(2, T2) + 512] ^ TD[get_byte(3, T1) + 768];
-         B1 = DK[r+5] ^ TD[get_byte(0, T1)      ] ^ TD[get_byte(1, T0) + 256] ^
-                        TD[get_byte(2, T3) + 512] ^ TD[get_byte(3, T2) + 768];
-         B2 = DK[r+6] ^ TD[get_byte(0, T2)      ] ^ TD[get_byte(1, T1) + 256] ^
-                        TD[get_byte(2, T0) + 512] ^ TD[get_byte(3, T3) + 768];
-         B3 = DK[r+7] ^ TD[get_byte(0, T3)      ] ^ TD[get_byte(1, T2) + 256] ^
-                        TD[get_byte(2, T1) + 512] ^ TD[get_byte(3, T0) + 768];
+         T0 = AES_T(TD, DK[r  ], B0, B3, B2, B1);
+         T1 = AES_T(TD, DK[r+1], B1, B0, B3, B2);
+         T2 = AES_T(TD, DK[r+2], B2, B1, B0, B3);
+         T3 = AES_T(TD, DK[r+3], B3, B2, B1, B0);
+
+         B0 = AES_T(TD, DK[r+4], T0, T3, T2, T1);
+         B1 = AES_T(TD, DK[r+5], T1, T0, T3, T2);
+         B2 = AES_T(TD, DK[r+6], T2, T1, T0, T3);
+         B3 = AES_T(TD, DK[r+7], T3, T2, T1, T0);
         }
 
       out[ 0] = SD[get_byte(0, B0)] ^ MD[0];
@@ -363,21 +319,14 @@ void aes_key_schedule(const uint8_t key[], size_t length,
 
    for(size_t i = X; i < 4*(rounds+1); i += X)
       {
-      XEK[i] = XEK[i-X] ^ RC[(i-X)/X] ^
-               make_uint32(SE[get_byte(1, XEK[i-1])],
-                           SE[get_byte(2, XEK[i-1])],
-                           SE[get_byte(3, XEK[i-1])],
-                           SE[get_byte(0, XEK[i-1])]);
+      XEK[i] = XEK[i-X] ^ RC[(i-X)/X] ^ SE_word(rotl<8>(XEK[i-1]));
 
       for(size_t j = 1; j != X; ++j)
          {
         XEK[i+j] = XEK[i+j-X];
 
         if(X == 8 && j == 4)
-            XEK[i+j] ^= make_uint32(SE[get_byte(0, XEK[i+j-1])],
-                                    SE[get_byte(1, XEK[i+j-1])],
-                                    SE[get_byte(2, XEK[i+j-1])],
-                                    SE[get_byte(3, XEK[i+j-1])]);
+            XEK[i+j] ^= SE_word(XEK[i+j-1]);
         else
            XEK[i+j] ^= XEK[i+j-1];
         }
@@ -394,10 +343,10 @@ void aes_key_schedule(const uint8_t key[], size_t length,
      }
 
    for(size_t i = 4; i != length + 24; ++i)
-      XDK[i] = TD[SE[get_byte(0, XDK[i])] +   0] ^
-               TD[SE[get_byte(1, XDK[i])] + 256] ^
-               TD[SE[get_byte(2, XDK[i])] + 512] ^
-               TD[SE[get_byte(3, XDK[i])] + 768];
+      {
+      XDK[i] = SE_word(XDK[i]);
+      XDK[i] = AES_T(TD, 0, XDK[i], XDK[i], XDK[i], XDK[i]);
+      }
 
    ME.resize(16);
    MD.resize(16);
@@ -427,6 +376,8 @@
    }
 
+#undef AES_T
+
 size_t aes_parallelism()
    {
 #if defined(BOTAN_HAS_AES_NI)
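
A closing note on the last-round trick in the encryption path: each TE entry packs the bytes [xtime(s), s, s, xtime3(s)] from most to least significant, so the plain S-box output s sits at bits 15..8 and static_cast<uint8_t>(TE[x] >> 8) equals SE[x], letting the final round reuse the one T-table instead of touching SE. A self-contained check of that identity (xtime/xtime3 and the byte packing mirror the diff; make_u32 is a local stand-in for Botan's big-endian make_uint32):

    #include <cassert>
    #include <cstdint>

    // GF(2^8) doubling modulo x^8 + x^4 + x^3 + x + 1
    static inline uint8_t xtime(uint8_t s)
       { return static_cast<uint8_t>(s << 1) ^ ((s >> 7) * 0x1B); }
    static inline uint8_t xtime3(uint8_t s) { return xtime(s) ^ s; }

    // First argument becomes the most significant byte (Botan's convention)
    static inline uint32_t make_u32(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
       {
       return (uint32_t(a) << 24) | (uint32_t(b) << 16) |
              (uint32_t(c) <<  8) |  uint32_t(d);
       }

    int main()
       {
       // The identity holds for every byte value, so the real S-box is
       // not needed: s stands for any SE[i].
       for(unsigned v = 0; v != 256; ++v)
          {
          const uint8_t s = static_cast<uint8_t>(v);
          const uint32_t te = make_u32(xtime(s), s, s, xtime3(s));
          assert(static_cast<uint8_t>(te >> 8) == s);
          }
       return 0;
       }

Decryption keeps its separate SD lookup for the final round: a TD entry packs [xtime14(s), xtime9(s), xtime13(s), xtime11(s)], and none of those bytes equals the inverse S-box output itself.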