diff options
-rw-r--r-- | src/lib/block/aes/aes.cpp | 363 | ||||
-rw-r--r-- | src/tests/data/block/aes.vec | 16 |
2 files changed, 202 insertions, 177 deletions
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 64205504f..7b3c9bc37 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -1,9 +1,6 @@ /* -* AES * (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd * -* Based on the public domain reference implementation by Paulo Baretto -* * Botan is released under the Simplified BSD License (see license.txt) */ @@ -15,69 +12,11 @@ #include <botan/internal/ct_utils.h> #include <type_traits> -/* -* This implementation is based on table lookups which are known to be -* vulnerable to timing and cache based side channel attacks. Some -* countermeasures are used which may be helpful in some situations: -* -* - Only a single 256-word T-table is used, with rotations applied. -* Most implementations use 4 (or sometimes 5) T-tables, which leaks -* much more information via cache usage. -* -* - The TE and TD tables are computed at runtime to avoid flush+reload -* attacks using clflush. As different processes will not share the -* same underlying table data, an attacker can't manipulate another -* processes cache lines via their shared reference to the library -* read only segment. (However, prime+probe attacks are still possible.) -* -* - Each cache line of the lookup tables is accessed at the beginning -* of each call to encrypt or decrypt. (See the Z variable below) -* -* If available SSSE3 or AES-NI are used instead of this version, as both -* are faster and immune to side channel attacks. -* -* Some AES cache timing papers for reference: -* -* "Software mitigations to hedge AES against cache-based software side -* channel vulnerabilities" https://eprint.iacr.org/2006/052.pdf -* -* "Cache Games - Bringing Access-Based Cache Attacks on AES to Practice" -* http://www.ieee-security.org/TC/SP2011/PAPERS/2011/paper031.pdf -* -* "Cache-Collision Timing Attacks Against AES" Bonneau, Mironov -* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.88.4753 -*/ - namespace Botan { namespace { alignas(64) -const uint8_t SE[256] = { - 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, - 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, - 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, - 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, - 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, - 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, - 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, - 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, - 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, - 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, - 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, - 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, - 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, - 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, - 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, - 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, - 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, - 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, - 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, - 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, - 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, - 0xB0, 0x54, 0xBB, 0x16 }; - -alignas(64) const uint8_t SD[256] = { 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, @@ -103,7 +42,6 @@ const uint8_t SD[256] = { 0x55, 0x21, 0x0C, 0x7D }; inline constexpr uint8_t xtime(uint8_t s) { return static_cast<uint8_t>(s << 1) ^ ((s >> 7) * 0x1B); } -inline constexpr uint8_t xtime3(uint8_t s) { return xtime(s) ^ s; } inline uint32_t InvMixColumn(uint8_t s1) { @@ -118,8 +56,8 @@ inline uint32_t InvMixColumn(uint8_t s1) } /* -This is a bitsliced AES sbox computation which can execute up -to 32 parallel sbox computations. +This is an AES sbox circuit which can execute in bitsliced mode up to 32x in +parallel. The circuit is from "A depth-16 circuit for the AES S-box" by Boyar and Peralta (https://eprint.iacr.org/2011/332.pdf) @@ -318,64 +256,132 @@ inline uint32_t SE_word(uint32_t x) return x; } -const uint32_t* AES_TE() +inline void bit_transpose(uint32_t B[8]) { - class TE_Table final - { - public: - TE_Table() - { - uint32_t* p = reinterpret_cast<uint32_t*>(&data); - for(size_t i = 0; i != 256; ++i) - { - const uint8_t s = SE[i]; - p[i] = make_uint32(xtime(s), s, s, xtime3(s)); - } - } + swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1); + swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1); + swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1); + swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1); + + swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2); + swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2); + swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2); + swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2); + + swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4); + } - const uint32_t* ptr() const - { - return reinterpret_cast<const uint32_t*>(&data); - } - private: - std::aligned_storage<256*sizeof(uint32_t), 64>::type data; - }; +inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) + { + /* + This is bit_transpose of K[r..r+4] || K[r..r+4], we can save some computation + due to knowing the first and second halves are the same data. + */ + for(size_t i = 0; i != 4; ++i) + B[i] = K[r + i]; - static TE_Table table; - return table.ptr(); + swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1); + swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1); + + swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2); + swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2); + + B[4] = B[0]; + B[5] = B[1]; + B[6] = B[2]; + B[7] = B[3]; + + swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4); + swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4); } -const uint32_t* AES_TD() +inline void shift_rows(uint32_t B[8]) { - class TD_Table final + for(size_t i = 0; i != 8; ++i) { - public: - TD_Table() - { - uint32_t* p = reinterpret_cast<uint32_t*>(&data); - for(size_t i = 0; i != 256; ++i) - { - p[i] = InvMixColumn(SD[i]); - } - } + uint32_t x = B[i]; + // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31 + x = bit_permute_step<uint32_t>(x, 0x00223311, 2); // Butterfly, stage 1 + x = bit_permute_step<uint32_t>(x, 0x00550055, 1); // Butterfly, stage 0 + B[i] = x; + } + } - const uint32_t* ptr() const - { - return reinterpret_cast<const uint32_t*>(&data); - } - private: - std::aligned_storage<256*sizeof(uint32_t), 64>::type data; - }; +inline void mix_columns(uint32_t B[8]) + { + /* + This is equivalent to what T-tables mix columns looks like when you decompose it: + + // carry high bits in B[0] to positions in 0x1b == 0b11011 + const uint32_t X2[8] = { + B[1], + B[2], + B[3], + B[4] ^ B[0], + B[5] ^ B[0], + B[6], + B[7] ^ B[0], + B[0], + }; + for(size_t i = 0; i != 8; i++) + { + const uint32_t X3 = B[i] ^ X2[i]; - static TD_Table table; - return table.ptr(); - } + uint8_t b0 = get_byte(0, X2[i]) ^ get_byte(1, X3) ^ get_byte(2, B[i]) ^ get_byte(3, B[i]); + uint8_t b1 = get_byte(0, B[i]) ^ get_byte(1, X2[i]) ^ get_byte(2, X3) ^ get_byte(3, B[i]); + uint8_t b2 = get_byte(0, B[i]) ^ get_byte(1, B[i]) ^ get_byte(2, X2[i]) ^ get_byte(3, X3); + uint8_t b3 = get_byte(0, X3) ^ get_byte(1, B[i]) ^ get_byte(2, B[i]) ^ get_byte(3, X2[i]); -#define AES_T(T, K, V0, V1, V2, V3) \ - (K ^ T[get_byte(0, V0)] ^ \ - rotr< 8>(T[get_byte(1, V1)]) ^ \ - rotr<16>(T[get_byte(2, V2)]) ^ \ - rotr<24>(T[get_byte(3, V3)])) + B[i] = make_uint32(b0, b1, b2, b3); + } + + Notice that each byte of B[i], X2[i] and X3 is used once in each column, so + we can instead effect the selections by rotations and do the XORs in word units + instead of bytes. Unrolling and expanding the definition of X2 then combining + similar terms results in the expressions below. The end result is very + similar to the MixColumns found in section 4.4 and Appendix A of "Faster and + Timing-Attack Resistant AES-GCM" (https://eprint.iacr.org/2009/129.pdf) except + suited to our word size, and of course we cannot make use of word/byte shuffles + to perform the rotations. + */ + + const uint32_t R24[8] = { + rotr<24>(B[0]), + rotr<24>(B[1]), + rotr<24>(B[2]), + rotr<24>(B[3]), + rotr<24>(B[4]), + rotr<24>(B[5]), + rotr<24>(B[6]), + rotr<24>(B[7]) + }; + + const uint32_t R8_16[8] = { + rotr<8>(B[0]) ^ rotr<16>(B[0]), + rotr<8>(B[1]) ^ rotr<16>(B[1]), + rotr<8>(B[2]) ^ rotr<16>(B[2]), + rotr<8>(B[3]) ^ rotr<16>(B[3]), + rotr<8>(B[4]) ^ rotr<16>(B[4]), + rotr<8>(B[5]) ^ rotr<16>(B[5]), + rotr<8>(B[6]) ^ rotr<16>(B[6]), + rotr<8>(B[7]) ^ rotr<16>(B[7]) + }; + + const uint32_t B0 = B[1] ^ R24[0] ^ R24[1] ^ R8_16[0]; + B[1] = B[2] ^ R24[1] ^ R24[2] ^ R8_16[1]; + B[2] = B[3] ^ R24[2] ^ R24[3] ^ R8_16[2]; + B[3] = B[0] ^ B[4] ^ R24[0] ^ R24[3] ^ R24[4] ^ R8_16[3]; + B[4] = B[5] ^ B[0] ^ R24[0] ^ R24[4] ^ R24[5] ^ R8_16[4]; + B[5] = B[6] ^ R24[5] ^ R24[6] ^ R8_16[5]; + B[6] = B[7] ^ B[0] ^ R24[0] ^ R24[6] ^ R24[7] ^ R8_16[6]; + B[7] = B[0] ^ R24[0] ^ R24[7] ^ R8_16[7]; + B[0] = B0; + } /* * AES Encryption @@ -386,71 +392,94 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], const secure_vector<uint8_t>& ME) { BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set"); + BOTAN_ASSERT(EK.size() == 40 || EK.size() == 48 || EK.size() == 56, "Expected EK size"); - const size_t cache_line_size = CPUID::cache_line_size(); - const uint32_t* TE = AES_TE(); - - // Hit every cache line of TE - volatile uint32_t Z = 0; - for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t)) + uint32_t KS[56*2] = { 0 }; // actual maximum is EK.size() * 2 + for(size_t i = 4; i < EK.size(); i += 4) { - Z |= TE[i]; + ks_expand(&KS[2*(i-4)], EK.data(), i); } - Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce - for(size_t i = 0; i < blocks; ++i) + while(blocks > 0) { - uint32_t T0, T1, T2, T3; - load_be(in + 16*i, T0, T1, T2, T3); + const size_t this_loop = (blocks >= 2) ? 2 : 1; - T0 ^= EK[0]; - T1 ^= EK[1]; - T2 ^= EK[2]; - T3 ^= EK[3]; + uint32_t B[8] = { 0 }; - T0 ^= Z; + load_be(B, in, this_loop*4); - uint32_t B0 = AES_T(TE, EK[4], T0, T1, T2, T3); - uint32_t B1 = AES_T(TE, EK[5], T1, T2, T3, T0); - uint32_t B2 = AES_T(TE, EK[6], T2, T3, T0, T1); - uint32_t B3 = AES_T(TE, EK[7], T3, T0, T1, T2); + B[0] ^= EK[0]; + B[1] ^= EK[1]; + B[2] ^= EK[2]; + B[3] ^= EK[3]; + B[4] ^= EK[0]; + B[5] ^= EK[1]; + B[6] ^= EK[2]; + B[7] ^= EK[3]; - for(size_t r = 2*4; r < EK.size(); r += 2*4) + bit_transpose(B); + + for(size_t r = 4; r < EK.size(); r += 4) { - T0 = AES_T(TE, EK[r ], B0, B1, B2, B3); - T1 = AES_T(TE, EK[r+1], B1, B2, B3, B0); - T2 = AES_T(TE, EK[r+2], B2, B3, B0, B1); - T3 = AES_T(TE, EK[r+3], B3, B0, B1, B2); - - B0 = AES_T(TE, EK[r+4], T0, T1, T2, T3); - B1 = AES_T(TE, EK[r+5], T1, T2, T3, T0); - B2 = AES_T(TE, EK[r+6], T2, T3, T0, T1); - B3 = AES_T(TE, EK[r+7], T3, T0, T1, T2); + AES_SBOX(B); + shift_rows(B); + mix_columns(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= KS[2*(r-4) + i]; } - /* - * Use TE[x] >> 8 instead of SE[] so encryption only references a single - * lookup table. - */ - out[16*i+ 0] = static_cast<uint8_t>(TE[get_byte(0, B0)] >> 8) ^ ME[0]; - out[16*i+ 1] = static_cast<uint8_t>(TE[get_byte(1, B1)] >> 8) ^ ME[1]; - out[16*i+ 2] = static_cast<uint8_t>(TE[get_byte(2, B2)] >> 8) ^ ME[2]; - out[16*i+ 3] = static_cast<uint8_t>(TE[get_byte(3, B3)] >> 8) ^ ME[3]; - out[16*i+ 4] = static_cast<uint8_t>(TE[get_byte(0, B1)] >> 8) ^ ME[4]; - out[16*i+ 5] = static_cast<uint8_t>(TE[get_byte(1, B2)] >> 8) ^ ME[5]; - out[16*i+ 6] = static_cast<uint8_t>(TE[get_byte(2, B3)] >> 8) ^ ME[6]; - out[16*i+ 7] = static_cast<uint8_t>(TE[get_byte(3, B0)] >> 8) ^ ME[7]; - out[16*i+ 8] = static_cast<uint8_t>(TE[get_byte(0, B2)] >> 8) ^ ME[8]; - out[16*i+ 9] = static_cast<uint8_t>(TE[get_byte(1, B3)] >> 8) ^ ME[9]; - out[16*i+10] = static_cast<uint8_t>(TE[get_byte(2, B0)] >> 8) ^ ME[10]; - out[16*i+11] = static_cast<uint8_t>(TE[get_byte(3, B1)] >> 8) ^ ME[11]; - out[16*i+12] = static_cast<uint8_t>(TE[get_byte(0, B3)] >> 8) ^ ME[12]; - out[16*i+13] = static_cast<uint8_t>(TE[get_byte(1, B0)] >> 8) ^ ME[13]; - out[16*i+14] = static_cast<uint8_t>(TE[get_byte(2, B1)] >> 8) ^ ME[14]; - out[16*i+15] = static_cast<uint8_t>(TE[get_byte(3, B2)] >> 8) ^ ME[15]; + // Final round: + AES_SBOX(B); + shift_rows(B); + bit_transpose(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= load_be<uint32_t>(ME.data(), i % 4); + + if(this_loop == 2) + store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); + else + store_be(out, B[0], B[1], B[2], B[3]); + + in += this_loop*16; + out += this_loop*16; + blocks -= this_loop; } } +const uint32_t* AES_TD() + { + class TD_Table final + { + public: + TD_Table() + { + uint32_t* p = reinterpret_cast<uint32_t*>(&data); + for(size_t i = 0; i != 256; ++i) + { + p[i] = InvMixColumn(SD[i]); + } + } + + const uint32_t* ptr() const + { + return reinterpret_cast<const uint32_t*>(&data); + } + private: + std::aligned_storage<256*sizeof(uint32_t), 64>::type data; + }; + + static TD_Table table; + return table.ptr(); + } + +#define AES_T(T, K, V0, V1, V2, V3) \ + (K ^ T[get_byte(0, V0)] ^ \ + rotr< 8>(T[get_byte(1, V1)]) ^ \ + rotr<16>(T[get_byte(2, V2)]) ^ \ + rotr<24>(T[get_byte(3, V3)])) + /* * AES Decryption */ @@ -642,7 +671,15 @@ size_t aes_parallelism() } #endif - return 1; +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return 2; + } +#endif + + // bitsliced: + return 2; } const char* aes_provider() diff --git a/src/tests/data/block/aes.vec b/src/tests/data/block/aes.vec index 8cd7f4d85..e08df20d4 100644 --- a/src/tests/data/block/aes.vec +++ b/src/tests/data/block/aes.vec @@ -1033,20 +1033,8 @@ In = 00000000000000000000000000000000 Out = 0545AAD56DA2A97C3663D1432A3D1C84 Key = 00000000000000000000000000000000 -In = 80000000000000000000000000000000 -Out = 3AD78E726C1EC02B7EBFE92B23D9EC34 - -Key = 00000000000000000000000000000000 -In = 40000000000000000000000000000000 -Out = 45BC707D29E8204D88DFBA2F0B0CAD9B - -Key = 00000000000000000000000000000000 -In = 20000000000000000000000000000000 -Out = 161556838018F52805CDBD6202002E3F - -Key = 00000000000000000000000000000000 -In = 10000000000000000000000000000000 -Out = F5569B3AB6A6D11EFDE1BF0A64C6854A +In = 80000000000000000000000000000000400000000000000000000000000000002000000000000000000000000000000010000000000000000000000000000000 +Out = 3AD78E726C1EC02B7EBFE92B23D9EC3445BC707D29E8204D88DFBA2F0B0CAD9B161556838018F52805CDBD6202002E3FF5569B3AB6A6D11EFDE1BF0A64C6854A Key = 00000000000000000000000000000000 In = 08000000000000000000000000000000 |