From 40bca8a33c51c2b057003a66c4e1a380ccf78a24 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Tue, 5 May 2020 11:00:37 -0400 Subject: Add constant time bitsliced AES encryption for CPUs without vperm or hardware --- src/lib/block/aes/aes.cpp | 363 +++++++++++++++++++++++++--------------------- 1 file changed, 200 insertions(+), 163 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 64205504f..7b3c9bc37 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -1,9 +1,6 @@ /* -* AES * (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd * -* Based on the public domain reference implementation by Paulo Baretto -* * Botan is released under the Simplified BSD License (see license.txt) */ @@ -15,68 +12,10 @@ #include #include -/* -* This implementation is based on table lookups which are known to be -* vulnerable to timing and cache based side channel attacks. Some -* countermeasures are used which may be helpful in some situations: -* -* - Only a single 256-word T-table is used, with rotations applied. -* Most implementations use 4 (or sometimes 5) T-tables, which leaks -* much more information via cache usage. -* -* - The TE and TD tables are computed at runtime to avoid flush+reload -* attacks using clflush. As different processes will not share the -* same underlying table data, an attacker can't manipulate another -* processes cache lines via their shared reference to the library -* read only segment. (However, prime+probe attacks are still possible.) -* -* - Each cache line of the lookup tables is accessed at the beginning -* of each call to encrypt or decrypt. (See the Z variable below) -* -* If available SSSE3 or AES-NI are used instead of this version, as both -* are faster and immune to side channel attacks. -* -* Some AES cache timing papers for reference: -* -* "Software mitigations to hedge AES against cache-based software side -* channel vulnerabilities" https://eprint.iacr.org/2006/052.pdf -* -* "Cache Games - Bringing Access-Based Cache Attacks on AES to Practice" -* http://www.ieee-security.org/TC/SP2011/PAPERS/2011/paper031.pdf -* -* "Cache-Collision Timing Attacks Against AES" Bonneau, Mironov -* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.88.4753 -*/ - namespace Botan { namespace { -alignas(64) -const uint8_t SE[256] = { - 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, - 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, - 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, - 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, - 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, - 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, - 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, - 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, - 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, - 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, - 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, - 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, - 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, - 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, - 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, - 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, - 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, - 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, - 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, - 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, - 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, - 0xB0, 0x54, 0xBB, 0x16 }; - alignas(64) const uint8_t SD[256] = { 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, @@ -103,7 +42,6 @@ const uint8_t SD[256] = { 0x55, 0x21, 0x0C, 0x7D }; inline constexpr uint8_t xtime(uint8_t s) { return static_cast(s << 1) ^ ((s >> 7) * 0x1B); } -inline constexpr uint8_t xtime3(uint8_t s) { return xtime(s) ^ s; } inline uint32_t InvMixColumn(uint8_t s1) { @@ -118,8 +56,8 @@ inline uint32_t InvMixColumn(uint8_t s1) } /* -This is a bitsliced AES sbox computation which can execute up -to 32 parallel sbox computations. +This is an AES sbox circuit which can execute in bitsliced mode up to 32x in +parallel. The circuit is from "A depth-16 circuit for the AES S-box" by Boyar and Peralta (https://eprint.iacr.org/2011/332.pdf) @@ -318,64 +256,132 @@ inline uint32_t SE_word(uint32_t x) return x; } -const uint32_t* AES_TE() +inline void bit_transpose(uint32_t B[8]) { - class TE_Table final - { - public: - TE_Table() - { - uint32_t* p = reinterpret_cast(&data); - for(size_t i = 0; i != 256; ++i) - { - const uint8_t s = SE[i]; - p[i] = make_uint32(xtime(s), s, s, xtime3(s)); - } - } + swap_bits(B[1], B[0], 0x55555555, 1); + swap_bits(B[3], B[2], 0x55555555, 1); + swap_bits(B[5], B[4], 0x55555555, 1); + swap_bits(B[7], B[6], 0x55555555, 1); + + swap_bits(B[2], B[0], 0x33333333, 2); + swap_bits(B[3], B[1], 0x33333333, 2); + swap_bits(B[6], B[4], 0x33333333, 2); + swap_bits(B[7], B[5], 0x33333333, 2); + + swap_bits(B[4], B[0], 0x0F0F0F0F, 4); + swap_bits(B[5], B[1], 0x0F0F0F0F, 4); + swap_bits(B[6], B[2], 0x0F0F0F0F, 4); + swap_bits(B[7], B[3], 0x0F0F0F0F, 4); + } - const uint32_t* ptr() const - { - return reinterpret_cast(&data); - } - private: - std::aligned_storage<256*sizeof(uint32_t), 64>::type data; - }; +inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) + { + /* + This is bit_transpose of K[r..r+4] || K[r..r+4], we can save some computation + due to knowing the first and second halves are the same data. + */ + for(size_t i = 0; i != 4; ++i) + B[i] = K[r + i]; - static TE_Table table; - return table.ptr(); + swap_bits(B[1], B[0], 0x55555555, 1); + swap_bits(B[3], B[2], 0x55555555, 1); + + swap_bits(B[2], B[0], 0x33333333, 2); + swap_bits(B[3], B[1], 0x33333333, 2); + + B[4] = B[0]; + B[5] = B[1]; + B[6] = B[2]; + B[7] = B[3]; + + swap_bits(B[4], B[0], 0x0F0F0F0F, 4); + swap_bits(B[5], B[1], 0x0F0F0F0F, 4); + swap_bits(B[6], B[2], 0x0F0F0F0F, 4); + swap_bits(B[7], B[3], 0x0F0F0F0F, 4); } -const uint32_t* AES_TD() +inline void shift_rows(uint32_t B[8]) { - class TD_Table final + for(size_t i = 0; i != 8; ++i) { - public: - TD_Table() - { - uint32_t* p = reinterpret_cast(&data); - for(size_t i = 0; i != 256; ++i) - { - p[i] = InvMixColumn(SD[i]); - } - } + uint32_t x = B[i]; + // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31 + x = bit_permute_step(x, 0x00223311, 2); // Butterfly, stage 1 + x = bit_permute_step(x, 0x00550055, 1); // Butterfly, stage 0 + B[i] = x; + } + } - const uint32_t* ptr() const - { - return reinterpret_cast(&data); - } - private: - std::aligned_storage<256*sizeof(uint32_t), 64>::type data; - }; +inline void mix_columns(uint32_t B[8]) + { + /* + This is equivalent to what T-tables mix columns looks like when you decompose it: + + // carry high bits in B[0] to positions in 0x1b == 0b11011 + const uint32_t X2[8] = { + B[1], + B[2], + B[3], + B[4] ^ B[0], + B[5] ^ B[0], + B[6], + B[7] ^ B[0], + B[0], + }; + for(size_t i = 0; i != 8; i++) + { + const uint32_t X3 = B[i] ^ X2[i]; - static TD_Table table; - return table.ptr(); - } + uint8_t b0 = get_byte(0, X2[i]) ^ get_byte(1, X3) ^ get_byte(2, B[i]) ^ get_byte(3, B[i]); + uint8_t b1 = get_byte(0, B[i]) ^ get_byte(1, X2[i]) ^ get_byte(2, X3) ^ get_byte(3, B[i]); + uint8_t b2 = get_byte(0, B[i]) ^ get_byte(1, B[i]) ^ get_byte(2, X2[i]) ^ get_byte(3, X3); + uint8_t b3 = get_byte(0, X3) ^ get_byte(1, B[i]) ^ get_byte(2, B[i]) ^ get_byte(3, X2[i]); -#define AES_T(T, K, V0, V1, V2, V3) \ - (K ^ T[get_byte(0, V0)] ^ \ - rotr< 8>(T[get_byte(1, V1)]) ^ \ - rotr<16>(T[get_byte(2, V2)]) ^ \ - rotr<24>(T[get_byte(3, V3)])) + B[i] = make_uint32(b0, b1, b2, b3); + } + + Notice that each byte of B[i], X2[i] and X3 is used once in each column, so + we can instead effect the selections by rotations and do the XORs in word units + instead of bytes. Unrolling and expanding the definition of X2 then combining + similar terms results in the expressions below. The end result is very + similar to the MixColumns found in section 4.4 and Appendix A of "Faster and + Timing-Attack Resistant AES-GCM" (https://eprint.iacr.org/2009/129.pdf) except + suited to our word size, and of course we cannot make use of word/byte shuffles + to perform the rotations. + */ + + const uint32_t R24[8] = { + rotr<24>(B[0]), + rotr<24>(B[1]), + rotr<24>(B[2]), + rotr<24>(B[3]), + rotr<24>(B[4]), + rotr<24>(B[5]), + rotr<24>(B[6]), + rotr<24>(B[7]) + }; + + const uint32_t R8_16[8] = { + rotr<8>(B[0]) ^ rotr<16>(B[0]), + rotr<8>(B[1]) ^ rotr<16>(B[1]), + rotr<8>(B[2]) ^ rotr<16>(B[2]), + rotr<8>(B[3]) ^ rotr<16>(B[3]), + rotr<8>(B[4]) ^ rotr<16>(B[4]), + rotr<8>(B[5]) ^ rotr<16>(B[5]), + rotr<8>(B[6]) ^ rotr<16>(B[6]), + rotr<8>(B[7]) ^ rotr<16>(B[7]) + }; + + const uint32_t B0 = B[1] ^ R24[0] ^ R24[1] ^ R8_16[0]; + B[1] = B[2] ^ R24[1] ^ R24[2] ^ R8_16[1]; + B[2] = B[3] ^ R24[2] ^ R24[3] ^ R8_16[2]; + B[3] = B[0] ^ B[4] ^ R24[0] ^ R24[3] ^ R24[4] ^ R8_16[3]; + B[4] = B[5] ^ B[0] ^ R24[0] ^ R24[4] ^ R24[5] ^ R8_16[4]; + B[5] = B[6] ^ R24[5] ^ R24[6] ^ R8_16[5]; + B[6] = B[7] ^ B[0] ^ R24[0] ^ R24[6] ^ R24[7] ^ R8_16[6]; + B[7] = B[0] ^ R24[0] ^ R24[7] ^ R8_16[7]; + B[0] = B0; + } /* * AES Encryption @@ -386,71 +392,94 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], const secure_vector& ME) { BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set"); + BOTAN_ASSERT(EK.size() == 40 || EK.size() == 48 || EK.size() == 56, "Expected EK size"); - const size_t cache_line_size = CPUID::cache_line_size(); - const uint32_t* TE = AES_TE(); - - // Hit every cache line of TE - volatile uint32_t Z = 0; - for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t)) + uint32_t KS[56*2] = { 0 }; // actual maximum is EK.size() * 2 + for(size_t i = 4; i < EK.size(); i += 4) { - Z |= TE[i]; + ks_expand(&KS[2*(i-4)], EK.data(), i); } - Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce - for(size_t i = 0; i < blocks; ++i) + while(blocks > 0) { - uint32_t T0, T1, T2, T3; - load_be(in + 16*i, T0, T1, T2, T3); + const size_t this_loop = (blocks >= 2) ? 2 : 1; - T0 ^= EK[0]; - T1 ^= EK[1]; - T2 ^= EK[2]; - T3 ^= EK[3]; + uint32_t B[8] = { 0 }; - T0 ^= Z; + load_be(B, in, this_loop*4); - uint32_t B0 = AES_T(TE, EK[4], T0, T1, T2, T3); - uint32_t B1 = AES_T(TE, EK[5], T1, T2, T3, T0); - uint32_t B2 = AES_T(TE, EK[6], T2, T3, T0, T1); - uint32_t B3 = AES_T(TE, EK[7], T3, T0, T1, T2); + B[0] ^= EK[0]; + B[1] ^= EK[1]; + B[2] ^= EK[2]; + B[3] ^= EK[3]; + B[4] ^= EK[0]; + B[5] ^= EK[1]; + B[6] ^= EK[2]; + B[7] ^= EK[3]; - for(size_t r = 2*4; r < EK.size(); r += 2*4) + bit_transpose(B); + + for(size_t r = 4; r < EK.size(); r += 4) { - T0 = AES_T(TE, EK[r ], B0, B1, B2, B3); - T1 = AES_T(TE, EK[r+1], B1, B2, B3, B0); - T2 = AES_T(TE, EK[r+2], B2, B3, B0, B1); - T3 = AES_T(TE, EK[r+3], B3, B0, B1, B2); - - B0 = AES_T(TE, EK[r+4], T0, T1, T2, T3); - B1 = AES_T(TE, EK[r+5], T1, T2, T3, T0); - B2 = AES_T(TE, EK[r+6], T2, T3, T0, T1); - B3 = AES_T(TE, EK[r+7], T3, T0, T1, T2); + AES_SBOX(B); + shift_rows(B); + mix_columns(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= KS[2*(r-4) + i]; } - /* - * Use TE[x] >> 8 instead of SE[] so encryption only references a single - * lookup table. - */ - out[16*i+ 0] = static_cast(TE[get_byte(0, B0)] >> 8) ^ ME[0]; - out[16*i+ 1] = static_cast(TE[get_byte(1, B1)] >> 8) ^ ME[1]; - out[16*i+ 2] = static_cast(TE[get_byte(2, B2)] >> 8) ^ ME[2]; - out[16*i+ 3] = static_cast(TE[get_byte(3, B3)] >> 8) ^ ME[3]; - out[16*i+ 4] = static_cast(TE[get_byte(0, B1)] >> 8) ^ ME[4]; - out[16*i+ 5] = static_cast(TE[get_byte(1, B2)] >> 8) ^ ME[5]; - out[16*i+ 6] = static_cast(TE[get_byte(2, B3)] >> 8) ^ ME[6]; - out[16*i+ 7] = static_cast(TE[get_byte(3, B0)] >> 8) ^ ME[7]; - out[16*i+ 8] = static_cast(TE[get_byte(0, B2)] >> 8) ^ ME[8]; - out[16*i+ 9] = static_cast(TE[get_byte(1, B3)] >> 8) ^ ME[9]; - out[16*i+10] = static_cast(TE[get_byte(2, B0)] >> 8) ^ ME[10]; - out[16*i+11] = static_cast(TE[get_byte(3, B1)] >> 8) ^ ME[11]; - out[16*i+12] = static_cast(TE[get_byte(0, B3)] >> 8) ^ ME[12]; - out[16*i+13] = static_cast(TE[get_byte(1, B0)] >> 8) ^ ME[13]; - out[16*i+14] = static_cast(TE[get_byte(2, B1)] >> 8) ^ ME[14]; - out[16*i+15] = static_cast(TE[get_byte(3, B2)] >> 8) ^ ME[15]; + // Final round: + AES_SBOX(B); + shift_rows(B); + bit_transpose(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= load_be(ME.data(), i % 4); + + if(this_loop == 2) + store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); + else + store_be(out, B[0], B[1], B[2], B[3]); + + in += this_loop*16; + out += this_loop*16; + blocks -= this_loop; } } +const uint32_t* AES_TD() + { + class TD_Table final + { + public: + TD_Table() + { + uint32_t* p = reinterpret_cast(&data); + for(size_t i = 0; i != 256; ++i) + { + p[i] = InvMixColumn(SD[i]); + } + } + + const uint32_t* ptr() const + { + return reinterpret_cast(&data); + } + private: + std::aligned_storage<256*sizeof(uint32_t), 64>::type data; + }; + + static TD_Table table; + return table.ptr(); + } + +#define AES_T(T, K, V0, V1, V2, V3) \ + (K ^ T[get_byte(0, V0)] ^ \ + rotr< 8>(T[get_byte(1, V1)]) ^ \ + rotr<16>(T[get_byte(2, V2)]) ^ \ + rotr<24>(T[get_byte(3, V3)])) + /* * AES Decryption */ @@ -642,7 +671,15 @@ size_t aes_parallelism() } #endif - return 1; +#if defined(BOTAN_HAS_AES_VPERM) + if(CPUID::has_vperm()) + { + return 2; + } +#endif + + // bitsliced: + return 2; } const char* aes_provider() -- cgit v1.2.3 From e47ddaff1910069c0d3e2ce6dc8276e843dda76a Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 02:44:05 -0400 Subject: Add bitsliced decryption --- src/lib/block/aes/aes.cpp | 364 +++++++++++++++++++++++++++++++++------------- 1 file changed, 264 insertions(+), 100 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 7b3c9bc37..93b17e528 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -227,6 +227,165 @@ void AES_SBOX(uint32_t V[8]) V[7] = S7; } +void AES_INV_SBOX(uint32_t V[8]) + { + const uint32_t I0 = V[0]; + const uint32_t I1 = V[1]; + const uint32_t I2 = V[2]; + const uint32_t I3 = V[3]; + const uint32_t I4 = V[4]; + const uint32_t I5 = V[5]; + const uint32_t I6 = V[6]; + const uint32_t I7 = V[7]; + + // Figure 6: Top linear transform in reverse direction + const uint32_t T23 = I0 ^ I3; + const uint32_t T22 = ~(I1 ^ I3); + const uint32_t T2 = ~(I0 ^ I1); + const uint32_t T1 = I3 ^ I4; + const uint32_t T24 = ~(I4 ^ I7); + const uint32_t R5 = I6 ^ I7; + const uint32_t T8 = ~(I1 ^ T23); + const uint32_t T19 = T22 ^ R5; + const uint32_t T9 = ~(I7 ^ T1); + const uint32_t T10 = T2 ^ T24; + const uint32_t T13 = T2 ^ R5; + const uint32_t T3 = T1 ^ R5; + const uint32_t T25 = ~(I2 ^ T1); + const uint32_t R13 = I1 ^ I6; + const uint32_t T17 = ~(I2 ^ T19); + const uint32_t T20 = T24 ^ R13; + const uint32_t T4 = I4 ^ T8; + const uint32_t R17 = ~(I2 ^ I5); + const uint32_t R18 = ~(I5 ^ I6); + const uint32_t R19 = ~(I2 ^ I4); + const uint32_t Y5 = I0 ^ R17; + const uint32_t T6 = T22 ^ R17; + const uint32_t T16 = R13 ^ R19; + const uint32_t T27 = T1 ^ R18; + const uint32_t T15 = T10 ^ T27; + const uint32_t T14 = T10 ^ R18; + const uint32_t T26 = T3 ^ T16; + + const uint32_t D = Y5; + + // Figure 7: Shared part of AES S-box circuit + const uint32_t M1 = T13 & T6; + const uint32_t M2 = T23 & T8; + const uint32_t M3 = T14 ^ M1; + const uint32_t M4 = T19 & D; + const uint32_t M5 = M4 ^ M1; + const uint32_t M6 = T3 & T16; + const uint32_t M7 = T22 & T9; + const uint32_t M8 = T26 ^ M6; + const uint32_t M9 = T20 & T17; + const uint32_t M10 = M9 ^ M6; + const uint32_t M11 = T1 & T15; + const uint32_t M12 = T4 & T27; + const uint32_t M13 = M12 ^ M11; + const uint32_t M14 = T2 & T10; + const uint32_t M15 = M14 ^ M11; + const uint32_t M16 = M3 ^ M2; + + const uint32_t M17 = M5 ^ T24; + const uint32_t M18 = M8 ^ M7; + const uint32_t M19 = M10 ^ M15; + const uint32_t M20 = M16 ^ M13; + const uint32_t M21 = M17 ^ M15; + const uint32_t M22 = M18 ^ M13; + const uint32_t M23 = M19 ^ T25; + const uint32_t M24 = M22 ^ M23; + const uint32_t M25 = M22 & M20; + const uint32_t M26 = M21 ^ M25; + const uint32_t M27 = M20 ^ M21; + const uint32_t M28 = M23 ^ M25; + const uint32_t M29 = M28 & M27; + const uint32_t M30 = M26 & M24; + const uint32_t M31 = M20 & M23; + const uint32_t M32 = M27 & M31; + + const uint32_t M33 = M27 ^ M25; + const uint32_t M34 = M21 & M22; + const uint32_t M35 = M24 & M34; + const uint32_t M36 = M24 ^ M25; + const uint32_t M37 = M21 ^ M29; + const uint32_t M38 = M32 ^ M33; + const uint32_t M39 = M23 ^ M30; + const uint32_t M40 = M35 ^ M36; + const uint32_t M41 = M38 ^ M40; + const uint32_t M42 = M37 ^ M39; + const uint32_t M43 = M37 ^ M38; + const uint32_t M44 = M39 ^ M40; + const uint32_t M45 = M42 ^ M41; + const uint32_t M46 = M44 & T6; + const uint32_t M47 = M40 & T8; + const uint32_t M48 = M39 & D; + + const uint32_t M49 = M43 & T16; + const uint32_t M50 = M38 & T9; + const uint32_t M51 = M37 & T17; + const uint32_t M52 = M42 & T15; + const uint32_t M53 = M45 & T27; + const uint32_t M54 = M41 & T10; + const uint32_t M55 = M44 & T13; + const uint32_t M56 = M40 & T23; + const uint32_t M57 = M39 & T19; + const uint32_t M58 = M43 & T3; + const uint32_t M59 = M38 & T22; + const uint32_t M60 = M37 & T20; + const uint32_t M61 = M42 & T1; + const uint32_t M62 = M45 & T4; + const uint32_t M63 = M41 & T2; + + // Figure 9 Bottom linear transform in reverse direction + const uint32_t P0 = M52 ^ M61; + const uint32_t P1 = M58 ^ M59; + const uint32_t P2 = M54 ^ M62; + const uint32_t P3 = M47 ^ M50; + const uint32_t P4 = M48 ^ M56; + const uint32_t P5 = M46 ^ M51; + const uint32_t P6 = M49 ^ M60; + const uint32_t P7 = P0 ^ P1; + const uint32_t P8 = M50 ^ M53; + const uint32_t P9 = M55 ^ M63; + const uint32_t P10 = M57 ^ P4; + const uint32_t P11 = P0 ^ P3; + const uint32_t P12 = M46 ^ M48; + const uint32_t P13 = M49 ^ M51; + const uint32_t P14 = M49 ^ M62; + const uint32_t P15 = M54 ^ M59; + const uint32_t P16 = M57 ^ M61; + const uint32_t P17 = M58 ^ P2; + const uint32_t P18 = M63 ^ P5; + const uint32_t P19 = P2 ^ P3; + const uint32_t P20 = P4 ^ P6; + const uint32_t P22 = P2 ^ P7; + const uint32_t P23 = P7 ^ P8; + const uint32_t P24 = P5 ^ P7; + const uint32_t P25 = P6 ^ P10; + const uint32_t P26 = P9 ^ P11; + const uint32_t P27 = P10 ^ P18; + const uint32_t P28 = P11 ^ P25; + const uint32_t P29 = P15 ^ P20; + const uint32_t W0 = P13 ^ P22; + const uint32_t W1 = P26 ^ P29; + const uint32_t W2 = P17 ^ P28; + const uint32_t W3 = P12 ^ P22; + const uint32_t W4 = P23 ^ P27; + const uint32_t W5 = P19 ^ P24; + const uint32_t W6 = P14 ^ P23; + const uint32_t W7 = P9 ^ P16; + + V[0] = W0; + V[1] = W1; + V[2] = W2; + V[3] = W3; + V[4] = W4; + V[5] = W5; + V[6] = W6; + V[7] = W7; + } + inline uint32_t SE_word(uint32_t x) { uint32_t I[8] = { 0 }; @@ -312,6 +471,17 @@ inline void shift_rows(uint32_t B[8]) } } +inline void inv_shift_rows(uint32_t B[8]) + { + for(size_t i = 0; i != 8; ++i) + { + uint32_t x = B[i]; + x = bit_permute_step(x, 0x00550055, 1); // Butterfly, stage 0 + x = bit_permute_step(x, 0x00223311, 2); // Butterfly, stage 1 + B[i] = x; + } + } + inline void mix_columns(uint32_t B[8]) { /* @@ -362,14 +532,14 @@ inline void mix_columns(uint32_t B[8]) }; const uint32_t R8_16[8] = { - rotr<8>(B[0]) ^ rotr<16>(B[0]), - rotr<8>(B[1]) ^ rotr<16>(B[1]), - rotr<8>(B[2]) ^ rotr<16>(B[2]), - rotr<8>(B[3]) ^ rotr<16>(B[3]), - rotr<8>(B[4]) ^ rotr<16>(B[4]), - rotr<8>(B[5]) ^ rotr<16>(B[5]), - rotr<8>(B[6]) ^ rotr<16>(B[6]), - rotr<8>(B[7]) ^ rotr<16>(B[7]) + rotr<8>(B[0] ^ rotr<8>(B[0])), + rotr<8>(B[1] ^ rotr<8>(B[1])), + rotr<8>(B[2] ^ rotr<8>(B[2])), + rotr<8>(B[3] ^ rotr<8>(B[3])), + rotr<8>(B[4] ^ rotr<8>(B[4])), + rotr<8>(B[5] ^ rotr<8>(B[5])), + rotr<8>(B[6] ^ rotr<8>(B[6])), + rotr<8>(B[7] ^ rotr<8>(B[7])), }; const uint32_t B0 = B[1] ^ R24[0] ^ R24[1] ^ R8_16[0]; @@ -383,6 +553,55 @@ inline void mix_columns(uint32_t B[8]) B[0] = B0; } +void inv_mix_columns(uint32_t B[8]) + { + const uint32_t X2[8] = { + B[1], + B[2], + B[3], + B[4] ^ B[0], + B[5] ^ B[0], + B[6], + B[7] ^ B[0], + B[0], + }; + const uint32_t X4[8] = { + X2[1], + X2[2], + X2[3], + X2[4] ^ X2[0], + X2[5] ^ X2[0], + X2[6], + X2[7] ^ X2[0], + X2[0], + }; + const uint32_t X8[8] = { + X4[1], + X4[2], + X4[3], + X4[4] ^ X4[0], + X4[5] ^ X4[0], + X4[6], + X4[7] ^ X4[0], + X4[0], + }; + + for(size_t i = 0; i != 8; i++) + { + const uint32_t X9 = X8[i] ^ B[i]; + const uint32_t X11 = X9 ^ X2[i]; + const uint32_t X13 = X9 ^ X4[i]; + const uint32_t X14 = X8[i] ^ X4[i] ^ X2[i]; + + uint8_t b0 = get_byte(0, X14) ^ get_byte(1, X11) ^ get_byte(2, X13) ^ get_byte(3, X9); + uint8_t b1 = get_byte(0, X9) ^ get_byte(1, X14) ^ get_byte(2, X11) ^ get_byte(3, X13); + uint8_t b2 = get_byte(0, X13) ^ get_byte(1, X9) ^ get_byte(2, X14) ^ get_byte(3, X11); + uint8_t b3 = get_byte(0, X11) ^ get_byte(1, X13) ^ get_byte(2, X9) ^ get_byte(3, X14); + + B[i] = make_uint32(b0, b1, b2, b3); + } + } + /* * AES Encryption */ @@ -408,14 +627,8 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], load_be(B, in, this_loop*4); - B[0] ^= EK[0]; - B[1] ^= EK[1]; - B[2] ^= EK[2]; - B[3] ^= EK[3]; - B[4] ^= EK[0]; - B[5] ^= EK[1]; - B[6] ^= EK[2]; - B[7] ^= EK[3]; + for(size_t i = 0; i != 8; ++i) + B[i] ^= EK[i % 4]; bit_transpose(B); @@ -448,38 +661,6 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], } } -const uint32_t* AES_TD() - { - class TD_Table final - { - public: - TD_Table() - { - uint32_t* p = reinterpret_cast(&data); - for(size_t i = 0; i != 256; ++i) - { - p[i] = InvMixColumn(SD[i]); - } - } - - const uint32_t* ptr() const - { - return reinterpret_cast(&data); - } - private: - std::aligned_storage<256*sizeof(uint32_t), 64>::type data; - }; - - static TD_Table table; - return table.ptr(); - } - -#define AES_T(T, K, V0, V1, V2, V3) \ - (K ^ T[get_byte(0, V0)] ^ \ - rotr< 8>(T[get_byte(1, V1)]) ^ \ - rotr<16>(T[get_byte(2, V2)]) ^ \ - rotr<24>(T[get_byte(3, V3)])) - /* * AES Decryption */ @@ -489,71 +670,54 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, { BOTAN_ASSERT(DK.size() && MD.size() == 16, "Key was set"); - const size_t cache_line_size = CPUID::cache_line_size(); - const uint32_t* TD = AES_TD(); - - volatile uint32_t Z = 0; - for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t)) + uint32_t KS[56*2] = { 0 }; // actual maximum is DK.size() * 2 + for(size_t i = 4; i < DK.size(); i += 4) { - Z |= TD[i]; + ks_expand(&KS[2*(i-4)], DK.data(), i); } - for(size_t i = 0; i < 256; i += cache_line_size) - { - Z |= SD[i]; - } - Z &= TD[99]; // this is zero, which hopefully the compiler cannot deduce - for(size_t i = 0; i != blocks; ++i) + while(blocks > 0) { - uint32_t T0 = load_be(in, 0) ^ DK[0]; - uint32_t T1 = load_be(in, 1) ^ DK[1]; - uint32_t T2 = load_be(in, 2) ^ DK[2]; - uint32_t T3 = load_be(in, 3) ^ DK[3]; + const size_t this_loop = (blocks >= 2) ? 2 : 1; - T0 ^= Z; + uint32_t B[8] = { 0 }; - uint32_t B0 = AES_T(TD, DK[4], T0, T3, T2, T1); - uint32_t B1 = AES_T(TD, DK[5], T1, T0, T3, T2); - uint32_t B2 = AES_T(TD, DK[6], T2, T1, T0, T3); - uint32_t B3 = AES_T(TD, DK[7], T3, T2, T1, T0); + load_be(B, in, this_loop*4); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= DK[i % 4]; - for(size_t r = 2*4; r < DK.size(); r += 2*4) + bit_transpose(B); + + for(size_t r = 4; r < DK.size(); r += 4) { - T0 = AES_T(TD, DK[r ], B0, B3, B2, B1); - T1 = AES_T(TD, DK[r+1], B1, B0, B3, B2); - T2 = AES_T(TD, DK[r+2], B2, B1, B0, B3); - T3 = AES_T(TD, DK[r+3], B3, B2, B1, B0); - - B0 = AES_T(TD, DK[r+4], T0, T3, T2, T1); - B1 = AES_T(TD, DK[r+5], T1, T0, T3, T2); - B2 = AES_T(TD, DK[r+6], T2, T1, T0, T3); - B3 = AES_T(TD, DK[r+7], T3, T2, T1, T0); + AES_INV_SBOX(B); + inv_shift_rows(B); + inv_mix_columns(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= KS[2*(r-4) + i]; } - out[ 0] = SD[get_byte(0, B0)] ^ MD[0]; - out[ 1] = SD[get_byte(1, B3)] ^ MD[1]; - out[ 2] = SD[get_byte(2, B2)] ^ MD[2]; - out[ 3] = SD[get_byte(3, B1)] ^ MD[3]; - out[ 4] = SD[get_byte(0, B1)] ^ MD[4]; - out[ 5] = SD[get_byte(1, B0)] ^ MD[5]; - out[ 6] = SD[get_byte(2, B3)] ^ MD[6]; - out[ 7] = SD[get_byte(3, B2)] ^ MD[7]; - out[ 8] = SD[get_byte(0, B2)] ^ MD[8]; - out[ 9] = SD[get_byte(1, B1)] ^ MD[9]; - out[10] = SD[get_byte(2, B0)] ^ MD[10]; - out[11] = SD[get_byte(3, B3)] ^ MD[11]; - out[12] = SD[get_byte(0, B3)] ^ MD[12]; - out[13] = SD[get_byte(1, B2)] ^ MD[13]; - out[14] = SD[get_byte(2, B1)] ^ MD[14]; - out[15] = SD[get_byte(3, B0)] ^ MD[15]; - - in += 16; - out += 16; + // Final round: + AES_INV_SBOX(B); + inv_shift_rows(B); + bit_transpose(B); + + for(size_t i = 0; i != 8; ++i) + B[i] ^= load_be(MD.data(), i % 4); + + if(this_loop == 2) + store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); + else + store_be(out, B[0], B[1], B[2], B[3]); + + in += this_loop*16; + out += this_loop*16; + blocks -= this_loop; } } -#undef AES_T - void aes_key_schedule(const uint8_t key[], size_t length, secure_vector& EK, secure_vector& DK, -- cgit v1.2.3 From 6490b10a7ac691a3455b6c66c3adfe865cfd2f7f Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 04:32:05 -0400 Subject: Faster InvMixColumn --- src/lib/block/aes/aes.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 93b17e528..1e7103582 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -593,12 +593,7 @@ void inv_mix_columns(uint32_t B[8]) const uint32_t X13 = X9 ^ X4[i]; const uint32_t X14 = X8[i] ^ X4[i] ^ X2[i]; - uint8_t b0 = get_byte(0, X14) ^ get_byte(1, X11) ^ get_byte(2, X13) ^ get_byte(3, X9); - uint8_t b1 = get_byte(0, X9) ^ get_byte(1, X14) ^ get_byte(2, X11) ^ get_byte(3, X13); - uint8_t b2 = get_byte(0, X13) ^ get_byte(1, X9) ^ get_byte(2, X14) ^ get_byte(3, X11); - uint8_t b3 = get_byte(0, X11) ^ get_byte(1, X13) ^ get_byte(2, X9) ^ get_byte(3, X14); - - B[i] = make_uint32(b0, b1, b2, b3); + B[i] = X14 ^ rotr<8>(X9) ^ rotr<24>(X11) ^ rotr<16>(X13); } } -- cgit v1.2.3 From e3a9b5daab2372a75cd49edbab981f82689af6f0 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 04:44:33 -0400 Subject: Simpler loop is actually faster for MixColumns --- src/lib/block/aes/aes.cpp | 54 ++--------------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 1e7103582..134c40e87 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -484,9 +484,6 @@ inline void inv_shift_rows(uint32_t B[8]) inline void mix_columns(uint32_t B[8]) { - /* - This is equivalent to what T-tables mix columns looks like when you decompose it: - // carry high bits in B[0] to positions in 0x1b == 0b11011 const uint32_t X2[8] = { B[1], @@ -498,59 +495,12 @@ inline void mix_columns(uint32_t B[8]) B[7] ^ B[0], B[0], }; + for(size_t i = 0; i != 8; i++) { const uint32_t X3 = B[i] ^ X2[i]; - - uint8_t b0 = get_byte(0, X2[i]) ^ get_byte(1, X3) ^ get_byte(2, B[i]) ^ get_byte(3, B[i]); - uint8_t b1 = get_byte(0, B[i]) ^ get_byte(1, X2[i]) ^ get_byte(2, X3) ^ get_byte(3, B[i]); - uint8_t b2 = get_byte(0, B[i]) ^ get_byte(1, B[i]) ^ get_byte(2, X2[i]) ^ get_byte(3, X3); - uint8_t b3 = get_byte(0, X3) ^ get_byte(1, B[i]) ^ get_byte(2, B[i]) ^ get_byte(3, X2[i]); - - B[i] = make_uint32(b0, b1, b2, b3); + B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3); } - - Notice that each byte of B[i], X2[i] and X3 is used once in each column, so - we can instead effect the selections by rotations and do the XORs in word units - instead of bytes. Unrolling and expanding the definition of X2 then combining - similar terms results in the expressions below. The end result is very - similar to the MixColumns found in section 4.4 and Appendix A of "Faster and - Timing-Attack Resistant AES-GCM" (https://eprint.iacr.org/2009/129.pdf) except - suited to our word size, and of course we cannot make use of word/byte shuffles - to perform the rotations. - */ - - const uint32_t R24[8] = { - rotr<24>(B[0]), - rotr<24>(B[1]), - rotr<24>(B[2]), - rotr<24>(B[3]), - rotr<24>(B[4]), - rotr<24>(B[5]), - rotr<24>(B[6]), - rotr<24>(B[7]) - }; - - const uint32_t R8_16[8] = { - rotr<8>(B[0] ^ rotr<8>(B[0])), - rotr<8>(B[1] ^ rotr<8>(B[1])), - rotr<8>(B[2] ^ rotr<8>(B[2])), - rotr<8>(B[3] ^ rotr<8>(B[3])), - rotr<8>(B[4] ^ rotr<8>(B[4])), - rotr<8>(B[5] ^ rotr<8>(B[5])), - rotr<8>(B[6] ^ rotr<8>(B[6])), - rotr<8>(B[7] ^ rotr<8>(B[7])), - }; - - const uint32_t B0 = B[1] ^ R24[0] ^ R24[1] ^ R8_16[0]; - B[1] = B[2] ^ R24[1] ^ R24[2] ^ R8_16[1]; - B[2] = B[3] ^ R24[2] ^ R24[3] ^ R8_16[2]; - B[3] = B[0] ^ B[4] ^ R24[0] ^ R24[3] ^ R24[4] ^ R8_16[3]; - B[4] = B[5] ^ B[0] ^ R24[0] ^ R24[4] ^ R24[5] ^ R8_16[4]; - B[5] = B[6] ^ R24[5] ^ R24[6] ^ R8_16[5]; - B[6] = B[7] ^ B[0] ^ R24[0] ^ R24[6] ^ R24[7] ^ R8_16[6]; - B[7] = B[0] ^ R24[0] ^ R24[7] ^ R8_16[7]; - B[0] = B0; } void inv_mix_columns(uint32_t B[8]) -- cgit v1.2.3 From 4cb92ac0a08e8258be59c4209e9d43a4f21736b7 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 05:27:33 -0400 Subject: Remove SD table and move some functions around --- src/lib/block/aes/aes.cpp | 112 ++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 69 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 134c40e87..849ee11d1 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -10,51 +10,11 @@ #include #include #include -#include namespace Botan { namespace { -alignas(64) -const uint8_t SD[256] = { - 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, - 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, - 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, - 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, - 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, - 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, - 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, - 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, - 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, - 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, - 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, - 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, - 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, - 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, - 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, - 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, - 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, - 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, - 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, - 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, - 0x55, 0x21, 0x0C, 0x7D }; - -inline constexpr uint8_t xtime(uint8_t s) { return static_cast(s << 1) ^ ((s >> 7) * 0x1B); } - -inline uint32_t InvMixColumn(uint8_t s1) - { - const uint8_t s2 = xtime(s1); - const uint8_t s4 = xtime(s2); - const uint8_t s8 = xtime(s4); - const uint8_t s9 = s8 ^ s1; - const uint8_t s11 = s9 ^ s2; - const uint8_t s13 = s9 ^ s4; - const uint8_t s14 = s8 ^ s4 ^ s2; - return make_uint32(s14, s9, s13, s11); - } - /* This is an AES sbox circuit which can execute in bitsliced mode up to 32x in parallel. @@ -386,35 +346,6 @@ void AES_INV_SBOX(uint32_t V[8]) V[7] = W7; } -inline uint32_t SE_word(uint32_t x) - { - uint32_t I[8] = { 0 }; - - // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 - x = bit_permute_step(x, 0x00aa00aa, 7); // Bit index swap 0,3 - x = bit_permute_step(x, 0x0000cccc, 14); // Bit index swap 1,4 - x = bit_permute_step(x, 0x00f000f0, 4); // Bit index swap 2,3 - x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 - - for(size_t i = 0; i != 8; ++i) - I[i] = (x >> (28-4*i)) & 0xF; - - AES_SBOX(I); - - x = 0; - - for(size_t i = 0; i != 8; ++i) - x = (x << 4) + (I[i] & 0xF); - - // 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31 - x = bit_permute_step(x, 0x0a0a0a0a, 3); // Bit index swap 0,2 - x = bit_permute_step(x, 0x00cc00cc, 6); // Bit index swap 1,3 - x = bit_permute_step(x, 0x0000f0f0, 12); // Bit index swap 2,4 - x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 - - return x; - } - inline void bit_transpose(uint32_t B[8]) { swap_bits(B[1], B[0], 0x55555555, 1); @@ -663,6 +594,49 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, } } +inline constexpr uint8_t xtime(uint8_t s) { return static_cast(s << 1) ^ ((s >> 7) * 0x1B); } + +inline uint32_t InvMixColumn(uint8_t s1) + { + const uint8_t s2 = xtime(s1); + const uint8_t s4 = xtime(s2); + const uint8_t s8 = xtime(s4); + const uint8_t s9 = s8 ^ s1; + const uint8_t s11 = s9 ^ s2; + const uint8_t s13 = s9 ^ s4; + const uint8_t s14 = s8 ^ s4 ^ s2; + return make_uint32(s14, s9, s13, s11); + } + +inline uint32_t SE_word(uint32_t x) + { + uint32_t I[8] = { 0 }; + + // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 + x = bit_permute_step(x, 0x00aa00aa, 7); // Bit index swap 0,3 + x = bit_permute_step(x, 0x0000cccc, 14); // Bit index swap 1,4 + x = bit_permute_step(x, 0x00f000f0, 4); // Bit index swap 2,3 + x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 + + for(size_t i = 0; i != 8; ++i) + I[i] = (x >> (28-4*i)) & 0xF; + + AES_SBOX(I); + + x = 0; + + for(size_t i = 0; i != 8; ++i) + x = (x << 4) + (I[i] & 0xF); + + // 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31 + x = bit_permute_step(x, 0x0a0a0a0a, 3); // Bit index swap 0,2 + x = bit_permute_step(x, 0x00cc00cc, 6); // Bit index swap 1,3 + x = bit_permute_step(x, 0x0000f0f0, 12); // Bit index swap 2,4 + x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 + + return x; + } + void aes_key_schedule(const uint8_t key[], size_t length, secure_vector& EK, secure_vector& DK, -- cgit v1.2.3 From c6ddac790dafc2aca434bd209d85e64c03f753b9 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 08:07:23 -0400 Subject: Simplify the round loops --- src/lib/block/aes/aes.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 849ee11d1..ad1ae669c 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -489,10 +489,12 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set"); BOTAN_ASSERT(EK.size() == 40 || EK.size() == 48 || EK.size() == 56, "Expected EK size"); - uint32_t KS[56*2] = { 0 }; // actual maximum is EK.size() * 2 - for(size_t i = 4; i < EK.size(); i += 4) + const size_t rounds = EK.size() / 4; + + uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8 + for(size_t i = 0; i < rounds - 1; i += 1) { - ks_expand(&KS[2*(i-4)], EK.data(), i); + ks_expand(&KS[8*i], EK.data(), 4*i + 4); } while(blocks > 0) @@ -508,14 +510,14 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], bit_transpose(B); - for(size_t r = 4; r < EK.size(); r += 4) + for(size_t r = 0; r != rounds - 1; ++r) { AES_SBOX(B); shift_rows(B); mix_columns(B); for(size_t i = 0; i != 8; ++i) - B[i] ^= KS[2*(r-4) + i]; + B[i] ^= KS[8*r + i]; } // Final round: @@ -546,10 +548,12 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, { BOTAN_ASSERT(DK.size() && MD.size() == 16, "Key was set"); - uint32_t KS[56*2] = { 0 }; // actual maximum is DK.size() * 2 - for(size_t i = 4; i < DK.size(); i += 4) + const size_t rounds = DK.size() / 4; + + uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8 + for(size_t i = 0; i < rounds - 1; i += 1) { - ks_expand(&KS[2*(i-4)], DK.data(), i); + ks_expand(&KS[8*i], DK.data(), 4*i + 4); } while(blocks > 0) @@ -565,14 +569,14 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, bit_transpose(B); - for(size_t r = 4; r < DK.size(); r += 4) + for(size_t r = 0; r != rounds - 1; ++r) { AES_INV_SBOX(B); inv_shift_rows(B); inv_mix_columns(B); for(size_t i = 0; i != 8; ++i) - B[i] ^= KS[2*(r-4) + i]; + B[i] ^= KS[8*r + i]; } // Final round: -- cgit v1.2.3 From a4a237e7f69be2742da6ee1c5a35412081e2c789 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 08:41:50 -0400 Subject: Store all AES key bits as uint32_t The ME/MD had the final round key in bytes which was slightly useful in the T-tables implementation but actively not helpful for every other implementation. --- src/lib/block/aes/aes.cpp | 74 +++++------- src/lib/block/aes/aes.h | 3 - src/lib/block/aes/aes_armv8/aes_armv8.cpp | 174 ++++++++++++++-------------- src/lib/block/aes/aes_power8/aes_power8.cpp | 12 +- 4 files changed, 116 insertions(+), 147 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index ad1ae669c..7732e0909 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -483,13 +483,11 @@ void inv_mix_columns(uint32_t B[8]) */ void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, - const secure_vector& EK, - const secure_vector& ME) + const secure_vector& EK) { - BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set"); - BOTAN_ASSERT(EK.size() == 40 || EK.size() == 48 || EK.size() == 56, "Expected EK size"); + BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set"); - const size_t rounds = EK.size() / 4; + const size_t rounds = (EK.size() - 4) / 4; uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8 for(size_t i = 0; i < rounds - 1; i += 1) @@ -526,7 +524,7 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], bit_transpose(B); for(size_t i = 0; i != 8; ++i) - B[i] ^= load_be(ME.data(), i % 4); + B[i] ^= EK[4*rounds + i % 4]; if(this_loop == 2) store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); @@ -543,12 +541,11 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], * AES Decryption */ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, - const secure_vector& DK, - const secure_vector& MD) + const secure_vector& DK) { - BOTAN_ASSERT(DK.size() && MD.size() == 16, "Key was set"); + BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set"); - const size_t rounds = DK.size() / 4; + const size_t rounds = (DK.size() - 4) / 4; uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8 for(size_t i = 0; i < rounds - 1; i += 1) @@ -585,7 +582,7 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, bit_transpose(B); for(size_t i = 0; i != 8; ++i) - B[i] ^= load_be(MD.data(), i % 4); + B[i] ^= DK[4*rounds + i % 4]; if(this_loop == 2) store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); @@ -643,9 +640,7 @@ inline uint32_t SE_word(uint32_t x) void aes_key_schedule(const uint8_t key[], size_t length, secure_vector& EK, - secure_vector& DK, - secure_vector& ME, - secure_vector& MD) + secure_vector& DK) { static const uint32_t RC[10] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, @@ -702,17 +697,8 @@ void aes_key_schedule(const uint8_t key[], size_t length, rotr<24>(InvMixColumn(s3)); } - ME.resize(16); - MD.resize(16); - - for(size_t i = 0; i != 4; ++i) - { - store_be(XEK[i+4*rounds], &ME[4*i]); - store_be(XEK[i], &MD[4*i]); - } - - EK.resize(length + 24); - DK.resize(length + 24); + EK.resize(length + 24 + 4); + DK.resize(length + 24 + 4); copy_mem(EK.data(), XEK.data(), EK.size()); copy_mem(DK.data(), XDK.data(), DK.size()); @@ -730,8 +716,6 @@ void aes_key_schedule(const uint8_t key[], size_t length, CT::unpoison(EK.data(), EK.size()); CT::unpoison(DK.data(), DK.size()); - CT::unpoison(ME.data(), ME.size()); - CT::unpoison(MD.data(), MD.size()); CT::unpoison(key, length); } @@ -844,7 +828,7 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_encrypt_n(in, out, blocks, m_EK, m_ME); + aes_encrypt_n(in, out, blocks, m_EK); } void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const @@ -879,7 +863,7 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_decrypt_n(in, out, blocks, m_DK, m_MD); + aes_decrypt_n(in, out, blocks, m_DK); } void AES_128::key_schedule(const uint8_t key[], size_t length) @@ -894,14 +878,14 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_power_crypto()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif @@ -912,15 +896,13 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) } #endif - aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + aes_key_schedule(key, length, m_EK, m_DK); } void AES_128::clear() { zap(m_EK); zap(m_DK); - zap(m_ME); - zap(m_MD); } void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const @@ -955,7 +937,7 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_encrypt_n(in, out, blocks, m_EK, m_ME); + aes_encrypt_n(in, out, blocks, m_EK); } void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const @@ -990,7 +972,7 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_decrypt_n(in, out, blocks, m_DK, m_MD); + aes_decrypt_n(in, out, blocks, m_DK); } void AES_192::key_schedule(const uint8_t key[], size_t length) @@ -1005,14 +987,14 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_power_crypto()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif @@ -1023,15 +1005,13 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) } #endif - aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + aes_key_schedule(key, length, m_EK, m_DK); } void AES_192::clear() { zap(m_EK); zap(m_DK); - zap(m_ME); - zap(m_MD); } void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const @@ -1066,7 +1046,7 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_encrypt_n(in, out, blocks, m_EK, m_ME); + aes_encrypt_n(in, out, blocks, m_EK); } void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const @@ -1101,7 +1081,7 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } #endif - aes_decrypt_n(in, out, blocks, m_DK, m_MD); + aes_decrypt_n(in, out, blocks, m_DK); } void AES_256::key_schedule(const uint8_t key[], size_t length) @@ -1116,14 +1096,14 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_AES_ARMV8) if(CPUID::has_arm_aes()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif #if defined(BOTAN_HAS_AES_POWER8) if(CPUID::has_power_crypto()) { - return aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + return aes_key_schedule(key, length, m_EK, m_DK); } #endif @@ -1134,15 +1114,13 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) } #endif - aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + aes_key_schedule(key, length, m_EK, m_DK); } void AES_256::clear() { zap(m_EK); zap(m_DK); - zap(m_ME); - zap(m_MD); } } diff --git a/src/lib/block/aes/aes.h b/src/lib/block/aes/aes.h index 84d997d05..ef1c3a7ca 100644 --- a/src/lib/block/aes/aes.h +++ b/src/lib/block/aes/aes.h @@ -56,7 +56,6 @@ class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, #endif secure_vector m_EK, m_DK; - secure_vector m_ME, m_MD; }; /** @@ -101,7 +100,6 @@ class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, void key_schedule(const uint8_t key[], size_t length) override; secure_vector m_EK, m_DK; - secure_vector m_ME, m_MD; }; /** @@ -147,7 +145,6 @@ class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, void key_schedule(const uint8_t key[], size_t length) override; secure_vector m_EK, m_DK; - secure_vector m_ME, m_MD; }; } diff --git a/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/src/lib/block/aes/aes_armv8/aes_armv8.cpp index f4261954b..0cd81b283 100644 --- a/src/lib/block/aes/aes_armv8/aes_armv8.cpp +++ b/src/lib/block/aes/aes_armv8/aes_armv8.cpp @@ -57,19 +57,18 @@ BOTAN_FUNC_ISA("+crypto") void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); - const uint8_t *mkey = reinterpret_cast(m_ME.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); while(blocks >= 4) { @@ -123,19 +122,18 @@ BOTAN_FUNC_ISA("+crypto") void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); - const uint8_t *mkey = reinterpret_cast(m_MD.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); while(blocks >= 4) { @@ -189,21 +187,20 @@ BOTAN_FUNC_ISA("+crypto") void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); - const uint8_t *mkey = reinterpret_cast(m_ME.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(skey + 160); - const uint8x16_t K11 = vld1q_u8(skey + 176); - const uint8x16_t K12 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); + const uint8x16_t K11 = vld1q_u8(skey + 11*16); + const uint8x16_t K12 = vld1q_u8(skey + 12*16); while(blocks >= 4) { @@ -261,21 +258,20 @@ BOTAN_FUNC_ISA("+crypto") void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); - const uint8_t *mkey = reinterpret_cast(m_MD.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(skey + 160); - const uint8x16_t K11 = vld1q_u8(skey + 176); - const uint8x16_t K12 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); + const uint8x16_t K11 = vld1q_u8(skey + 11*16); + const uint8x16_t K12 = vld1q_u8(skey + 12*16); while(blocks >= 4) { @@ -333,23 +329,22 @@ BOTAN_FUNC_ISA("+crypto") void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); - const uint8_t *mkey = reinterpret_cast(m_ME.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(skey + 160); - const uint8x16_t K11 = vld1q_u8(skey + 176); - const uint8x16_t K12 = vld1q_u8(skey + 192); - const uint8x16_t K13 = vld1q_u8(skey + 208); - const uint8x16_t K14 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); + const uint8x16_t K11 = vld1q_u8(skey + 11*16); + const uint8x16_t K12 = vld1q_u8(skey + 12*16); + const uint8x16_t K13 = vld1q_u8(skey + 13*16); + const uint8x16_t K14 = vld1q_u8(skey + 14*16); while(blocks >= 4) { @@ -411,23 +406,22 @@ BOTAN_FUNC_ISA("+crypto") void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); - const uint8_t *mkey = reinterpret_cast(m_MD.data()); - - const uint8x16_t K0 = vld1q_u8(skey + 0); - const uint8x16_t K1 = vld1q_u8(skey + 16); - const uint8x16_t K2 = vld1q_u8(skey + 32); - const uint8x16_t K3 = vld1q_u8(skey + 48); - const uint8x16_t K4 = vld1q_u8(skey + 64); - const uint8x16_t K5 = vld1q_u8(skey + 80); - const uint8x16_t K6 = vld1q_u8(skey + 96); - const uint8x16_t K7 = vld1q_u8(skey + 112); - const uint8x16_t K8 = vld1q_u8(skey + 128); - const uint8x16_t K9 = vld1q_u8(skey + 144); - const uint8x16_t K10 = vld1q_u8(skey + 160); - const uint8x16_t K11 = vld1q_u8(skey + 176); - const uint8x16_t K12 = vld1q_u8(skey + 192); - const uint8x16_t K13 = vld1q_u8(skey + 208); - const uint8x16_t K14 = vld1q_u8(mkey); + + const uint8x16_t K0 = vld1q_u8(skey + 0*16); + const uint8x16_t K1 = vld1q_u8(skey + 1*16); + const uint8x16_t K2 = vld1q_u8(skey + 2*16); + const uint8x16_t K3 = vld1q_u8(skey + 3*16); + const uint8x16_t K4 = vld1q_u8(skey + 4*16); + const uint8x16_t K5 = vld1q_u8(skey + 5*16); + const uint8x16_t K6 = vld1q_u8(skey + 6*16); + const uint8x16_t K7 = vld1q_u8(skey + 7*16); + const uint8x16_t K8 = vld1q_u8(skey + 8*16); + const uint8x16_t K9 = vld1q_u8(skey + 9*16); + const uint8x16_t K10 = vld1q_u8(skey + 10*16); + const uint8x16_t K11 = vld1q_u8(skey + 11*16); + const uint8x16_t K12 = vld1q_u8(skey + 12*16); + const uint8x16_t K13 = vld1q_u8(skey + 13*16); + const uint8x16_t K14 = vld1q_u8(skey + 14*16); while(blocks >= 4) { diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp index b9af23ef2..0dea7a953 100644 --- a/src/lib/block/aes/aes_power8/aes_power8.cpp +++ b/src/lib/block/aes/aes_power8/aes_power8.cpp @@ -124,7 +124,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const Altivec64x2 K7 = load_key(&m_EK[28]); const Altivec64x2 K8 = load_key(&m_EK[32]); const Altivec64x2 K9 = load_key(&m_EK[36]); - const Altivec64x2 K10 = load_block(m_ME.data()); + const Altivec64x2 K10 = load_key(&m_EK[40]); while(blocks >= 4) { @@ -178,7 +178,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) BOTAN_FUNC_ISA("crypto") void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { - const Altivec64x2 K0 = load_block(m_ME.data()); + const Altivec64x2 K0 = load_key(&m_EK[40]); const Altivec64x2 K1 = load_key(&m_EK[36]); const Altivec64x2 K2 = load_key(&m_EK[32]); const Altivec64x2 K3 = load_key(&m_EK[28]); @@ -254,7 +254,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const Altivec64x2 K9 = load_key(&m_EK[36]); const Altivec64x2 K10 = load_key(&m_EK[40]); const Altivec64x2 K11 = load_key(&m_EK[44]); - const Altivec64x2 K12 = load_block(m_ME.data()); + const Altivec64x2 K12 = load_key(&m_EK[48]); while(blocks >= 4) { @@ -312,7 +312,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) BOTAN_FUNC_ISA("crypto") void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { - const Altivec64x2 K0 = load_block(m_ME.data()); + const Altivec64x2 K0 = load_key(&m_EK[48]); const Altivec64x2 K1 = load_key(&m_EK[44]); const Altivec64x2 K2 = load_key(&m_EK[40]); const Altivec64x2 K3 = load_key(&m_EK[36]); @@ -396,7 +396,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const Altivec64x2 K11 = load_key(&m_EK[44]); const Altivec64x2 K12 = load_key(&m_EK[48]); const Altivec64x2 K13 = load_key(&m_EK[52]); - const Altivec64x2 K14 = load_block(m_ME.data()); + const Altivec64x2 K14 = load_key(&m_EK[56]); while(blocks >= 4) { @@ -458,7 +458,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) BOTAN_FUNC_ISA("crypto") void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { - const Altivec64x2 K0 = load_block(m_ME.data()); + const Altivec64x2 K0 = load_key(&m_EK[56]); const Altivec64x2 K1 = load_key(&m_EK[52]); const Altivec64x2 K2 = load_key(&m_EK[48]); const Altivec64x2 K3 = load_key(&m_EK[44]); -- cgit v1.2.3 From 213eca6b830787a98d3b274c76bd8eb86b1ce506 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 09:10:21 -0400 Subject: Consolidate the logic for hardware AES We can safely assume there is only one hardware instruction set. (Well, at least until VAES support is added, we can address that when it happens) --- src/lib/block/aes/aes.cpp | 202 ++++++---------------------- src/lib/block/aes/aes.h | 39 ++---- src/lib/block/aes/aes_armv8/aes_armv8.cpp | 12 +- src/lib/block/aes/aes_ni/aes_ni.cpp | 12 +- src/lib/block/aes/aes_power8/aes_power8.cpp | 12 +- src/lib/utils/cpuid/cpuid.h | 16 +++ 6 files changed, 81 insertions(+), 212 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 7732e0909..b10c7b249 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -13,6 +13,10 @@ namespace Botan { +#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI) + #define BOTAN_HAS_HW_AES_SUPPORT +#endif + namespace { /* @@ -721,31 +725,17 @@ void aes_key_schedule(const uint8_t key[], size_t length, size_t aes_parallelism() { -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) - { - return 4; - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return 4; - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return 4; + return 4; // pipelined } #endif #if defined(BOTAN_HAS_AES_VPERM) if(CPUID::has_vperm()) { - return 2; + return 2; // pipelined } #endif @@ -755,24 +745,10 @@ size_t aes_parallelism() const char* aes_provider() { -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) - { - return "aesni"; - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return "power8"; - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return "armv8"; + return "cpu"; } #endif @@ -800,24 +776,10 @@ void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_EK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) - { - return aesni_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return power8_encrypt_n(in, out, blocks); + return hw_aes_encrypt_n(in, out, blocks); } #endif @@ -835,24 +797,10 @@ void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_DK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) - { - return aesni_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return power8_decrypt_n(in, out, blocks); + return hw_aes_decrypt_n(in, out, blocks); } #endif @@ -875,16 +823,10 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return aes_key_schedule(key, length, m_EK, m_DK); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { + // POWER and ARM use the standard key schedule code return aes_key_schedule(key, length, m_EK, m_DK); } #endif @@ -909,24 +851,10 @@ void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_EK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return aesni_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) - { - return power8_encrypt_n(in, out, blocks); + return hw_aes_encrypt_n(in, out, blocks); } #endif @@ -944,24 +872,10 @@ void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_DK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) - { - return aesni_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return power8_decrypt_n(in, out, blocks); + return hw_aes_decrypt_n(in, out, blocks); } #endif @@ -984,16 +898,10 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return aes_key_schedule(key, length, m_EK, m_DK); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { + // POWER and ARM use the standard key schedule code return aes_key_schedule(key, length, m_EK, m_DK); } #endif @@ -1018,24 +926,10 @@ void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_EK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return aesni_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_encrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) - { - return power8_encrypt_n(in, out, blocks); + return hw_aes_encrypt_n(in, out, blocks); } #endif @@ -1053,24 +947,10 @@ void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_DK.empty() == false); -#if defined(BOTAN_HAS_AES_NI) - if(CPUID::has_aes_ni()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { - return aesni_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return armv8_decrypt_n(in, out, blocks); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) - { - return power8_decrypt_n(in, out, blocks); + return hw_aes_decrypt_n(in, out, blocks); } #endif @@ -1093,16 +973,10 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) } #endif -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) - { - return aes_key_schedule(key, length, m_EK, m_DK); - } -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - if(CPUID::has_power_crypto()) +#if defined(BOTAN_HAS_HW_AES_SUPPORT) + if(CPUID::has_hw_aes()) { + // POWER and ARM use the standard key schedule code return aes_key_schedule(key, length, m_EK, m_DK); } #endif diff --git a/src/lib/block/aes/aes.h b/src/lib/block/aes/aes.h index ef1c3a7ca..76248200d 100644 --- a/src/lib/block/aes/aes.h +++ b/src/lib/block/aes/aes.h @@ -40,19 +40,12 @@ class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, #endif #if defined(BOTAN_HAS_AES_NI) - void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; void aesni_key_schedule(const uint8_t key[], size_t length); #endif -#if defined(BOTAN_HAS_AES_ARMV8) - void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI) + void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; #endif secure_vector m_EK, m_DK; @@ -82,19 +75,12 @@ class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, #endif #if defined(BOTAN_HAS_AES_NI) - void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; void aesni_key_schedule(const uint8_t key[], size_t length); #endif -#if defined(BOTAN_HAS_AES_ARMV8) - void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI) + void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; #endif void key_schedule(const uint8_t key[], size_t length) override; @@ -127,19 +113,12 @@ class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, #endif #if defined(BOTAN_HAS_AES_NI) - void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; void aesni_key_schedule(const uint8_t key[], size_t length); #endif -#if defined(BOTAN_HAS_AES_ARMV8) - void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; -#endif - -#if defined(BOTAN_HAS_AES_POWER8) - void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; - void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI) + void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; #endif void key_schedule(const uint8_t key[], size_t length) override; diff --git a/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/src/lib/block/aes/aes_armv8/aes_armv8.cpp index 0cd81b283..9766bf88c 100644 --- a/src/lib/block/aes/aes_armv8/aes_armv8.cpp +++ b/src/lib/block/aes/aes_armv8/aes_armv8.cpp @@ -54,7 +54,7 @@ namespace Botan { * AES-128 Encryption */ BOTAN_FUNC_ISA("+crypto") -void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); @@ -119,7 +119,7 @@ void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-128 Decryption */ BOTAN_FUNC_ISA("+crypto") -void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); @@ -184,7 +184,7 @@ void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-192 Encryption */ BOTAN_FUNC_ISA("+crypto") -void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); @@ -255,7 +255,7 @@ void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-192 Decryption */ BOTAN_FUNC_ISA("+crypto") -void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); @@ -326,7 +326,7 @@ void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-256 Encryption */ BOTAN_FUNC_ISA("+crypto") -void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_EK.data()); @@ -403,7 +403,7 @@ void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-256 Decryption */ BOTAN_FUNC_ISA("+crypto") -void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const uint8_t *skey = reinterpret_cast(m_DK.data()); diff --git a/src/lib/block/aes/aes_ni/aes_ni.cpp b/src/lib/block/aes/aes_ni/aes_ni.cpp index 0160bc1ee..76c695f32 100644 --- a/src/lib/block/aes/aes_ni/aes_ni.cpp +++ b/src/lib/block/aes/aes_ni/aes_ni.cpp @@ -107,7 +107,7 @@ __m128i aes_256_key_expansion(__m128i key, __m128i key2) * AES-128 Encryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); @@ -184,7 +184,7 @@ void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-128 Decryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); @@ -314,7 +314,7 @@ void AES_128::aesni_key_schedule(const uint8_t key[], size_t) * AES-192 Encryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); @@ -397,7 +397,7 @@ void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-192 Decryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); @@ -530,7 +530,7 @@ void AES_192::aesni_key_schedule(const uint8_t key[], size_t) * AES-256 Encryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); @@ -619,7 +619,7 @@ void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) * AES-256 Decryption */ BOTAN_FUNC_ISA("ssse3,aes") -void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const __m128i* in_mm = reinterpret_cast(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp index 0dea7a953..02dca5fdc 100644 --- a/src/lib/block/aes/aes_power8/aes_power8.cpp +++ b/src/lib/block/aes/aes_power8/aes_power8.cpp @@ -112,7 +112,7 @@ inline void store_blocks(Altivec64x2 B0, Altivec64x2 B1, } BOTAN_FUNC_ISA("crypto") -void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[0]); const Altivec64x2 K1 = load_key(&m_EK[4]); @@ -176,7 +176,7 @@ void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) } BOTAN_FUNC_ISA("crypto") -void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[40]); const Altivec64x2 K1 = load_key(&m_EK[36]); @@ -240,7 +240,7 @@ void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) } BOTAN_FUNC_ISA("crypto") -void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[0]); const Altivec64x2 K1 = load_key(&m_EK[4]); @@ -310,7 +310,7 @@ void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) } BOTAN_FUNC_ISA("crypto") -void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[48]); const Altivec64x2 K1 = load_key(&m_EK[44]); @@ -380,7 +380,7 @@ void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) } BOTAN_FUNC_ISA("crypto") -void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[0]); const Altivec64x2 K1 = load_key(&m_EK[4]); @@ -456,7 +456,7 @@ void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) } BOTAN_FUNC_ISA("crypto") -void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const +void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { const Altivec64x2 K0 = load_key(&m_EK[56]); const Altivec64x2 K1 = load_key(&m_EK[52]); diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h index d9e6b97b3..d9e0a74a6 100644 --- a/src/lib/utils/cpuid/cpuid.h +++ b/src/lib/utils/cpuid/cpuid.h @@ -334,6 +334,22 @@ class BOTAN_PUBLIC_API(2,1) CPUID final #endif } + /** + * Check if the processor supports hardware AES instructions + */ + static bool has_hw_aes() + { +#if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY) + return has_aes_ni(); +#elif defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) + return has_arm_aes(); +#elif defined(BOTAN_TARGET_CPU_IS_PPC_FAMILY) + return has_power_crypto(); +#else + return false; +#endif + } + /** * Check if the processor supports carryless multiply * (CLMUL, PMULL) -- cgit v1.2.3 From 4583823284bc9e856b6c2a4d7bc48ffced89e066 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 09:23:46 -0400 Subject: Generalize byteswapping of AES keys for hw support Then POWER avoids having to swap on load. And the same code can even be used for AES-NI also --- src/lib/block/aes/aes.cpp | 19 +++++++------------ src/lib/block/aes/aes_power8/aes_power8.cpp | 21 +++++---------------- 2 files changed, 12 insertions(+), 28 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index b10c7b249..db9b68f38 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -644,7 +644,8 @@ inline uint32_t SE_word(uint32_t x) void aes_key_schedule(const uint8_t key[], size_t length, secure_vector& EK, - secure_vector& DK) + secure_vector& DK, + bool bswap_keys = false) { static const uint32_t RC[10] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, @@ -706,17 +707,14 @@ void aes_key_schedule(const uint8_t key[], size_t length, copy_mem(EK.data(), XEK.data(), EK.size()); copy_mem(DK.data(), XDK.data(), DK.size()); -#if defined(BOTAN_HAS_AES_ARMV8) - if(CPUID::has_arm_aes()) + if(bswap_keys) { - // ARM needs the subkeys to be byte reversed - + // HW AES on little endian needs the subkeys to be byte reversed for(size_t i = 0; i != EK.size(); ++i) EK[i] = reverse_bytes(EK[i]); for(size_t i = 0; i != DK.size(); ++i) DK[i] = reverse_bytes(DK[i]); } -#endif CT::unpoison(EK.data(), EK.size()); CT::unpoison(DK.data(), DK.size()); @@ -826,8 +824,7 @@ void AES_128::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_HW_AES_SUPPORT) if(CPUID::has_hw_aes()) { - // POWER and ARM use the standard key schedule code - return aes_key_schedule(key, length, m_EK, m_DK); + return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian()); } #endif @@ -901,8 +898,7 @@ void AES_192::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_HW_AES_SUPPORT) if(CPUID::has_hw_aes()) { - // POWER and ARM use the standard key schedule code - return aes_key_schedule(key, length, m_EK, m_DK); + return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian()); } #endif @@ -976,8 +972,7 @@ void AES_256::key_schedule(const uint8_t key[], size_t length) #if defined(BOTAN_HAS_HW_AES_SUPPORT) if(CPUID::has_hw_aes()) { - // POWER and ARM use the standard key schedule code - return aes_key_schedule(key, length, m_EK, m_DK); + return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian()); } #endif diff --git a/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/lib/block/aes/aes_power8/aes_power8.cpp index 02dca5fdc..18bc85933 100644 --- a/src/lib/block/aes/aes_power8/aes_power8.cpp +++ b/src/lib/block/aes/aes_power8/aes_power8.cpp @@ -24,22 +24,6 @@ typedef __vector unsigned char Altivec8x16; namespace { -inline Altivec64x2 load_key(const uint32_t key[]) - { - Altivec32x4 vec = vec_vsx_ld(0, key); - - if(CPUID::is_little_endian()) - { - const Altivec8x16 mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; - const Altivec8x16 zero = {0}; - return (Altivec64x2)vec_perm((Altivec8x16)vec, zero, mask); - } - else - { - return (Altivec64x2)vec; - } - } - inline Altivec8x16 reverse_vec(Altivec8x16 src) { if(CPUID::is_little_endian()) @@ -54,6 +38,11 @@ inline Altivec8x16 reverse_vec(Altivec8x16 src) } } +inline Altivec64x2 load_key(const uint32_t key[]) + { + return (Altivec64x2)reverse_vec((Altivec8x16)vec_vsx_ld(0, key));; + } + inline Altivec64x2 load_block(const uint8_t src[]) { return (Altivec64x2)reverse_vec(vec_vsx_ld(0, src)); -- cgit v1.2.3 From c06cb332d7039256a9d3c0c3aa4024e1e37b7626 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 09:48:11 -0400 Subject: Avoid allocating extra vector during AES key schedule This ended up being about 10% of the total cost. --- src/lib/block/aes/aes.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index db9b68f38..21f580641 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -660,43 +660,43 @@ void aes_key_schedule(const uint8_t key[], size_t length, CT::poison(key, length); - secure_vector XEK(length + 32); - secure_vector XDK(length + 32); + EK.resize(length + 32); + DK.resize(length + 32); for(size_t i = 0; i != X; ++i) - XEK[i] = load_be(key, i); + EK[i] = load_be(key, i); for(size_t i = X; i < 4*(rounds+1); i += X) { - XEK[i] = XEK[i-X] ^ RC[(i-X)/X] ^ rotl<8>(SE_word(XEK[i-1])); + EK[i] = EK[i-X] ^ RC[(i-X)/X] ^ rotl<8>(SE_word(EK[i-1])); for(size_t j = 1; j != X; ++j) { - XEK[i+j] = XEK[i+j-X]; + EK[i+j] = EK[i+j-X]; if(X == 8 && j == 4) - XEK[i+j] ^= SE_word(XEK[i+j-1]); + EK[i+j] ^= SE_word(EK[i+j-1]); else - XEK[i+j] ^= XEK[i+j-1]; + EK[i+j] ^= EK[i+j-1]; } } for(size_t i = 0; i != 4*(rounds+1); i += 4) { - XDK[i ] = XEK[4*rounds-i ]; - XDK[i+1] = XEK[4*rounds-i+1]; - XDK[i+2] = XEK[4*rounds-i+2]; - XDK[i+3] = XEK[4*rounds-i+3]; + DK[i ] = EK[4*rounds-i ]; + DK[i+1] = EK[4*rounds-i+1]; + DK[i+2] = EK[4*rounds-i+2]; + DK[i+3] = EK[4*rounds-i+3]; } for(size_t i = 4; i != length + 24; ++i) { - const uint8_t s0 = get_byte(0, XDK[i]); - const uint8_t s1 = get_byte(1, XDK[i]); - const uint8_t s2 = get_byte(2, XDK[i]); - const uint8_t s3 = get_byte(3, XDK[i]); + const uint8_t s0 = get_byte(0, DK[i]); + const uint8_t s1 = get_byte(1, DK[i]); + const uint8_t s2 = get_byte(2, DK[i]); + const uint8_t s3 = get_byte(3, DK[i]); - XDK[i] = InvMixColumn(s0) ^ + DK[i] = InvMixColumn(s0) ^ rotr<8>(InvMixColumn(s1)) ^ rotr<16>(InvMixColumn(s2)) ^ rotr<24>(InvMixColumn(s3)); @@ -704,8 +704,6 @@ void aes_key_schedule(const uint8_t key[], size_t length, EK.resize(length + 24 + 4); DK.resize(length + 24 + 4); - copy_mem(EK.data(), XEK.data(), EK.size()); - copy_mem(DK.data(), XDK.data(), DK.size()); if(bswap_keys) { -- cgit v1.2.3 From 4a895ebf662403cccb2451f9905dd105ca46fe13 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Wed, 6 May 2020 20:28:17 -0400 Subject: Avoid extra resize during AES key schedule --- src/lib/block/aes/aes.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 21f580641..b1792561f 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -660,8 +660,8 @@ void aes_key_schedule(const uint8_t key[], size_t length, CT::poison(key, length); - EK.resize(length + 32); - DK.resize(length + 32); + EK.resize(length + 28); + DK.resize(length + 28); for(size_t i = 0; i != X; ++i) EK[i] = load_be(key, i); @@ -670,7 +670,7 @@ void aes_key_schedule(const uint8_t key[], size_t length, { EK[i] = EK[i-X] ^ RC[(i-X)/X] ^ rotl<8>(SE_word(EK[i-1])); - for(size_t j = 1; j != X; ++j) + for(size_t j = 1; j != X && (i+j) < EK.size(); ++j) { EK[i+j] = EK[i+j-X]; @@ -689,7 +689,7 @@ void aes_key_schedule(const uint8_t key[], size_t length, DK[i+3] = EK[4*rounds-i+3]; } - for(size_t i = 4; i != length + 24; ++i) + for(size_t i = 4; i != DK.size() - 4; ++i) { const uint8_t s0 = get_byte(0, DK[i]); const uint8_t s1 = get_byte(1, DK[i]); @@ -702,9 +702,6 @@ void aes_key_schedule(const uint8_t key[], size_t length, rotr<24>(InvMixColumn(s3)); } - EK.resize(length + 24 + 4); - DK.resize(length + 24 + 4); - if(bswap_keys) { // HW AES on little endian needs the subkeys to be byte reversed -- cgit v1.2.3 From f66b3edf39de8c3d455d5aaee3ab444301a29db4 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Fri, 8 May 2020 05:40:07 -0400 Subject: Small optimizations for bitsliced AES Doing the shift rows using 64-bit words when available saves around 3 cpb on x86-64 --- src/lib/block/aes/aes.cpp | 80 ++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 35 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index b1792561f..0accba435 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -396,25 +396,49 @@ inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) inline void shift_rows(uint32_t B[8]) { + // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31 +#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT) + for(size_t i = 0; i != 8; i += 2) + { + uint64_t x = (static_cast(B[i]) << 32) | B[i+1]; + x = bit_permute_step(x, 0x0022331100223311, 2); + x = bit_permute_step(x, 0x0055005500550055, 1); + B[i] = static_cast(x >> 32); + B[i+1] = static_cast(x); + } +#else for(size_t i = 0; i != 8; ++i) { uint32_t x = B[i]; - // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31 - x = bit_permute_step(x, 0x00223311, 2); // Butterfly, stage 1 - x = bit_permute_step(x, 0x00550055, 1); // Butterfly, stage 0 + x = bit_permute_step(x, 0x00223311, 2); + x = bit_permute_step(x, 0x00550055, 1); B[i] = x; } +#endif } inline void inv_shift_rows(uint32_t B[8]) { + // Inverse of shift_rows, just inverting the steps + +#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT) + for(size_t i = 0; i != 8; i += 2) + { + uint64_t x = (static_cast(B[i]) << 32) | B[i+1]; + x = bit_permute_step(x, 0x0055005500550055, 1); + x = bit_permute_step(x, 0x0022331100223311, 2); + B[i] = static_cast(x >> 32); + B[i+1] = static_cast(x); + } +#else for(size_t i = 0; i != 8; ++i) { uint32_t x = B[i]; - x = bit_permute_step(x, 0x00550055, 1); // Butterfly, stage 0 - x = bit_permute_step(x, 0x00223311, 2); // Butterfly, stage 1 + x = bit_permute_step(x, 0x00550055, 1); + x = bit_permute_step(x, 0x00223311, 2); B[i] = x; } +#endif } inline void mix_columns(uint32_t B[8]) @@ -478,7 +502,7 @@ void inv_mix_columns(uint32_t B[8]) const uint32_t X13 = X9 ^ X4[i]; const uint32_t X14 = X8[i] ^ X4[i] ^ X2[i]; - B[i] = X14 ^ rotr<8>(X9) ^ rotr<24>(X11) ^ rotr<16>(X13); + B[i] = X14 ^ rotr<8>(X9) ^ rotr<16>(X13) ^ rotr<24>(X11); } } @@ -499,9 +523,11 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], ks_expand(&KS[8*i], EK.data(), 4*i + 4); } + const size_t BITSLICED_BLOCKS = 2; + while(blocks > 0) { - const size_t this_loop = (blocks >= 2) ? 2 : 1; + const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS); uint32_t B[8] = { 0 }; @@ -530,13 +556,10 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], for(size_t i = 0; i != 8; ++i) B[i] ^= EK[4*rounds + i % 4]; - if(this_loop == 2) - store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); - else - store_be(out, B[0], B[1], B[2], B[3]); + copy_out_be(out, this_loop*4*sizeof(uint32_t), B); - in += this_loop*16; - out += this_loop*16; + in += this_loop * 16; + out += this_loop * 16; blocks -= this_loop; } } @@ -557,9 +580,11 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, ks_expand(&KS[8*i], DK.data(), 4*i + 4); } + const size_t BITSLICED_BLOCKS = 2; + while(blocks > 0) { - const size_t this_loop = (blocks >= 2) ? 2 : 1; + const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS); uint32_t B[8] = { 0 }; @@ -588,13 +613,10 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, for(size_t i = 0; i != 8; ++i) B[i] ^= DK[4*rounds + i % 4]; - if(this_loop == 2) - store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]); - else - store_be(out, B[0], B[1], B[2], B[3]); + copy_out_be(out, this_loop*4*sizeof(uint32_t), B); - in += this_loop*16; - out += this_loop*16; + in += this_loop * 16; + out += this_loop * 16; blocks -= this_loop; } } @@ -613,31 +635,19 @@ inline uint32_t InvMixColumn(uint8_t s1) return make_uint32(s14, s9, s13, s11); } -inline uint32_t SE_word(uint32_t x) +uint32_t SE_word(uint32_t x) { uint32_t I[8] = { 0 }; - // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 - x = bit_permute_step(x, 0x00aa00aa, 7); // Bit index swap 0,3 - x = bit_permute_step(x, 0x0000cccc, 14); // Bit index swap 1,4 - x = bit_permute_step(x, 0x00f000f0, 4); // Bit index swap 2,3 - x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 - for(size_t i = 0; i != 8; ++i) - I[i] = (x >> (28-4*i)) & 0xF; + I[i] = (x >> (7-i)) & 0x01010101; AES_SBOX(I); x = 0; for(size_t i = 0; i != 8; ++i) - x = (x << 4) + (I[i] & 0xF); - - // 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31 - x = bit_permute_step(x, 0x0a0a0a0a, 3); // Bit index swap 0,2 - x = bit_permute_step(x, 0x00cc00cc, 6); // Bit index swap 1,3 - x = bit_permute_step(x, 0x0000f0f0, 12); // Bit index swap 2,4 - x = bit_permute_step(x, 0x0000ff00, 8); // Bit index swap 3,4 + x |= ((I[i] & 0x01010101) << (7-i)); return x; } -- cgit v1.2.3 From 5e2d037bff14b7125d53bd33307fc0cc2748f8c0 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Fri, 8 May 2020 06:07:31 -0400 Subject: Fewer magic constants --- src/lib/block/aes/aes.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'src/lib') diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 0accba435..e1a50a2a3 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -523,7 +523,8 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], ks_expand(&KS[8*i], EK.data(), 4*i + 4); } - const size_t BITSLICED_BLOCKS = 2; + const size_t BLOCK_SIZE = 16; + const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE; while(blocks > 0) { @@ -558,8 +559,8 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[], copy_out_be(out, this_loop*4*sizeof(uint32_t), B); - in += this_loop * 16; - out += this_loop * 16; + in += this_loop * BLOCK_SIZE; + out += this_loop * BLOCK_SIZE; blocks -= this_loop; } } @@ -580,7 +581,8 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, ks_expand(&KS[8*i], DK.data(), 4*i + 4); } - const size_t BITSLICED_BLOCKS = 2; + const size_t BLOCK_SIZE = 16; + const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE; while(blocks > 0) { @@ -615,8 +617,8 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, copy_out_be(out, this_loop*4*sizeof(uint32_t), B); - in += this_loop * 16; - out += this_loop * 16; + in += this_loop * BLOCK_SIZE; + out += this_loop * BLOCK_SIZE; blocks -= this_loop; } } -- cgit v1.2.3