author    Jack Lloyd <[email protected]>  2020-05-05 11:00:37 -0400
committer Jack Lloyd <[email protected]>  2020-05-06 04:26:55 -0400
commit    40bca8a33c51c2b057003a66c4e1a380ccf78a24 (patch)
tree      a3e38d173ddce9bc37f238951639860fb1ba7b5f /src
parent    03f7fe39da25fa8efba6eb3fbc1dbb1bb0512393 (diff)
Add constant time bitsliced AES encryption for CPUs without vperm or hardware
Diffstat (limited to 'src')
-rw-r--r--  src/lib/block/aes/aes.cpp     | 363
-rw-r--r--  src/tests/data/block/aes.vec  |  16
2 files changed, 202 insertions(+), 177 deletions(-)
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp
index 64205504f..7b3c9bc37 100644
--- a/src/lib/block/aes/aes.cpp
+++ b/src/lib/block/aes/aes.cpp
@@ -1,9 +1,6 @@
/*
-* AES
* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
*
-* Based on the public domain reference implementation by Paulo Baretto
-*
* Botan is released under the Simplified BSD License (see license.txt)
*/
@@ -15,69 +12,11 @@
#include <botan/internal/ct_utils.h>
#include <type_traits>
-/*
-* This implementation is based on table lookups which are known to be
-* vulnerable to timing and cache based side channel attacks. Some
-* countermeasures are used which may be helpful in some situations:
-*
-* - Only a single 256-word T-table is used, with rotations applied.
-* Most implementations use 4 (or sometimes 5) T-tables, which leaks
-* much more information via cache usage.
-*
-* - The TE and TD tables are computed at runtime to avoid flush+reload
-* attacks using clflush. As different processes will not share the
-* same underlying table data, an attacker can't manipulate another
-* processes cache lines via their shared reference to the library
-* read only segment. (However, prime+probe attacks are still possible.)
-*
-* - Each cache line of the lookup tables is accessed at the beginning
-* of each call to encrypt or decrypt. (See the Z variable below)
-*
-* If available SSSE3 or AES-NI are used instead of this version, as both
-* are faster and immune to side channel attacks.
-*
-* Some AES cache timing papers for reference:
-*
-* "Software mitigations to hedge AES against cache-based software side
-* channel vulnerabilities" https://eprint.iacr.org/2006/052.pdf
-*
-* "Cache Games - Bringing Access-Based Cache Attacks on AES to Practice"
-* http://www.ieee-security.org/TC/SP2011/PAPERS/2011/paper031.pdf
-*
-* "Cache-Collision Timing Attacks Against AES" Bonneau, Mironov
-* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.88.4753
-*/
-
namespace Botan {
namespace {
alignas(64)
-const uint8_t SE[256] = {
- 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B,
- 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
- 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26,
- 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
- 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2,
- 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
- 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED,
- 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
- 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F,
- 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
- 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC,
- 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
- 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14,
- 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
- 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D,
- 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
- 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F,
- 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
- 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11,
- 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
- 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F,
- 0xB0, 0x54, 0xBB, 0x16 };
-
-alignas(64)
const uint8_t SD[256] = {
0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E,
0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
@@ -103,7 +42,6 @@ const uint8_t SD[256] = {
0x55, 0x21, 0x0C, 0x7D };
inline constexpr uint8_t xtime(uint8_t s) { return static_cast<uint8_t>(s << 1) ^ ((s >> 7) * 0x1B); }
-inline constexpr uint8_t xtime3(uint8_t s) { return xtime(s) ^ s; }
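As a worked check of xtime (multiplication by {02} in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1), here is a hypothetical compile-time test, not part of the patch; the values follow the standard multiplication-by-x chain for {57} given in FIPS-197:

// xtime is constexpr above, so these checks can run at compile time
static_assert(xtime(0x57) == 0xAE, "plain shift, high bit was clear");
static_assert(xtime(0xAE) == 0x47, "0x5C ^ 0x1B, high bit was set");
static_assert(xtime(0x47) == 0x8E, "plain shift, high bit was clear");
static_assert(xtime(0x8E) == 0x07, "0x1C ^ 0x1B, high bit was set");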
inline uint32_t InvMixColumn(uint8_t s1)
{
@@ -118,8 +56,8 @@ inline uint32_t InvMixColumn(uint8_t s1)
}
/*
-This is a bitsliced AES sbox computation which can execute up
-to 32 parallel sbox computations.
+This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
+parallel.
The circuit is from "A depth-16 circuit for the AES S-box" by Boyar
and Peralta (https://eprint.iacr.org/2011/332.pdf)
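For readers new to the technique, here is a minimal, hypothetical packing helper (pack_bitsliced is not part of the patch, and the real bit_transpose below uses a different in-register layout): 32 input bytes are split into eight 32-bit bit-planes, so each AND/XOR gate of the Boyar-Peralta circuit then acts on 32 S-box inputs at once.

#include <cstddef>
#include <cstdint>

// Bit j of plane[b] holds bit b of input byte x[j]; evaluating a boolean
// S-box circuit over plane[0..7] therefore computes 32 S-box lookups at once.
void pack_bitsliced(const uint8_t x[32], uint32_t plane[8])
   {
   for(size_t b = 0; b != 8; ++b)
      {
      uint32_t p = 0;
      for(size_t j = 0; j != 32; ++j)
         p |= static_cast<uint32_t>((x[j] >> b) & 1) << j;
      plane[b] = p;
      }
   }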
@@ -318,64 +256,132 @@ inline uint32_t SE_word(uint32_t x)
return x;
}
-const uint32_t* AES_TE()
+inline void bit_transpose(uint32_t B[8])
{
- class TE_Table final
- {
- public:
- TE_Table()
- {
- uint32_t* p = reinterpret_cast<uint32_t*>(&data);
- for(size_t i = 0; i != 256; ++i)
- {
- const uint8_t s = SE[i];
- p[i] = make_uint32(xtime(s), s, s, xtime3(s));
- }
- }
+ swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
+ swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
+ swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
+ swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
+
+ swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
+ swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
+ swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
+ swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
+
+ swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
+ }
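bit_transpose is built from swap_bits, assumed here to be the usual masked delta-swap between two words (swap_bits_sketch below is an illustrative stand-in, not Botan's actual helper):

// Exchange the bits of y selected by mask with the bits of x selected by (mask << shift)
template<typename T>
inline void swap_bits_sketch(T& x, T& y, T mask, size_t shift)
   {
   const T swap = ((x >> shift) ^ y) & mask;
   x ^= (swap << shift);
   y ^= swap;
   }

With masks 0x55555555, 0x33333333, 0x0F0F0F0F and shifts 1, 2, 4, the three stages amount to the classic 8x8 bit-matrix transpose applied to each byte column of B[0..7].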
- const uint32_t* ptr() const
- {
- return reinterpret_cast<const uint32_t*>(&data);
- }
- private:
- std::aligned_storage<256*sizeof(uint32_t), 64>::type data;
- };
+inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r)
+ {
+ /*
+ This is bit_transpose of K[r..r+4] || K[r..r+4]; we can save some computation
+ because the first and second halves are the same data.
+ */
+ for(size_t i = 0; i != 4; ++i)
+ B[i] = K[r + i];
- static TE_Table table;
- return table.ptr();
+ swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
+ swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
+
+ swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
+ swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
+
+ B[4] = B[0];
+ B[5] = B[1];
+ B[6] = B[2];
+ B[7] = B[3];
+
+ swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
}
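Per the comment above, ks_expand is a specialization of bit_transpose for a duplicated input. A slower but equivalent reference formulation, useful for cross-checking (ks_expand_ref is illustrative, not part of the patch):

inline void ks_expand_ref(uint32_t B[8], const uint32_t K[], size_t r)
   {
   // Duplicate the four round key words, then run the generic transpose;
   // ks_expand itself skips the stage-1/stage-2 swaps that would be redundant.
   for(size_t i = 0; i != 4; ++i)
      B[i] = B[i + 4] = K[r + i];
   bit_transpose(B);
   }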
-const uint32_t* AES_TD()
+inline void shift_rows(uint32_t B[8])
{
- class TD_Table final
+ for(size_t i = 0; i != 8; ++i)
{
- public:
- TD_Table()
- {
- uint32_t* p = reinterpret_cast<uint32_t*>(&data);
- for(size_t i = 0; i != 256; ++i)
- {
- p[i] = InvMixColumn(SD[i]);
- }
- }
+ uint32_t x = B[i];
+ // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
+ x = bit_permute_step<uint32_t>(x, 0x00223311, 2); // Butterfly, stage 1
+ x = bit_permute_step<uint32_t>(x, 0x00550055, 1); // Butterfly, stage 0
+ B[i] = x;
+ }
+ }
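bit_permute_step is assumed here to be the standard single-word delta-swap (the sketch below is illustrative; Botan's helper may differ in spelling). With this definition, the two calls above with masks 0x00223311 and 0x00550055 reproduce the 32-bit permutation listed in the comment, which is ShiftRows expressed on the transposed bit-planes.

// Swap the bits of x selected by mask with the bits selected by (mask << shift)
template<typename T>
inline T bit_permute_step_sketch(T x, T mask, size_t shift)
   {
   const T swap = ((x >> shift) ^ x) & mask;
   return x ^ swap ^ (swap << shift);
   }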
- const uint32_t* ptr() const
- {
- return reinterpret_cast<const uint32_t*>(&data);
- }
- private:
- std::aligned_storage<256*sizeof(uint32_t), 64>::type data;
- };
+inline void mix_columns(uint32_t B[8])
+ {
+ /*
+ This is equivalent to what the T-table MixColumns looks like when you decompose it:
+
+ // carry high bits in B[0] to positions in 0x1b == 0b11011
+ const uint32_t X2[8] = {
+ B[1],
+ B[2],
+ B[3],
+ B[4] ^ B[0],
+ B[5] ^ B[0],
+ B[6],
+ B[7] ^ B[0],
+ B[0],
+ };
+ for(size_t i = 0; i != 8; i++)
+ {
+ const uint32_t X3 = B[i] ^ X2[i];
- static TD_Table table;
- return table.ptr();
- }
+ uint8_t b0 = get_byte(0, X2[i]) ^ get_byte(1, X3) ^ get_byte(2, B[i]) ^ get_byte(3, B[i]);
+ uint8_t b1 = get_byte(0, B[i]) ^ get_byte(1, X2[i]) ^ get_byte(2, X3) ^ get_byte(3, B[i]);
+ uint8_t b2 = get_byte(0, B[i]) ^ get_byte(1, B[i]) ^ get_byte(2, X2[i]) ^ get_byte(3, X3);
+ uint8_t b3 = get_byte(0, X3) ^ get_byte(1, B[i]) ^ get_byte(2, B[i]) ^ get_byte(3, X2[i]);
-#define AES_T(T, K, V0, V1, V2, V3) \
- (K ^ T[get_byte(0, V0)] ^ \
- rotr< 8>(T[get_byte(1, V1)]) ^ \
- rotr<16>(T[get_byte(2, V2)]) ^ \
- rotr<24>(T[get_byte(3, V3)]))
+ B[i] = make_uint32(b0, b1, b2, b3);
+ }
+
+ Notice that each byte of B[i], X2[i] and X3 is used once in each column, so
+ we can instead effect the selections by rotations and do the XORs in word units
+ instead of bytes. Unrolling and expanding the definition of X2 then combining
+ similar terms results in the expressions below. The end result is very
+ similar to the MixColumns found in section 4.4 and Appendix A of "Faster and
+ Timing-Attack Resistant AES-GCM" (https://eprint.iacr.org/2009/129.pdf) except
+ suited to our word size, and of course we cannot make use of word/byte shuffles
+ to perform the rotations.
+ */
+
+ const uint32_t R24[8] = {
+ rotr<24>(B[0]),
+ rotr<24>(B[1]),
+ rotr<24>(B[2]),
+ rotr<24>(B[3]),
+ rotr<24>(B[4]),
+ rotr<24>(B[5]),
+ rotr<24>(B[6]),
+ rotr<24>(B[7])
+ };
+
+ const uint32_t R8_16[8] = {
+ rotr<8>(B[0]) ^ rotr<16>(B[0]),
+ rotr<8>(B[1]) ^ rotr<16>(B[1]),
+ rotr<8>(B[2]) ^ rotr<16>(B[2]),
+ rotr<8>(B[3]) ^ rotr<16>(B[3]),
+ rotr<8>(B[4]) ^ rotr<16>(B[4]),
+ rotr<8>(B[5]) ^ rotr<16>(B[5]),
+ rotr<8>(B[6]) ^ rotr<16>(B[6]),
+ rotr<8>(B[7]) ^ rotr<16>(B[7])
+ };
+
+ const uint32_t B0 = B[1] ^ R24[0] ^ R24[1] ^ R8_16[0];
+ B[1] = B[2] ^ R24[1] ^ R24[2] ^ R8_16[1];
+ B[2] = B[3] ^ R24[2] ^ R24[3] ^ R8_16[2];
+ B[3] = B[0] ^ B[4] ^ R24[0] ^ R24[3] ^ R24[4] ^ R8_16[3];
+ B[4] = B[5] ^ B[0] ^ R24[0] ^ R24[4] ^ R24[5] ^ R8_16[4];
+ B[5] = B[6] ^ R24[5] ^ R24[6] ^ R8_16[5];
+ B[6] = B[7] ^ B[0] ^ R24[0] ^ R24[6] ^ R24[7] ^ R8_16[6];
+ B[7] = B[0] ^ R24[0] ^ R24[7] ^ R8_16[7];
+ B[0] = B0;
+ }
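As a cross-check on the word-level expressions above, the plain byte-wise MixColumns of a single column (a0, a1, a2, a3) from FIPS-197, written in terms of the xtime defined earlier, is as follows (mix_one_column is a reference sketch, not part of the patch):

// b0 = {02}a0 + {03}a1 + a2 + a3, and so on, with {03}x = xtime(x) ^ x
inline void mix_one_column(uint8_t& a0, uint8_t& a1, uint8_t& a2, uint8_t& a3)
   {
   const uint8_t r0 = xtime(a0) ^ (xtime(a1) ^ a1) ^ a2 ^ a3;
   const uint8_t r1 = a0 ^ xtime(a1) ^ (xtime(a2) ^ a2) ^ a3;
   const uint8_t r2 = a0 ^ a1 ^ xtime(a2) ^ (xtime(a3) ^ a3);
   const uint8_t r3 = (xtime(a0) ^ a0) ^ a1 ^ a2 ^ xtime(a3);
   a0 = r0; a1 = r1; a2 = r2; a3 = r3;
   }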
/*
* AES Encryption
@@ -386,71 +392,94 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[],
const secure_vector<uint8_t>& ME)
{
BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set");
+ BOTAN_ASSERT(EK.size() == 40 || EK.size() == 48 || EK.size() == 56, "Expected EK size");
- const size_t cache_line_size = CPUID::cache_line_size();
- const uint32_t* TE = AES_TE();
-
- // Hit every cache line of TE
- volatile uint32_t Z = 0;
- for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t))
+ uint32_t KS[56*2] = { 0 }; // actual maximum is EK.size() * 2
+ for(size_t i = 4; i < EK.size(); i += 4)
{
- Z |= TE[i];
+ ks_expand(&KS[2*(i-4)], EK.data(), i);
}
- Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce
- for(size_t i = 0; i < blocks; ++i)
+ while(blocks > 0)
{
- uint32_t T0, T1, T2, T3;
- load_be(in + 16*i, T0, T1, T2, T3);
+ const size_t this_loop = (blocks >= 2) ? 2 : 1;
- T0 ^= EK[0];
- T1 ^= EK[1];
- T2 ^= EK[2];
- T3 ^= EK[3];
+ uint32_t B[8] = { 0 };
- T0 ^= Z;
+ load_be(B, in, this_loop*4);
- uint32_t B0 = AES_T(TE, EK[4], T0, T1, T2, T3);
- uint32_t B1 = AES_T(TE, EK[5], T1, T2, T3, T0);
- uint32_t B2 = AES_T(TE, EK[6], T2, T3, T0, T1);
- uint32_t B3 = AES_T(TE, EK[7], T3, T0, T1, T2);
+ B[0] ^= EK[0];
+ B[1] ^= EK[1];
+ B[2] ^= EK[2];
+ B[3] ^= EK[3];
+ B[4] ^= EK[0];
+ B[5] ^= EK[1];
+ B[6] ^= EK[2];
+ B[7] ^= EK[3];
- for(size_t r = 2*4; r < EK.size(); r += 2*4)
+ bit_transpose(B);
+
+ for(size_t r = 4; r < EK.size(); r += 4)
{
- T0 = AES_T(TE, EK[r ], B0, B1, B2, B3);
- T1 = AES_T(TE, EK[r+1], B1, B2, B3, B0);
- T2 = AES_T(TE, EK[r+2], B2, B3, B0, B1);
- T3 = AES_T(TE, EK[r+3], B3, B0, B1, B2);
-
- B0 = AES_T(TE, EK[r+4], T0, T1, T2, T3);
- B1 = AES_T(TE, EK[r+5], T1, T2, T3, T0);
- B2 = AES_T(TE, EK[r+6], T2, T3, T0, T1);
- B3 = AES_T(TE, EK[r+7], T3, T0, T1, T2);
+ AES_SBOX(B);
+ shift_rows(B);
+ mix_columns(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= KS[2*(r-4) + i];
}
- /*
- * Use TE[x] >> 8 instead of SE[] so encryption only references a single
- * lookup table.
- */
- out[16*i+ 0] = static_cast<uint8_t>(TE[get_byte(0, B0)] >> 8) ^ ME[0];
- out[16*i+ 1] = static_cast<uint8_t>(TE[get_byte(1, B1)] >> 8) ^ ME[1];
- out[16*i+ 2] = static_cast<uint8_t>(TE[get_byte(2, B2)] >> 8) ^ ME[2];
- out[16*i+ 3] = static_cast<uint8_t>(TE[get_byte(3, B3)] >> 8) ^ ME[3];
- out[16*i+ 4] = static_cast<uint8_t>(TE[get_byte(0, B1)] >> 8) ^ ME[4];
- out[16*i+ 5] = static_cast<uint8_t>(TE[get_byte(1, B2)] >> 8) ^ ME[5];
- out[16*i+ 6] = static_cast<uint8_t>(TE[get_byte(2, B3)] >> 8) ^ ME[6];
- out[16*i+ 7] = static_cast<uint8_t>(TE[get_byte(3, B0)] >> 8) ^ ME[7];
- out[16*i+ 8] = static_cast<uint8_t>(TE[get_byte(0, B2)] >> 8) ^ ME[8];
- out[16*i+ 9] = static_cast<uint8_t>(TE[get_byte(1, B3)] >> 8) ^ ME[9];
- out[16*i+10] = static_cast<uint8_t>(TE[get_byte(2, B0)] >> 8) ^ ME[10];
- out[16*i+11] = static_cast<uint8_t>(TE[get_byte(3, B1)] >> 8) ^ ME[11];
- out[16*i+12] = static_cast<uint8_t>(TE[get_byte(0, B3)] >> 8) ^ ME[12];
- out[16*i+13] = static_cast<uint8_t>(TE[get_byte(1, B0)] >> 8) ^ ME[13];
- out[16*i+14] = static_cast<uint8_t>(TE[get_byte(2, B1)] >> 8) ^ ME[14];
- out[16*i+15] = static_cast<uint8_t>(TE[get_byte(3, B2)] >> 8) ^ ME[15];
+ // Final round:
+ AES_SBOX(B);
+ shift_rows(B);
+ bit_transpose(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= load_be<uint32_t>(ME.data(), i % 4);
+
+ if(this_loop == 2)
+ store_be(out, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]);
+ else
+ store_be(out, B[0], B[1], B[2], B[3]);
+
+ in += this_loop*16;
+ out += this_loop*16;
+ blocks -= this_loop;
}
}
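Since the loop above consumes two blocks per iteration, callers benefit most when they pass several blocks at once. A minimal usage sketch through Botan's public BlockCipher interface (assuming the create_or_throw / set_key / encrypt_n API of Botan 2; not part of the patch):

#include <botan/block_cipher.h>
#include <cstdint>
#include <vector>

int main()
   {
   auto aes = Botan::BlockCipher::create_or_throw("AES-128");
   const std::vector<uint8_t> key(16, 0x00);
   aes->set_key(key);

   // Four blocks in one call; the bitsliced path handles them two at a time
   std::vector<uint8_t> buf(4 * aes->block_size(), 0x00);
   aes->encrypt_n(buf.data(), buf.data(), 4);
   return 0;
   }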
+const uint32_t* AES_TD()
+ {
+ class TD_Table final
+ {
+ public:
+ TD_Table()
+ {
+ uint32_t* p = reinterpret_cast<uint32_t*>(&data);
+ for(size_t i = 0; i != 256; ++i)
+ {
+ p[i] = InvMixColumn(SD[i]);
+ }
+ }
+
+ const uint32_t* ptr() const
+ {
+ return reinterpret_cast<const uint32_t*>(&data);
+ }
+ private:
+ std::aligned_storage<256*sizeof(uint32_t), 64>::type data;
+ };
+
+ static TD_Table table;
+ return table.ptr();
+ }
+
+#define AES_T(T, K, V0, V1, V2, V3) \
+ (K ^ T[get_byte(0, V0)] ^ \
+ rotr< 8>(T[get_byte(1, V1)]) ^ \
+ rotr<16>(T[get_byte(2, V2)]) ^ \
+ rotr<24>(T[get_byte(3, V3)]))
+
/*
* AES Decryption
*/
@@ -642,7 +671,15 @@ size_t aes_parallelism()
}
#endif
- return 1;
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return 2;
+ }
+#endif
+
+ // bitsliced:
+ return 2;
}
const char* aes_provider()
diff --git a/src/tests/data/block/aes.vec b/src/tests/data/block/aes.vec
index 8cd7f4d85..e08df20d4 100644
--- a/src/tests/data/block/aes.vec
+++ b/src/tests/data/block/aes.vec
@@ -1033,20 +1033,8 @@ In = 00000000000000000000000000000000
Out = 0545AAD56DA2A97C3663D1432A3D1C84
Key = 00000000000000000000000000000000
-In = 80000000000000000000000000000000
-Out = 3AD78E726C1EC02B7EBFE92B23D9EC34
-
-Key = 00000000000000000000000000000000
-In = 40000000000000000000000000000000
-Out = 45BC707D29E8204D88DFBA2F0B0CAD9B
-
-Key = 00000000000000000000000000000000
-In = 20000000000000000000000000000000
-Out = 161556838018F52805CDBD6202002E3F
-
-Key = 00000000000000000000000000000000
-In = 10000000000000000000000000000000
-Out = F5569B3AB6A6D11EFDE1BF0A64C6854A
+In = 80000000000000000000000000000000400000000000000000000000000000002000000000000000000000000000000010000000000000000000000000000000
+Out = 3AD78E726C1EC02B7EBFE92B23D9EC3445BC707D29E8204D88DFBA2F0B0CAD9B161556838018F52805CDBD6202002E3FF5569B3AB6A6D11EFDE1BF0A64C6854A
Key = 00000000000000000000000000000000
In = 08000000000000000000000000000000