aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/block
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2017-10-13 20:12:03 -0400
committerJack Lloyd <[email protected]>2017-10-13 20:14:27 -0400
commit69706d9a4cbca0f502d6a810e1e1cbda280367a7 (patch)
tree675a89e33e9c118482654a562b4cdcaafc052682 /src/lib/block
parentd717356bb4dd5d2ab1bf62bbdf9c26a7c223554c (diff)
Optimizations for SM4
Using a larger table helps quite a bit. Using 4 tables (ala AES T-tables) didn't seem to help much at all, it's only slightly faster than a single table with rotations. Continue to use the 8 bit table in the first and last rounds as a countermeasure against cache attacks.
Diffstat (limited to 'src/lib/block')
-rw-r--r--src/lib/block/sm4/sm4.cpp129
1 files changed, 94 insertions, 35 deletions
diff --git a/src/lib/block/sm4/sm4.cpp b/src/lib/block/sm4/sm4.cpp
index 42c865faf..2902d514c 100644
--- a/src/lib/block/sm4/sm4.cpp
+++ b/src/lib/block/sm4/sm4.cpp
@@ -10,9 +10,9 @@
namespace Botan {
-namespace SM4_F {
+namespace {
-const uint8_t SBOX[256] = {
+const uint8_t SM4_SBOX[256] = {
0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
@@ -31,37 +31,92 @@ const uint8_t SBOX[256] = {
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48
};
-inline uint32_t T(uint32_t b)
+/*
+* SM4_SBOX_T[j] == L(SM4_SBOX[j]).
+*/
+const uint32_t SM4_SBOX_T[256] = {
+ 0x8ED55B5B, 0xD0924242, 0x4DEAA7A7, 0x06FDFBFB, 0xFCCF3333, 0x65E28787,
+ 0xC93DF4F4, 0x6BB5DEDE, 0x4E165858, 0x6EB4DADA, 0x44145050, 0xCAC10B0B,
+ 0x8828A0A0, 0x17F8EFEF, 0x9C2CB0B0, 0x11051414, 0x872BACAC, 0xFB669D9D,
+ 0xF2986A6A, 0xAE77D9D9, 0x822AA8A8, 0x46BCFAFA, 0x14041010, 0xCFC00F0F,
+ 0x02A8AAAA, 0x54451111, 0x5F134C4C, 0xBE269898, 0x6D482525, 0x9E841A1A,
+ 0x1E061818, 0xFD9B6666, 0xEC9E7272, 0x4A430909, 0x10514141, 0x24F7D3D3,
+ 0xD5934646, 0x53ECBFBF, 0xF89A6262, 0x927BE9E9, 0xFF33CCCC, 0x04555151,
+ 0x270B2C2C, 0x4F420D0D, 0x59EEB7B7, 0xF3CC3F3F, 0x1CAEB2B2, 0xEA638989,
+ 0x74E79393, 0x7FB1CECE, 0x6C1C7070, 0x0DABA6A6, 0xEDCA2727, 0x28082020,
+ 0x48EBA3A3, 0xC1975656, 0x80820202, 0xA3DC7F7F, 0xC4965252, 0x12F9EBEB,
+ 0xA174D5D5, 0xB38D3E3E, 0xC33FFCFC, 0x3EA49A9A, 0x5B461D1D, 0x1B071C1C,
+ 0x3BA59E9E, 0x0CFFF3F3, 0x3FF0CFCF, 0xBF72CDCD, 0x4B175C5C, 0x52B8EAEA,
+ 0x8F810E0E, 0x3D586565, 0xCC3CF0F0, 0x7D196464, 0x7EE59B9B, 0x91871616,
+ 0x734E3D3D, 0x08AAA2A2, 0xC869A1A1, 0xC76AADAD, 0x85830606, 0x7AB0CACA,
+ 0xB570C5C5, 0xF4659191, 0xB2D96B6B, 0xA7892E2E, 0x18FBE3E3, 0x47E8AFAF,
+ 0x330F3C3C, 0x674A2D2D, 0xB071C1C1, 0x0E575959, 0xE99F7676, 0xE135D4D4,
+ 0x661E7878, 0xB4249090, 0x360E3838, 0x265F7979, 0xEF628D8D, 0x38596161,
+ 0x95D24747, 0x2AA08A8A, 0xB1259494, 0xAA228888, 0x8C7DF1F1, 0xD73BECEC,
+ 0x05010404, 0xA5218484, 0x9879E1E1, 0x9B851E1E, 0x84D75353, 0x00000000,
+ 0x5E471919, 0x0B565D5D, 0xE39D7E7E, 0x9FD04F4F, 0xBB279C9C, 0x1A534949,
+ 0x7C4D3131, 0xEE36D8D8, 0x0A020808, 0x7BE49F9F, 0x20A28282, 0xD4C71313,
+ 0xE8CB2323, 0xE69C7A7A, 0x42E9ABAB, 0x43BDFEFE, 0xA2882A2A, 0x9AD14B4B,
+ 0x40410101, 0xDBC41F1F, 0xD838E0E0, 0x61B7D6D6, 0x2FA18E8E, 0x2BF4DFDF,
+ 0x3AF1CBCB, 0xF6CD3B3B, 0x1DFAE7E7, 0xE5608585, 0x41155454, 0x25A38686,
+ 0x60E38383, 0x16ACBABA, 0x295C7575, 0x34A69292, 0xF7996E6E, 0xE434D0D0,
+ 0x721A6868, 0x01545555, 0x19AFB6B6, 0xDF914E4E, 0xFA32C8C8, 0xF030C0C0,
+ 0x21F6D7D7, 0xBC8E3232, 0x75B3C6C6, 0x6FE08F8F, 0x691D7474, 0x2EF5DBDB,
+ 0x6AE18B8B, 0x962EB8B8, 0x8A800A0A, 0xFE679999, 0xE2C92B2B, 0xE0618181,
+ 0xC0C30303, 0x8D29A4A4, 0xAF238C8C, 0x07A9AEAE, 0x390D3434, 0x1F524D4D,
+ 0x764F3939, 0xD36EBDBD, 0x81D65757, 0xB7D86F6F, 0xEB37DCDC, 0x51441515,
+ 0xA6DD7B7B, 0x09FEF7F7, 0xB68C3A3A, 0x932FBCBC, 0x0F030C0C, 0x03FCFFFF,
+ 0xC26BA9A9, 0xBA73C9C9, 0xD96CB5B5, 0xDC6DB1B1, 0x375A6D6D, 0x15504545,
+ 0xB98F3636, 0x771B6C6C, 0x13ADBEBE, 0xDA904A4A, 0x57B9EEEE, 0xA9DE7777,
+ 0x4CBEF2F2, 0x837EFDFD, 0x55114444, 0xBDDA6767, 0x2C5D7171, 0x45400505,
+ 0x631F7C7C, 0x50104040, 0x325B6969, 0xB8DB6363, 0x220A2828, 0xC5C20707,
+ 0xF531C4C4, 0xA88A2222, 0x31A79696, 0xF9CE3737, 0x977AEDED, 0x49BFF6F6,
+ 0x992DB4B4, 0xA475D1D1, 0x90D34343, 0x5A124848, 0x58BAE2E2, 0x71E69797,
+ 0x64B6D2D2, 0x70B2C2C2, 0xAD8B2626, 0xCD68A5A5, 0xCB955E5E, 0x624B2929,
+ 0x3C0C3030, 0xCE945A5A, 0xAB76DDDD, 0x867FF9F9, 0xF1649595, 0x5DBBE6E6,
+ 0x35F2C7C7, 0x2D092424, 0xD1C61717, 0xD66FB9B9, 0xDEC51B1B, 0x94861212,
+ 0x78186060, 0x30F3C3C3, 0x897CF5F5, 0x5CEFB3B3, 0xD23AE8E8, 0xACDF7373,
+ 0x794C3535, 0xA0208080, 0x9D78E5E5, 0x56EDBBBB, 0x235E7D7D, 0xC63EF8F8,
+ 0x8BD45F5F, 0xE7C82F2F, 0xDD39E4E4, 0x68492121 };
+
+inline uint32_t SM4_T_slow(uint32_t b)
{
- /*
- TODO: could convert this to 4 8->32 bit tables combined with xor.
- It was about 66% faster (110 MiB->181 MiB) on an i7-6700k, but the
- large tables will be likely vulnerable to AES style cache attacks.
- */
-
- const uint8_t b0 = get_byte(0, b);
- const uint8_t b1 = get_byte(1, b);
- const uint8_t b2 = get_byte(2, b);
- const uint8_t b3 = get_byte(3, b);
- const uint32_t t = make_uint32(SBOX[b0], SBOX[b1], SBOX[b2], SBOX[b3]);
+ const uint32_t t = make_uint32(SM4_SBOX[get_byte(0,b)],
+ SM4_SBOX[get_byte(1,b)],
+ SM4_SBOX[get_byte(2,b)],
+ SM4_SBOX[get_byte(3,b)]);
// L linear transform
return t ^ rotl<2>(t) ^ rotl<10>(t) ^ rotl<18>(t) ^ rotl<24>(t);
}
+inline uint32_t SM4_T(uint32_t b)
+ {
+ return SM4_SBOX_T[get_byte(0,b)] ^
+ rotr< 8>(SM4_SBOX_T[get_byte(1,b)]) ^
+ rotr<16>(SM4_SBOX_T[get_byte(2,b)]) ^
+ rotr<24>(SM4_SBOX_T[get_byte(3,b)]);
+ }
+
// Variant of T for key schedule
-inline uint32_t Tp(uint32_t b)
+inline uint32_t SM4_Tp(uint32_t b)
{
- const uint8_t b0 = get_byte(0, b);
- const uint8_t b1 = get_byte(1, b);
- const uint8_t b2 = get_byte(2, b);
- const uint8_t b3 = get_byte(3, b);
- const uint32_t t = make_uint32(SBOX[b0], SBOX[b1], SBOX[b2], SBOX[b3]);
+ const uint32_t t = make_uint32(SM4_SBOX[get_byte(0,b)],
+ SM4_SBOX[get_byte(1,b)],
+ SM4_SBOX[get_byte(2,b)],
+ SM4_SBOX[get_byte(3,b)]);
// L' linear transform
return t ^ rotl<13>(t) ^ rotl<23>(t);
}
+#define SM4_RNDS(k0,k1,k2,k3,F) do { \
+ B0 ^= F(B1 ^ B2 ^ B3 ^ m_RK[k0]); \
+ B1 ^= F(B0 ^ B2 ^ B3 ^ m_RK[k1]); \
+ B2 ^= F(B0 ^ B1 ^ B3 ^ m_RK[k2]); \
+ B3 ^= F(B0 ^ B1 ^ B2 ^ m_RK[k3]); \
+ } while(0)
+
}
/*
@@ -76,13 +131,14 @@ void SM4::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
uint32_t B2 = load_be<uint32_t>(in, 2);
uint32_t B3 = load_be<uint32_t>(in, 3);
- for(size_t r = 0; r != 32; r += 4)
- {
- B0 ^= SM4_F::T(B1 ^ B2 ^ B3 ^ m_RK[r + 0]);
- B1 ^= SM4_F::T(B0 ^ B2 ^ B3 ^ m_RK[r + 1]);
- B2 ^= SM4_F::T(B0 ^ B1 ^ B3 ^ m_RK[r + 2]);
- B3 ^= SM4_F::T(B0 ^ B1 ^ B2 ^ m_RK[r + 3]);
- }
+ SM4_RNDS( 0, 1, 2, 3, SM4_T_slow);
+ SM4_RNDS( 4, 5, 6, 7, SM4_T);
+ SM4_RNDS( 8, 9, 10, 11, SM4_T);
+ SM4_RNDS(12, 13, 14, 15, SM4_T);
+ SM4_RNDS(16, 17, 18, 19, SM4_T);
+ SM4_RNDS(20, 21, 22, 23, SM4_T);
+ SM4_RNDS(24, 25, 26, 27, SM4_T);
+ SM4_RNDS(28, 29, 30, 31, SM4_T_slow);
store_be(out, B3, B2, B1, B0);
@@ -103,13 +159,14 @@ void SM4::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
uint32_t B2 = load_be<uint32_t>(in, 2);
uint32_t B3 = load_be<uint32_t>(in, 3);
- for(size_t r = 0; r != 32; r += 4)
- {
- B0 ^= SM4_F::T(B1 ^ B2 ^ B3 ^ m_RK[31 - (r + 0)]);
- B1 ^= SM4_F::T(B0 ^ B2 ^ B3 ^ m_RK[31 - (r + 1)]);
- B2 ^= SM4_F::T(B0 ^ B1 ^ B3 ^ m_RK[31 - (r + 2)]);
- B3 ^= SM4_F::T(B0 ^ B1 ^ B2 ^ m_RK[31 - (r + 3)]);
- }
+ SM4_RNDS(31, 30, 29, 28, SM4_T_slow);
+ SM4_RNDS(27, 26, 25, 24, SM4_T);
+ SM4_RNDS(23, 22, 21, 20, SM4_T);
+ SM4_RNDS(19, 18, 17, 16, SM4_T);
+ SM4_RNDS(15, 14, 13, 12, SM4_T);
+ SM4_RNDS(11, 10, 9, 8, SM4_T);
+ SM4_RNDS( 7, 6, 5, 4, SM4_T);
+ SM4_RNDS( 3, 2, 1, 0, SM4_T_slow);
store_be(out, B3, B2, B1, B0);
@@ -118,6 +175,8 @@ void SM4::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
}
}
+#undef SM4_RNDS
+
/*
* SM4 Key Schedule
*/
@@ -146,7 +205,7 @@ void SM4::key_schedule(const uint8_t key[], size_t)
m_RK.resize(32);
for(size_t i = 0; i != 32; ++i)
{
- K[i % 4] ^= SM4_F::Tp(K[(i+1)%4] ^ K[(i+2)%4] ^ K[(i+3)%4] ^ CK[i]);
+ K[i % 4] ^= SM4_Tp(K[(i+1)%4] ^ K[(i+2)%4] ^ K[(i+3)%4] ^ CK[i]);
m_RK[i] = K[i % 4];
}
}