diff options
author | Jack Lloyd <[email protected]> | 2017-10-13 20:12:03 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-10-13 20:14:27 -0400 |
commit | 69706d9a4cbca0f502d6a810e1e1cbda280367a7 (patch) | |
tree | 675a89e33e9c118482654a562b4cdcaafc052682 /src/lib/block | |
parent | d717356bb4dd5d2ab1bf62bbdf9c26a7c223554c (diff) |
Optimizations for SM4
Using a larger table helps quite a bit. Using 4 tables (ala AES T-tables)
didn't seem to help much at all, it's only slightly faster than a single
table with rotations.
Continue to use the 8 bit table in the first and last rounds as a
countermeasure against cache attacks.
Diffstat (limited to 'src/lib/block')
-rw-r--r-- | src/lib/block/sm4/sm4.cpp | 129 |
1 files changed, 94 insertions, 35 deletions
diff --git a/src/lib/block/sm4/sm4.cpp b/src/lib/block/sm4/sm4.cpp index 42c865faf..2902d514c 100644 --- a/src/lib/block/sm4/sm4.cpp +++ b/src/lib/block/sm4/sm4.cpp @@ -10,9 +10,9 @@ namespace Botan { -namespace SM4_F { +namespace { -const uint8_t SBOX[256] = { +const uint8_t SM4_SBOX[256] = { 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, @@ -31,37 +31,92 @@ const uint8_t SBOX[256] = { 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 }; -inline uint32_t T(uint32_t b) +/* +* SM4_SBOX_T[j] == L(SM4_SBOX[j]). +*/ +const uint32_t SM4_SBOX_T[256] = { + 0x8ED55B5B, 0xD0924242, 0x4DEAA7A7, 0x06FDFBFB, 0xFCCF3333, 0x65E28787, + 0xC93DF4F4, 0x6BB5DEDE, 0x4E165858, 0x6EB4DADA, 0x44145050, 0xCAC10B0B, + 0x8828A0A0, 0x17F8EFEF, 0x9C2CB0B0, 0x11051414, 0x872BACAC, 0xFB669D9D, + 0xF2986A6A, 0xAE77D9D9, 0x822AA8A8, 0x46BCFAFA, 0x14041010, 0xCFC00F0F, + 0x02A8AAAA, 0x54451111, 0x5F134C4C, 0xBE269898, 0x6D482525, 0x9E841A1A, + 0x1E061818, 0xFD9B6666, 0xEC9E7272, 0x4A430909, 0x10514141, 0x24F7D3D3, + 0xD5934646, 0x53ECBFBF, 0xF89A6262, 0x927BE9E9, 0xFF33CCCC, 0x04555151, + 0x270B2C2C, 0x4F420D0D, 0x59EEB7B7, 0xF3CC3F3F, 0x1CAEB2B2, 0xEA638989, + 0x74E79393, 0x7FB1CECE, 0x6C1C7070, 0x0DABA6A6, 0xEDCA2727, 0x28082020, + 0x48EBA3A3, 0xC1975656, 0x80820202, 0xA3DC7F7F, 0xC4965252, 0x12F9EBEB, + 0xA174D5D5, 0xB38D3E3E, 0xC33FFCFC, 0x3EA49A9A, 0x5B461D1D, 0x1B071C1C, + 0x3BA59E9E, 0x0CFFF3F3, 0x3FF0CFCF, 0xBF72CDCD, 0x4B175C5C, 0x52B8EAEA, + 0x8F810E0E, 0x3D586565, 0xCC3CF0F0, 0x7D196464, 0x7EE59B9B, 0x91871616, + 0x734E3D3D, 0x08AAA2A2, 0xC869A1A1, 0xC76AADAD, 0x85830606, 0x7AB0CACA, + 0xB570C5C5, 0xF4659191, 0xB2D96B6B, 0xA7892E2E, 0x18FBE3E3, 0x47E8AFAF, + 0x330F3C3C, 0x674A2D2D, 0xB071C1C1, 0x0E575959, 0xE99F7676, 0xE135D4D4, + 0x661E7878, 0xB4249090, 0x360E3838, 0x265F7979, 0xEF628D8D, 0x38596161, + 0x95D24747, 0x2AA08A8A, 0xB1259494, 0xAA228888, 0x8C7DF1F1, 0xD73BECEC, + 0x05010404, 0xA5218484, 0x9879E1E1, 0x9B851E1E, 0x84D75353, 0x00000000, + 0x5E471919, 0x0B565D5D, 0xE39D7E7E, 0x9FD04F4F, 0xBB279C9C, 0x1A534949, + 0x7C4D3131, 0xEE36D8D8, 0x0A020808, 0x7BE49F9F, 0x20A28282, 0xD4C71313, + 0xE8CB2323, 0xE69C7A7A, 0x42E9ABAB, 0x43BDFEFE, 0xA2882A2A, 0x9AD14B4B, + 0x40410101, 0xDBC41F1F, 0xD838E0E0, 0x61B7D6D6, 0x2FA18E8E, 0x2BF4DFDF, + 0x3AF1CBCB, 0xF6CD3B3B, 0x1DFAE7E7, 0xE5608585, 0x41155454, 0x25A38686, + 0x60E38383, 0x16ACBABA, 0x295C7575, 0x34A69292, 0xF7996E6E, 0xE434D0D0, + 0x721A6868, 0x01545555, 0x19AFB6B6, 0xDF914E4E, 0xFA32C8C8, 0xF030C0C0, + 0x21F6D7D7, 0xBC8E3232, 0x75B3C6C6, 0x6FE08F8F, 0x691D7474, 0x2EF5DBDB, + 0x6AE18B8B, 0x962EB8B8, 0x8A800A0A, 0xFE679999, 0xE2C92B2B, 0xE0618181, + 0xC0C30303, 0x8D29A4A4, 0xAF238C8C, 0x07A9AEAE, 0x390D3434, 0x1F524D4D, + 0x764F3939, 0xD36EBDBD, 0x81D65757, 0xB7D86F6F, 0xEB37DCDC, 0x51441515, + 0xA6DD7B7B, 0x09FEF7F7, 0xB68C3A3A, 0x932FBCBC, 0x0F030C0C, 0x03FCFFFF, + 0xC26BA9A9, 0xBA73C9C9, 0xD96CB5B5, 0xDC6DB1B1, 0x375A6D6D, 0x15504545, + 0xB98F3636, 0x771B6C6C, 0x13ADBEBE, 0xDA904A4A, 0x57B9EEEE, 0xA9DE7777, + 0x4CBEF2F2, 0x837EFDFD, 0x55114444, 0xBDDA6767, 0x2C5D7171, 0x45400505, + 0x631F7C7C, 0x50104040, 0x325B6969, 0xB8DB6363, 0x220A2828, 0xC5C20707, + 0xF531C4C4, 0xA88A2222, 0x31A79696, 0xF9CE3737, 0x977AEDED, 0x49BFF6F6, + 0x992DB4B4, 0xA475D1D1, 0x90D34343, 0x5A124848, 0x58BAE2E2, 0x71E69797, + 0x64B6D2D2, 0x70B2C2C2, 0xAD8B2626, 0xCD68A5A5, 0xCB955E5E, 0x624B2929, + 0x3C0C3030, 0xCE945A5A, 0xAB76DDDD, 0x867FF9F9, 0xF1649595, 0x5DBBE6E6, + 0x35F2C7C7, 0x2D092424, 0xD1C61717, 0xD66FB9B9, 0xDEC51B1B, 0x94861212, + 0x78186060, 0x30F3C3C3, 0x897CF5F5, 0x5CEFB3B3, 0xD23AE8E8, 0xACDF7373, + 0x794C3535, 0xA0208080, 0x9D78E5E5, 0x56EDBBBB, 0x235E7D7D, 0xC63EF8F8, + 0x8BD45F5F, 0xE7C82F2F, 0xDD39E4E4, 0x68492121 }; + +inline uint32_t SM4_T_slow(uint32_t b) { - /* - TODO: could convert this to 4 8->32 bit tables combined with xor. - It was about 66% faster (110 MiB->181 MiB) on an i7-6700k, but the - large tables will be likely vulnerable to AES style cache attacks. - */ - - const uint8_t b0 = get_byte(0, b); - const uint8_t b1 = get_byte(1, b); - const uint8_t b2 = get_byte(2, b); - const uint8_t b3 = get_byte(3, b); - const uint32_t t = make_uint32(SBOX[b0], SBOX[b1], SBOX[b2], SBOX[b3]); + const uint32_t t = make_uint32(SM4_SBOX[get_byte(0,b)], + SM4_SBOX[get_byte(1,b)], + SM4_SBOX[get_byte(2,b)], + SM4_SBOX[get_byte(3,b)]); // L linear transform return t ^ rotl<2>(t) ^ rotl<10>(t) ^ rotl<18>(t) ^ rotl<24>(t); } +inline uint32_t SM4_T(uint32_t b) + { + return SM4_SBOX_T[get_byte(0,b)] ^ + rotr< 8>(SM4_SBOX_T[get_byte(1,b)]) ^ + rotr<16>(SM4_SBOX_T[get_byte(2,b)]) ^ + rotr<24>(SM4_SBOX_T[get_byte(3,b)]); + } + // Variant of T for key schedule -inline uint32_t Tp(uint32_t b) +inline uint32_t SM4_Tp(uint32_t b) { - const uint8_t b0 = get_byte(0, b); - const uint8_t b1 = get_byte(1, b); - const uint8_t b2 = get_byte(2, b); - const uint8_t b3 = get_byte(3, b); - const uint32_t t = make_uint32(SBOX[b0], SBOX[b1], SBOX[b2], SBOX[b3]); + const uint32_t t = make_uint32(SM4_SBOX[get_byte(0,b)], + SM4_SBOX[get_byte(1,b)], + SM4_SBOX[get_byte(2,b)], + SM4_SBOX[get_byte(3,b)]); // L' linear transform return t ^ rotl<13>(t) ^ rotl<23>(t); } +#define SM4_RNDS(k0,k1,k2,k3,F) do { \ + B0 ^= F(B1 ^ B2 ^ B3 ^ m_RK[k0]); \ + B1 ^= F(B0 ^ B2 ^ B3 ^ m_RK[k1]); \ + B2 ^= F(B0 ^ B1 ^ B3 ^ m_RK[k2]); \ + B3 ^= F(B0 ^ B1 ^ B2 ^ m_RK[k3]); \ + } while(0) + } /* @@ -76,13 +131,14 @@ void SM4::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const uint32_t B2 = load_be<uint32_t>(in, 2); uint32_t B3 = load_be<uint32_t>(in, 3); - for(size_t r = 0; r != 32; r += 4) - { - B0 ^= SM4_F::T(B1 ^ B2 ^ B3 ^ m_RK[r + 0]); - B1 ^= SM4_F::T(B0 ^ B2 ^ B3 ^ m_RK[r + 1]); - B2 ^= SM4_F::T(B0 ^ B1 ^ B3 ^ m_RK[r + 2]); - B3 ^= SM4_F::T(B0 ^ B1 ^ B2 ^ m_RK[r + 3]); - } + SM4_RNDS( 0, 1, 2, 3, SM4_T_slow); + SM4_RNDS( 4, 5, 6, 7, SM4_T); + SM4_RNDS( 8, 9, 10, 11, SM4_T); + SM4_RNDS(12, 13, 14, 15, SM4_T); + SM4_RNDS(16, 17, 18, 19, SM4_T); + SM4_RNDS(20, 21, 22, 23, SM4_T); + SM4_RNDS(24, 25, 26, 27, SM4_T); + SM4_RNDS(28, 29, 30, 31, SM4_T_slow); store_be(out, B3, B2, B1, B0); @@ -103,13 +159,14 @@ void SM4::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const uint32_t B2 = load_be<uint32_t>(in, 2); uint32_t B3 = load_be<uint32_t>(in, 3); - for(size_t r = 0; r != 32; r += 4) - { - B0 ^= SM4_F::T(B1 ^ B2 ^ B3 ^ m_RK[31 - (r + 0)]); - B1 ^= SM4_F::T(B0 ^ B2 ^ B3 ^ m_RK[31 - (r + 1)]); - B2 ^= SM4_F::T(B0 ^ B1 ^ B3 ^ m_RK[31 - (r + 2)]); - B3 ^= SM4_F::T(B0 ^ B1 ^ B2 ^ m_RK[31 - (r + 3)]); - } + SM4_RNDS(31, 30, 29, 28, SM4_T_slow); + SM4_RNDS(27, 26, 25, 24, SM4_T); + SM4_RNDS(23, 22, 21, 20, SM4_T); + SM4_RNDS(19, 18, 17, 16, SM4_T); + SM4_RNDS(15, 14, 13, 12, SM4_T); + SM4_RNDS(11, 10, 9, 8, SM4_T); + SM4_RNDS( 7, 6, 5, 4, SM4_T); + SM4_RNDS( 3, 2, 1, 0, SM4_T_slow); store_be(out, B3, B2, B1, B0); @@ -118,6 +175,8 @@ void SM4::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const } } +#undef SM4_RNDS + /* * SM4 Key Schedule */ @@ -146,7 +205,7 @@ void SM4::key_schedule(const uint8_t key[], size_t) m_RK.resize(32); for(size_t i = 0; i != 32; ++i) { - K[i % 4] ^= SM4_F::Tp(K[(i+1)%4] ^ K[(i+2)%4] ^ K[(i+3)%4] ^ CK[i]); + K[i % 4] ^= SM4_Tp(K[(i+1)%4] ^ K[(i+2)%4] ^ K[(i+3)%4] ^ CK[i]); m_RK[i] = K[i % 4]; } } |