/* * Turing * (C) 1999-2008 Jack Lloyd * * Distributed under the terms of the Botan license */ #include #include #include namespace Botan { namespace { /* * Perform an N-way PHT */ inline void PHT(u32bit buf[], u32bit buf_size) { u32bit sum = 0; for(u32bit i = 0; i < buf_size - 1; ++i) sum += buf[i]; buf[buf_size-1] += sum; sum = buf[buf_size-1]; for(u32bit i = 0; i < buf_size - 1; ++i) buf[i] += sum; } } /* * Combine cipher stream with message */ void Turing::cipher(const byte in[], byte out[], u32bit length) { while(length >= buffer.size() - position) { xor_buf(out, in, buffer.begin() + position, buffer.size() - position); length -= (buffer.size() - position); in += (buffer.size() - position); out += (buffer.size() - position); generate(); } xor_buf(out, in, buffer.begin() + position, length); position += length; } /* * Generate cipher stream */ void Turing::generate() { // Table for Turing's polynomial multiplication static const u32bit MULT_TAB[256] = { 0x00000000, 0xD02B4367, 0xED5686CE, 0x3D7DC5A9, 0x97AC41D1, 0x478702B6, 0x7AFAC71F, 0xAAD18478, 0x631582EF, 0xB33EC188, 0x8E430421, 0x5E684746, 0xF4B9C33E, 0x24928059, 0x19EF45F0, 0xC9C40697, 0xC62A4993, 0x16010AF4, 0x2B7CCF5D, 0xFB578C3A, 0x51860842, 0x81AD4B25, 0xBCD08E8C, 0x6CFBCDEB, 0xA53FCB7C, 0x7514881B, 0x48694DB2, 0x98420ED5, 0x32938AAD, 0xE2B8C9CA, 0xDFC50C63, 0x0FEE4F04, 0xC154926B, 0x117FD10C, 0x2C0214A5, 0xFC2957C2, 0x56F8D3BA, 0x86D390DD, 0xBBAE5574, 0x6B851613, 0xA2411084, 0x726A53E3, 0x4F17964A, 0x9F3CD52D, 0x35ED5155, 0xE5C61232, 0xD8BBD79B, 0x089094FC, 0x077EDBF8, 0xD755989F, 0xEA285D36, 0x3A031E51, 0x90D29A29, 0x40F9D94E, 0x7D841CE7, 0xADAF5F80, 0x646B5917, 0xB4401A70, 0x893DDFD9, 0x59169CBE, 0xF3C718C6, 0x23EC5BA1, 0x1E919E08, 0xCEBADD6F, 0xCFA869D6, 0x1F832AB1, 0x22FEEF18, 0xF2D5AC7F, 0x58042807, 0x882F6B60, 0xB552AEC9, 0x6579EDAE, 0xACBDEB39, 0x7C96A85E, 0x41EB6DF7, 0x91C02E90, 0x3B11AAE8, 0xEB3AE98F, 0xD6472C26, 0x066C6F41, 0x09822045, 0xD9A96322, 0xE4D4A68B, 0x34FFE5EC, 0x9E2E6194, 0x4E0522F3, 0x7378E75A, 0xA353A43D, 0x6A97A2AA, 0xBABCE1CD, 0x87C12464, 0x57EA6703, 0xFD3BE37B, 0x2D10A01C, 0x106D65B5, 0xC04626D2, 0x0EFCFBBD, 0xDED7B8DA, 0xE3AA7D73, 0x33813E14, 0x9950BA6C, 0x497BF90B, 0x74063CA2, 0xA42D7FC5, 0x6DE97952, 0xBDC23A35, 0x80BFFF9C, 0x5094BCFB, 0xFA453883, 0x2A6E7BE4, 0x1713BE4D, 0xC738FD2A, 0xC8D6B22E, 0x18FDF149, 0x258034E0, 0xF5AB7787, 0x5F7AF3FF, 0x8F51B098, 0xB22C7531, 0x62073656, 0xABC330C1, 0x7BE873A6, 0x4695B60F, 0x96BEF568, 0x3C6F7110, 0xEC443277, 0xD139F7DE, 0x0112B4B9, 0xD31DD2E1, 0x03369186, 0x3E4B542F, 0xEE601748, 0x44B19330, 0x949AD057, 0xA9E715FE, 0x79CC5699, 0xB008500E, 0x60231369, 0x5D5ED6C0, 0x8D7595A7, 0x27A411DF, 0xF78F52B8, 0xCAF29711, 0x1AD9D476, 0x15379B72, 0xC51CD815, 0xF8611DBC, 0x284A5EDB, 0x829BDAA3, 0x52B099C4, 0x6FCD5C6D, 0xBFE61F0A, 0x7622199D, 0xA6095AFA, 0x9B749F53, 0x4B5FDC34, 0xE18E584C, 0x31A51B2B, 0x0CD8DE82, 0xDCF39DE5, 0x1249408A, 0xC26203ED, 0xFF1FC644, 0x2F348523, 0x85E5015B, 0x55CE423C, 0x68B38795, 0xB898C4F2, 0x715CC265, 0xA1778102, 0x9C0A44AB, 0x4C2107CC, 0xE6F083B4, 0x36DBC0D3, 0x0BA6057A, 0xDB8D461D, 0xD4630919, 0x04484A7E, 0x39358FD7, 0xE91ECCB0, 0x43CF48C8, 0x93E40BAF, 0xAE99CE06, 0x7EB28D61, 0xB7768BF6, 0x675DC891, 0x5A200D38, 0x8A0B4E5F, 0x20DACA27, 0xF0F18940, 0xCD8C4CE9, 0x1DA70F8E, 0x1CB5BB37, 0xCC9EF850, 0xF1E33DF9, 0x21C87E9E, 0x8B19FAE6, 0x5B32B981, 0x664F7C28, 0xB6643F4F, 0x7FA039D8, 0xAF8B7ABF, 0x92F6BF16, 0x42DDFC71, 0xE80C7809, 0x38273B6E, 0x055AFEC7, 0xD571BDA0, 0xDA9FF2A4, 0x0AB4B1C3, 0x37C9746A, 0xE7E2370D, 0x4D33B375, 0x9D18F012, 0xA06535BB, 0x704E76DC, 0xB98A704B, 0x69A1332C, 0x54DCF685, 0x84F7B5E2, 0x2E26319A, 0xFE0D72FD, 0xC370B754, 0x135BF433, 0xDDE1295C, 0x0DCA6A3B, 0x30B7AF92, 0xE09CECF5, 0x4A4D688D, 0x9A662BEA, 0xA71BEE43, 0x7730AD24, 0xBEF4ABB3, 0x6EDFE8D4, 0x53A22D7D, 0x83896E1A, 0x2958EA62, 0xF973A905, 0xC40E6CAC, 0x14252FCB, 0x1BCB60CF, 0xCBE023A8, 0xF69DE601, 0x26B6A566, 0x8C67211E, 0x5C4C6279, 0x6131A7D0, 0xB11AE4B7, 0x78DEE220, 0xA8F5A147, 0x958864EE, 0x45A32789, 0xEF72A3F1, 0x3F59E096, 0x0224253F, 0xD20F6658 }; /* I tried an implementation without precomputed LFSR offsets, since I thought that might allow (especially on x86-64) the use of leal to compute all the offsets.. However on my Core2 with GCC 4.3 it turned out significantly slower (238 Mib/s, versus 300 Mib/s with precomputed offsets) I also tried using byte vs u32bit for the offset variable (since x86 memory addressing modes can be odd), but it made things even slower (186 Mib/s) */ static const byte OFFSETS[221] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 14, 15, 16, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 2, 3, 4, 10, 11, 12, 13, 14, 15, 16, 0, 1, 5, 7, 8, 9, 15, 16, 0, 1, 2, 3, 4, 5, 6, 10, 12, 13, 14, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 0, 1, 2, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3, 5, 6, 7, 13, 14, 15, 16, 0, 1, 2, 3, 4, 8, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 16, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 3, 4, 5, 11, 12, 13, 14, 15, 16, 0, 1, 2, 6, 8, 9, 10, 16, 0, 1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 0, 4, 6, 7, 8, 14, 15, 16, 0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 16, 0, 1, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 4, 5, 6, 12, 13, 14, 15, 16, 0, 1, 2, 3, 7, 9, 10, 11 }; for(u32bit j = 0; j != 17; ++j) { const byte* R_off = OFFSETS + 13*j; u32bit R0 = R[R_off[0]]; u32bit R1 = R[R_off[1]]; u32bit R2 = R[R_off[2]]; u32bit R3 = R[R_off[3]]; u32bit R4 = R[R_off[4]]; const u32bit R5 = R[R_off[5]]; const u32bit R6 = R[R_off[6]]; const u32bit R7 = R[R_off[7]]; const u32bit R8 = R[R_off[8]]; const u32bit R9 = R[R_off[9]]; const u32bit R10 = R[R_off[10]]; const u32bit R11 = R[R_off[11]]; const u32bit R12 = R[R_off[12]]; R[R_off[0]] = R0 = ((R0 << 8) ^ MULT_TAB[(R0 >> 24) & 0xFF]) ^ R11 ^ R4; u32bit A = R0; u32bit B = R10; u32bit C = R7; u32bit D = R2; u32bit E = R1; E += A + B + C + D; A += E; B += E; C += E; D += E; A = S0[get_byte(0, A)] ^ S1[get_byte(1, A)] ^ S2[get_byte(2, A)] ^ S3[get_byte(3, A)]; B = S0[get_byte(1, B)] ^ S1[get_byte(2, B)] ^ S2[get_byte(3, B)] ^ S3[get_byte(0, B)]; C = S0[get_byte(2, C)] ^ S1[get_byte(3, C)] ^ S2[get_byte(0, C)] ^ S3[get_byte(1, C)]; D = S0[get_byte(3, D)] ^ S1[get_byte(0, D)] ^ S2[get_byte(1, D)] ^ S3[get_byte(2, D)]; E = S0[get_byte(0, E)] ^ S1[get_byte(1, E)] ^ S2[get_byte(2, E)] ^ S3[get_byte(3, E)]; E += A + B + C + D; A += E; B += E; C += E; D += E; R[R_off[1]] = R1 = ((R1 << 8) ^ MULT_TAB[(R1 >> 24) & 0xFF]) ^ R12 ^ R5; R[R_off[2]] = R2 = ((R2 << 8) ^ MULT_TAB[(R2 >> 24) & 0xFF]) ^ R0 ^ R6; R[R_off[3]] = ((R3 << 8) ^ MULT_TAB[(R3 >> 24) & 0xFF]) ^ R1 ^ R7; E += R4; R[R_off[4]] = ((R4 << 8) ^ MULT_TAB[(R4 >> 24) & 0xFF]) ^ R2 ^ R8; A += R1; B += R12; C += R9; D += R5; store_be(A, buffer + 20*j + 0); store_be(B, buffer + 20*j + 4); store_be(C, buffer + 20*j + 8); store_be(D, buffer + 20*j + 12); store_be(E, buffer + 20*j + 16); } position = 0; } /* * Turing's byte mixing step */ u32bit Turing::fixedS(u32bit W) { for(u32bit j = 0; j != 4; ++j) { byte B = SBOX[get_byte(j, W)]; W ^= rotate_left(Q_BOX[B], j*8); W &= rotate_right(0x00FFFFFF, j*8); W |= B << (24-j*8); } return W; } /* * Turing Key Schedule */ void Turing::key_schedule(const byte key[], u32bit length) { K.resize(length / 4); for(u32bit j = 0; j != length; ++j) K[j/4] = (K[j/4] << 8) + key[j]; for(u32bit j = 0; j != K.size(); ++j) K[j] = fixedS(K[j]); PHT(K, K.size()); for(u32bit i = 0; i != 256; ++i) { u32bit W0 = 0, C0 = i; u32bit W1 = 0, C1 = i; u32bit W2 = 0, C2 = i; u32bit W3 = 0, C3 = i; for(u32bit j = 0; j < K.size(); ++j) { C0 = SBOX[get_byte(0, K[j]) ^ C0]; C1 = SBOX[get_byte(1, K[j]) ^ C1]; C2 = SBOX[get_byte(2, K[j]) ^ C2]; C3 = SBOX[get_byte(3, K[j]) ^ C3]; W0 ^= rotate_left(Q_BOX[C0], j); W1 ^= rotate_left(Q_BOX[C1], j + 8); W2 ^= rotate_left(Q_BOX[C2], j + 16); W3 ^= rotate_left(Q_BOX[C3], j + 24); } S0[i] = (W0 & 0x00FFFFFF) | (C0 << 24); S1[i] = (W1 & 0xFF00FFFF) | (C1 << 16); S2[i] = (W2 & 0xFFFF00FF) | (C2 << 8); S3[i] = (W3 & 0xFFFFFF00) | C3; } set_iv(0, 0); } /* * Resynchronization */ void Turing::set_iv(const byte iv[], u32bit length) { if(!valid_iv_length(length)) throw Invalid_IV_Length(name(), length); SecureVector IV(length / 4); for(u32bit i = 0; i != length; ++i) IV[i/4] = (IV[i/4] << 8) + iv[i]; for(u32bit i = 0; i != IV.size(); ++i) R[i] = IV[i] = fixedS(IV[i]); for(u32bit i = 0; i != K.size(); ++i) R[i+IV.size()] = K[i]; R[K.size() + IV.size()] = (0x010203 << 8) | (K.size() << 4) | IV.size(); for(u32bit i = K.size() + IV.size() + 1; i != 17; ++i) { const u32bit W = R[i-K.size()-IV.size()-1] + R[i-1]; R[i] = S0[get_byte(0, W)] ^ S1[get_byte(1, W)] ^ S2[get_byte(2, W)] ^ S3[get_byte(3, W)]; } PHT(R, 17); generate(); } /* * Clear memory of sensitive data */ void Turing::clear() { S0.clear(); S1.clear(); S2.clear(); S3.clear(); buffer.clear(); position = 0; } }