diff options
author | Jack Lloyd <[email protected]> | 2017-11-16 17:19:11 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-11-16 18:59:56 -0500 |
commit | 5620a20509ba51b67d8329f2acab4242a733d2a5 (patch) | |
tree | 1e55ce5c1555d99d28e191986962616e859b1353 /src/lib/block | |
parent | 89a8317e3fce81df54c5664c19807e7fd416053f (diff) |
Optimize Twofish
Interleaving two blocks is 40-50% faster for any mode that supports
parallel operation.
Diffstat (limited to 'src/lib/block')
-rw-r--r-- | src/lib/block/twofish/twofish.cpp | 232 |
1 files changed, 156 insertions, 76 deletions
diff --git a/src/lib/block/twofish/twofish.cpp b/src/lib/block/twofish/twofish.cpp index 496c31a36..3a508dc9d 100644 --- a/src/lib/block/twofish/twofish.cpp +++ b/src/lib/block/twofish/twofish.cpp @@ -1,6 +1,6 @@ /* * Twofish -* (C) 1999-2007 Jack Lloyd +* (C) 1999-2007,2017 Jack Lloyd * * The key schedule implemenation is based on a public domain * implementation by Matthew Skala @@ -14,6 +14,48 @@ namespace Botan { +namespace { + +inline void TF_E(uint32_t A, uint32_t B, uint32_t& C, uint32_t& D, + uint32_t RK1, uint32_t RK2, + const secure_vector<uint32_t>& SB) + { + uint32_t X = SB[ get_byte(3, A)] ^ SB[256+get_byte(2, A)] ^ + SB[512+get_byte(1, A)] ^ SB[768+get_byte(0, A)]; + uint32_t Y = SB[ get_byte(0, B)] ^ SB[256+get_byte(3, B)] ^ + SB[512+get_byte(2, B)] ^ SB[768+get_byte(1, B)]; + + X += Y; + Y += X; + + X += RK1; + Y += RK2; + + C = rotr<1>(C ^ X); + D = rotl<1>(D) ^ Y; + } + +inline void TF_D(uint32_t A, uint32_t B, uint32_t& C, uint32_t& D, + uint32_t RK1, uint32_t RK2, + const secure_vector<uint32_t>& SB) + { + uint32_t X = SB[ get_byte(3, A)] ^ SB[256+get_byte(2, A)] ^ + SB[512+get_byte(1, A)] ^ SB[768+get_byte(0, A)]; + uint32_t Y = SB[ get_byte(0, B)] ^ SB[256+get_byte(3, B)] ^ + SB[512+get_byte(2, B)] ^ SB[768+get_byte(1, B)]; + + X += Y; + Y += X; + + X += RK1; + Y += RK2; + + C = rotl<1>(C) ^ X; + D = rotr<1>(D ^ Y); + } + +} + /* * Twofish Encryption */ @@ -21,41 +63,60 @@ void Twofish::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_SB.empty() == false); - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) + while(blocks >= 2) + { + uint32_t A0, B0, C0, D0; + uint32_t A1, B1, C1, D1; + load_le(in, A0, B0, C0, D0, A1, B1, C1, D1); + + A0 ^= m_RK[0]; + A1 ^= m_RK[0]; + B0 ^= m_RK[1]; + B1 ^= m_RK[1]; + C0 ^= m_RK[2]; + C1 ^= m_RK[2]; + D0 ^= m_RK[3]; + D1 ^= m_RK[3]; + + for(size_t k = 8; k != 40; k += 4) + { + TF_E(A0, B0, C0, D0, m_RK[k+0], m_RK[k+1], m_SB); + TF_E(A1, B1, C1, D1, m_RK[k+0], m_RK[k+1], m_SB); + + TF_E(C0, D0, A0, B0, m_RK[k+2], m_RK[k+3], m_SB); + TF_E(C1, D1, A1, B1, m_RK[k+2], m_RK[k+3], m_SB); + } + + C0 ^= m_RK[4]; + C1 ^= m_RK[4]; + D0 ^= m_RK[5]; + D1 ^= m_RK[5]; + A0 ^= m_RK[6]; + A1 ^= m_RK[6]; + B0 ^= m_RK[7]; + B1 ^= m_RK[7]; + + store_le(out, C0, D0, A0, B0, C1, D1, A1, B1); + + blocks -= 2; + out += 2*BLOCK_SIZE; + in += 2*BLOCK_SIZE; + } + + if(blocks) { uint32_t A, B, C, D; - load_le(in + BLOCK_SIZE*i, A, B, C, D); + load_le(in, A, B, C, D); A ^= m_RK[0]; B ^= m_RK[1]; C ^= m_RK[2]; D ^= m_RK[3]; - for(size_t j = 0; j != 16; j += 2) + for(size_t k = 8; k != 40; k += 4) { - uint32_t X, Y; - - X = m_SB[ get_byte(3, A)] ^ m_SB[256+get_byte(2, A)] ^ - m_SB[512+get_byte(1, A)] ^ m_SB[768+get_byte(0, A)]; - Y = m_SB[ get_byte(0, B)] ^ m_SB[256+get_byte(3, B)] ^ - m_SB[512+get_byte(2, B)] ^ m_SB[768+get_byte(1, B)]; - X += Y; - Y += X + m_RK[2*j + 9]; - X += m_RK[2*j + 8]; - - C = rotr<1>(C ^ X); - D = rotl<1>(D) ^ Y; - - X = m_SB[ get_byte(3, C)] ^ m_SB[256+get_byte(2, C)] ^ - m_SB[512+get_byte(1, C)] ^ m_SB[768+get_byte(0, C)]; - Y = m_SB[ get_byte(0, D)] ^ m_SB[256+get_byte(3, D)] ^ - m_SB[512+get_byte(2, D)] ^ m_SB[768+get_byte(1, D)]; - X += Y; - Y += X + m_RK[2*j + 11]; - X += m_RK[2*j + 10]; - - A = rotr<1>(A ^ X); - B = rotl<1>(B) ^ Y; + TF_E(A, B, C, D, m_RK[k ], m_RK[k+1], m_SB); + TF_E(C, D, A, B, m_RK[k+2], m_RK[k+3], m_SB); } C ^= m_RK[4]; @@ -63,7 +124,7 @@ void Twofish::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const A ^= m_RK[6]; B ^= m_RK[7]; - store_le(out + BLOCK_SIZE*i, C, D, A, B); + store_le(out, C, D, A, B); } } @@ -74,41 +135,60 @@ void Twofish::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_SB.empty() == false); - BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) + while(blocks >= 2) + { + uint32_t A0, B0, C0, D0; + uint32_t A1, B1, C1, D1; + load_le(in, A0, B0, C0, D0, A1, B1, C1, D1); + + A0 ^= m_RK[4]; + A1 ^= m_RK[4]; + B0 ^= m_RK[5]; + B1 ^= m_RK[5]; + C0 ^= m_RK[6]; + C1 ^= m_RK[6]; + D0 ^= m_RK[7]; + D1 ^= m_RK[7]; + + for(size_t k = 40; k != 8; k -= 4) + { + TF_D(A0, B0, C0, D0, m_RK[k-2], m_RK[k-1], m_SB); + TF_D(A1, B1, C1, D1, m_RK[k-2], m_RK[k-1], m_SB); + + TF_D(C0, D0, A0, B0, m_RK[k-4], m_RK[k-3], m_SB); + TF_D(C1, D1, A1, B1, m_RK[k-4], m_RK[k-3], m_SB); + } + + C0 ^= m_RK[0]; + C1 ^= m_RK[0]; + D0 ^= m_RK[1]; + D1 ^= m_RK[1]; + A0 ^= m_RK[2]; + A1 ^= m_RK[2]; + B0 ^= m_RK[3]; + B1 ^= m_RK[3]; + + store_le(out, C0, D0, A0, B0, C1, D1, A1, B1); + + blocks -= 2; + out += 2*BLOCK_SIZE; + in += 2*BLOCK_SIZE; + } + + if(blocks) { uint32_t A, B, C, D; - load_le(in + BLOCK_SIZE*i, A, B, C, D); + load_le(in, A, B, C, D); A ^= m_RK[4]; B ^= m_RK[5]; C ^= m_RK[6]; D ^= m_RK[7]; - for(size_t j = 0; j != 16; j += 2) + for(size_t k = 40; k != 8; k -= 4) { - uint32_t X, Y; - - X = m_SB[ get_byte(3, A)] ^ m_SB[256+get_byte(2, A)] ^ - m_SB[512+get_byte(1, A)] ^ m_SB[768+get_byte(0, A)]; - Y = m_SB[ get_byte(0, B)] ^ m_SB[256+get_byte(3, B)] ^ - m_SB[512+get_byte(2, B)] ^ m_SB[768+get_byte(1, B)]; - X += Y; - Y += X + m_RK[39 - 2*j]; - X += m_RK[38 - 2*j]; - - C = rotl<1>(C) ^ X; - D = rotr<1>(D ^ Y); - - X = m_SB[ get_byte(3, C)] ^ m_SB[256+get_byte(2, C)] ^ - m_SB[512+get_byte(1, C)] ^ m_SB[768+get_byte(0, C)]; - Y = m_SB[ get_byte(0, D)] ^ m_SB[256+get_byte(3, D)] ^ - m_SB[512+get_byte(2, D)] ^ m_SB[768+get_byte(1, D)]; - X += Y; - Y += X + m_RK[37 - 2*j]; - X += m_RK[36 - 2*j]; - - A = rotl<1>(A) ^ X; - B = rotr<1>(B ^ Y); + TF_D(A, B, C, D, m_RK[k-2], m_RK[k-1], m_SB); + TF_D(C, D, A, B, m_RK[k-4], m_RK[k-3], m_SB); } C ^= m_RK[0]; @@ -116,7 +196,7 @@ void Twofish::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const A ^= m_RK[2]; B ^= m_RK[3]; - store_le(out + BLOCK_SIZE*i, C, D, A, B); + store_le(out, C, D, A, B); } } @@ -161,16 +241,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[i]^S[ 3]]^S[ 7]]; } - BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) + for(size_t i = 0; i < 40; i += 2) { uint32_t X = MDS0[Q0[Q0[i ]^key[ 8]]^key[ 0]] ^ - MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^ - MDS2[Q1[Q0[i ]^key[10]]^key[ 2]] ^ - MDS3[Q1[Q1[i ]^key[11]]^key[ 3]]; + MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^ + MDS2[Q1[Q0[i ]^key[10]]^key[ 2]] ^ + MDS3[Q1[Q1[i ]^key[11]]^key[ 3]]; uint32_t Y = MDS0[Q0[Q0[i+1]^key[12]]^key[ 4]] ^ - MDS1[Q0[Q1[i+1]^key[13]]^key[ 5]] ^ - MDS2[Q1[Q0[i+1]^key[14]]^key[ 6]] ^ - MDS3[Q1[Q1[i+1]^key[15]]^key[ 7]]; + MDS1[Q0[Q1[i+1]^key[13]]^key[ 5]] ^ + MDS2[Q1[Q0[i+1]^key[14]]^key[ 6]] ^ + MDS3[Q1[Q1[i+1]^key[15]]^key[ 7]]; Y = rotl<8>(Y); X += Y; Y += X; @@ -188,16 +268,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[Q0[i]^S[ 3]]^S[ 7]]^S[11]]; } - BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) + for(size_t i = 0; i < 40; i += 2) { uint32_t X = MDS0[Q0[Q0[Q1[i ]^key[16]]^key[ 8]]^key[ 0]] ^ - MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^ - MDS2[Q1[Q0[Q0[i ]^key[18]]^key[10]]^key[ 2]] ^ - MDS3[Q1[Q1[Q0[i ]^key[19]]^key[11]]^key[ 3]]; + MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^ + MDS2[Q1[Q0[Q0[i ]^key[18]]^key[10]]^key[ 2]] ^ + MDS3[Q1[Q1[Q0[i ]^key[19]]^key[11]]^key[ 3]]; uint32_t Y = MDS0[Q0[Q0[Q1[i+1]^key[20]]^key[12]]^key[ 4]] ^ - MDS1[Q0[Q1[Q1[i+1]^key[21]]^key[13]]^key[ 5]] ^ - MDS2[Q1[Q0[Q0[i+1]^key[22]]^key[14]]^key[ 6]] ^ - MDS3[Q1[Q1[Q0[i+1]^key[23]]^key[15]]^key[ 7]]; + MDS1[Q0[Q1[Q1[i+1]^key[21]]^key[13]]^key[ 5]] ^ + MDS2[Q1[Q0[Q0[i+1]^key[22]]^key[14]]^key[ 6]] ^ + MDS3[Q1[Q1[Q0[i+1]^key[23]]^key[15]]^key[ 7]]; Y = rotl<8>(Y); X += Y; Y += X; @@ -215,16 +295,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[Q0[Q1[i]^S[ 3]]^S[ 7]]^S[11]]^S[15]]; } - BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) + for(size_t i = 0; i < 40; i += 2) { uint32_t X = MDS0[Q0[Q0[Q1[Q1[i ]^key[24]]^key[16]]^key[ 8]]^key[ 0]] ^ - MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^ - MDS2[Q1[Q0[Q0[Q0[i ]^key[26]]^key[18]]^key[10]]^key[ 2]] ^ - MDS3[Q1[Q1[Q0[Q1[i ]^key[27]]^key[19]]^key[11]]^key[ 3]]; + MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^ + MDS2[Q1[Q0[Q0[Q0[i ]^key[26]]^key[18]]^key[10]]^key[ 2]] ^ + MDS3[Q1[Q1[Q0[Q1[i ]^key[27]]^key[19]]^key[11]]^key[ 3]]; uint32_t Y = MDS0[Q0[Q0[Q1[Q1[i+1]^key[28]]^key[20]]^key[12]]^key[ 4]] ^ - MDS1[Q0[Q1[Q1[Q0[i+1]^key[29]]^key[21]]^key[13]]^key[ 5]] ^ - MDS2[Q1[Q0[Q0[Q0[i+1]^key[30]]^key[22]]^key[14]]^key[ 6]] ^ - MDS3[Q1[Q1[Q0[Q1[i+1]^key[31]]^key[23]]^key[15]]^key[ 7]]; + MDS1[Q0[Q1[Q1[Q0[i+1]^key[29]]^key[21]]^key[13]]^key[ 5]] ^ + MDS2[Q1[Q0[Q0[Q0[i+1]^key[30]]^key[22]]^key[14]]^key[ 6]] ^ + MDS3[Q1[Q1[Q0[Q1[i+1]^key[31]]^key[23]]^key[15]]^key[ 7]]; Y = rotl<8>(Y); X += Y; Y += X; |