aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/block
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2017-11-16 17:19:11 -0500
committerJack Lloyd <[email protected]>2017-11-16 18:59:56 -0500
commit5620a20509ba51b67d8329f2acab4242a733d2a5 (patch)
tree1e55ce5c1555d99d28e191986962616e859b1353 /src/lib/block
parent89a8317e3fce81df54c5664c19807e7fd416053f (diff)
Optimize Twofish
Interleaving two blocks is 40-50% faster for any mode that supports parallel operation.
Diffstat (limited to 'src/lib/block')
-rw-r--r--src/lib/block/twofish/twofish.cpp232
1 files changed, 156 insertions, 76 deletions
diff --git a/src/lib/block/twofish/twofish.cpp b/src/lib/block/twofish/twofish.cpp
index 496c31a36..3a508dc9d 100644
--- a/src/lib/block/twofish/twofish.cpp
+++ b/src/lib/block/twofish/twofish.cpp
@@ -1,6 +1,6 @@
/*
* Twofish
-* (C) 1999-2007 Jack Lloyd
+* (C) 1999-2007,2017 Jack Lloyd
*
* The key schedule implemenation is based on a public domain
* implementation by Matthew Skala
@@ -14,6 +14,48 @@
namespace Botan {
+namespace {
+
+inline void TF_E(uint32_t A, uint32_t B, uint32_t& C, uint32_t& D,
+ uint32_t RK1, uint32_t RK2,
+ const secure_vector<uint32_t>& SB)
+ {
+ uint32_t X = SB[ get_byte(3, A)] ^ SB[256+get_byte(2, A)] ^
+ SB[512+get_byte(1, A)] ^ SB[768+get_byte(0, A)];
+ uint32_t Y = SB[ get_byte(0, B)] ^ SB[256+get_byte(3, B)] ^
+ SB[512+get_byte(2, B)] ^ SB[768+get_byte(1, B)];
+
+ X += Y;
+ Y += X;
+
+ X += RK1;
+ Y += RK2;
+
+ C = rotr<1>(C ^ X);
+ D = rotl<1>(D) ^ Y;
+ }
+
+inline void TF_D(uint32_t A, uint32_t B, uint32_t& C, uint32_t& D,
+ uint32_t RK1, uint32_t RK2,
+ const secure_vector<uint32_t>& SB)
+ {
+ uint32_t X = SB[ get_byte(3, A)] ^ SB[256+get_byte(2, A)] ^
+ SB[512+get_byte(1, A)] ^ SB[768+get_byte(0, A)];
+ uint32_t Y = SB[ get_byte(0, B)] ^ SB[256+get_byte(3, B)] ^
+ SB[512+get_byte(2, B)] ^ SB[768+get_byte(1, B)];
+
+ X += Y;
+ Y += X;
+
+ X += RK1;
+ Y += RK2;
+
+ C = rotl<1>(C) ^ X;
+ D = rotr<1>(D ^ Y);
+ }
+
+}
+
/*
* Twofish Encryption
*/
@@ -21,41 +63,60 @@ void Twofish::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
verify_key_set(m_SB.empty() == false);
- BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
+ while(blocks >= 2)
+ {
+ uint32_t A0, B0, C0, D0;
+ uint32_t A1, B1, C1, D1;
+ load_le(in, A0, B0, C0, D0, A1, B1, C1, D1);
+
+ A0 ^= m_RK[0];
+ A1 ^= m_RK[0];
+ B0 ^= m_RK[1];
+ B1 ^= m_RK[1];
+ C0 ^= m_RK[2];
+ C1 ^= m_RK[2];
+ D0 ^= m_RK[3];
+ D1 ^= m_RK[3];
+
+ for(size_t k = 8; k != 40; k += 4)
+ {
+ TF_E(A0, B0, C0, D0, m_RK[k+0], m_RK[k+1], m_SB);
+ TF_E(A1, B1, C1, D1, m_RK[k+0], m_RK[k+1], m_SB);
+
+ TF_E(C0, D0, A0, B0, m_RK[k+2], m_RK[k+3], m_SB);
+ TF_E(C1, D1, A1, B1, m_RK[k+2], m_RK[k+3], m_SB);
+ }
+
+ C0 ^= m_RK[4];
+ C1 ^= m_RK[4];
+ D0 ^= m_RK[5];
+ D1 ^= m_RK[5];
+ A0 ^= m_RK[6];
+ A1 ^= m_RK[6];
+ B0 ^= m_RK[7];
+ B1 ^= m_RK[7];
+
+ store_le(out, C0, D0, A0, B0, C1, D1, A1, B1);
+
+ blocks -= 2;
+ out += 2*BLOCK_SIZE;
+ in += 2*BLOCK_SIZE;
+ }
+
+ if(blocks)
{
uint32_t A, B, C, D;
- load_le(in + BLOCK_SIZE*i, A, B, C, D);
+ load_le(in, A, B, C, D);
A ^= m_RK[0];
B ^= m_RK[1];
C ^= m_RK[2];
D ^= m_RK[3];
- for(size_t j = 0; j != 16; j += 2)
+ for(size_t k = 8; k != 40; k += 4)
{
- uint32_t X, Y;
-
- X = m_SB[ get_byte(3, A)] ^ m_SB[256+get_byte(2, A)] ^
- m_SB[512+get_byte(1, A)] ^ m_SB[768+get_byte(0, A)];
- Y = m_SB[ get_byte(0, B)] ^ m_SB[256+get_byte(3, B)] ^
- m_SB[512+get_byte(2, B)] ^ m_SB[768+get_byte(1, B)];
- X += Y;
- Y += X + m_RK[2*j + 9];
- X += m_RK[2*j + 8];
-
- C = rotr<1>(C ^ X);
- D = rotl<1>(D) ^ Y;
-
- X = m_SB[ get_byte(3, C)] ^ m_SB[256+get_byte(2, C)] ^
- m_SB[512+get_byte(1, C)] ^ m_SB[768+get_byte(0, C)];
- Y = m_SB[ get_byte(0, D)] ^ m_SB[256+get_byte(3, D)] ^
- m_SB[512+get_byte(2, D)] ^ m_SB[768+get_byte(1, D)];
- X += Y;
- Y += X + m_RK[2*j + 11];
- X += m_RK[2*j + 10];
-
- A = rotr<1>(A ^ X);
- B = rotl<1>(B) ^ Y;
+ TF_E(A, B, C, D, m_RK[k ], m_RK[k+1], m_SB);
+ TF_E(C, D, A, B, m_RK[k+2], m_RK[k+3], m_SB);
}
C ^= m_RK[4];
@@ -63,7 +124,7 @@ void Twofish::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
A ^= m_RK[6];
B ^= m_RK[7];
- store_le(out + BLOCK_SIZE*i, C, D, A, B);
+ store_le(out, C, D, A, B);
}
}
@@ -74,41 +135,60 @@ void Twofish::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
verify_key_set(m_SB.empty() == false);
- BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
+ while(blocks >= 2)
+ {
+ uint32_t A0, B0, C0, D0;
+ uint32_t A1, B1, C1, D1;
+ load_le(in, A0, B0, C0, D0, A1, B1, C1, D1);
+
+ A0 ^= m_RK[4];
+ A1 ^= m_RK[4];
+ B0 ^= m_RK[5];
+ B1 ^= m_RK[5];
+ C0 ^= m_RK[6];
+ C1 ^= m_RK[6];
+ D0 ^= m_RK[7];
+ D1 ^= m_RK[7];
+
+ for(size_t k = 40; k != 8; k -= 4)
+ {
+ TF_D(A0, B0, C0, D0, m_RK[k-2], m_RK[k-1], m_SB);
+ TF_D(A1, B1, C1, D1, m_RK[k-2], m_RK[k-1], m_SB);
+
+ TF_D(C0, D0, A0, B0, m_RK[k-4], m_RK[k-3], m_SB);
+ TF_D(C1, D1, A1, B1, m_RK[k-4], m_RK[k-3], m_SB);
+ }
+
+ C0 ^= m_RK[0];
+ C1 ^= m_RK[0];
+ D0 ^= m_RK[1];
+ D1 ^= m_RK[1];
+ A0 ^= m_RK[2];
+ A1 ^= m_RK[2];
+ B0 ^= m_RK[3];
+ B1 ^= m_RK[3];
+
+ store_le(out, C0, D0, A0, B0, C1, D1, A1, B1);
+
+ blocks -= 2;
+ out += 2*BLOCK_SIZE;
+ in += 2*BLOCK_SIZE;
+ }
+
+ if(blocks)
{
uint32_t A, B, C, D;
- load_le(in + BLOCK_SIZE*i, A, B, C, D);
+ load_le(in, A, B, C, D);
A ^= m_RK[4];
B ^= m_RK[5];
C ^= m_RK[6];
D ^= m_RK[7];
- for(size_t j = 0; j != 16; j += 2)
+ for(size_t k = 40; k != 8; k -= 4)
{
- uint32_t X, Y;
-
- X = m_SB[ get_byte(3, A)] ^ m_SB[256+get_byte(2, A)] ^
- m_SB[512+get_byte(1, A)] ^ m_SB[768+get_byte(0, A)];
- Y = m_SB[ get_byte(0, B)] ^ m_SB[256+get_byte(3, B)] ^
- m_SB[512+get_byte(2, B)] ^ m_SB[768+get_byte(1, B)];
- X += Y;
- Y += X + m_RK[39 - 2*j];
- X += m_RK[38 - 2*j];
-
- C = rotl<1>(C) ^ X;
- D = rotr<1>(D ^ Y);
-
- X = m_SB[ get_byte(3, C)] ^ m_SB[256+get_byte(2, C)] ^
- m_SB[512+get_byte(1, C)] ^ m_SB[768+get_byte(0, C)];
- Y = m_SB[ get_byte(0, D)] ^ m_SB[256+get_byte(3, D)] ^
- m_SB[512+get_byte(2, D)] ^ m_SB[768+get_byte(1, D)];
- X += Y;
- Y += X + m_RK[37 - 2*j];
- X += m_RK[36 - 2*j];
-
- A = rotl<1>(A) ^ X;
- B = rotr<1>(B ^ Y);
+ TF_D(A, B, C, D, m_RK[k-2], m_RK[k-1], m_SB);
+ TF_D(C, D, A, B, m_RK[k-4], m_RK[k-3], m_SB);
}
C ^= m_RK[0];
@@ -116,7 +196,7 @@ void Twofish::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
A ^= m_RK[2];
B ^= m_RK[3];
- store_le(out + BLOCK_SIZE*i, C, D, A, B);
+ store_le(out, C, D, A, B);
}
}
@@ -161,16 +241,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[i]^S[ 3]]^S[ 7]];
}
- BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
+ for(size_t i = 0; i < 40; i += 2)
{
uint32_t X = MDS0[Q0[Q0[i ]^key[ 8]]^key[ 0]] ^
- MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^
- MDS2[Q1[Q0[i ]^key[10]]^key[ 2]] ^
- MDS3[Q1[Q1[i ]^key[11]]^key[ 3]];
+ MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^
+ MDS2[Q1[Q0[i ]^key[10]]^key[ 2]] ^
+ MDS3[Q1[Q1[i ]^key[11]]^key[ 3]];
uint32_t Y = MDS0[Q0[Q0[i+1]^key[12]]^key[ 4]] ^
- MDS1[Q0[Q1[i+1]^key[13]]^key[ 5]] ^
- MDS2[Q1[Q0[i+1]^key[14]]^key[ 6]] ^
- MDS3[Q1[Q1[i+1]^key[15]]^key[ 7]];
+ MDS1[Q0[Q1[i+1]^key[13]]^key[ 5]] ^
+ MDS2[Q1[Q0[i+1]^key[14]]^key[ 6]] ^
+ MDS3[Q1[Q1[i+1]^key[15]]^key[ 7]];
Y = rotl<8>(Y);
X += Y; Y += X;
@@ -188,16 +268,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[Q0[i]^S[ 3]]^S[ 7]]^S[11]];
}
- BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
+ for(size_t i = 0; i < 40; i += 2)
{
uint32_t X = MDS0[Q0[Q0[Q1[i ]^key[16]]^key[ 8]]^key[ 0]] ^
- MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^
- MDS2[Q1[Q0[Q0[i ]^key[18]]^key[10]]^key[ 2]] ^
- MDS3[Q1[Q1[Q0[i ]^key[19]]^key[11]]^key[ 3]];
+ MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^
+ MDS2[Q1[Q0[Q0[i ]^key[18]]^key[10]]^key[ 2]] ^
+ MDS3[Q1[Q1[Q0[i ]^key[19]]^key[11]]^key[ 3]];
uint32_t Y = MDS0[Q0[Q0[Q1[i+1]^key[20]]^key[12]]^key[ 4]] ^
- MDS1[Q0[Q1[Q1[i+1]^key[21]]^key[13]]^key[ 5]] ^
- MDS2[Q1[Q0[Q0[i+1]^key[22]]^key[14]]^key[ 6]] ^
- MDS3[Q1[Q1[Q0[i+1]^key[23]]^key[15]]^key[ 7]];
+ MDS1[Q0[Q1[Q1[i+1]^key[21]]^key[13]]^key[ 5]] ^
+ MDS2[Q1[Q0[Q0[i+1]^key[22]]^key[14]]^key[ 6]] ^
+ MDS3[Q1[Q1[Q0[i+1]^key[23]]^key[15]]^key[ 7]];
Y = rotl<8>(Y);
X += Y; Y += X;
@@ -215,16 +295,16 @@ void Twofish::key_schedule(const uint8_t key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[Q0[Q1[i]^S[ 3]]^S[ 7]]^S[11]]^S[15]];
}
- BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
+ for(size_t i = 0; i < 40; i += 2)
{
uint32_t X = MDS0[Q0[Q0[Q1[Q1[i ]^key[24]]^key[16]]^key[ 8]]^key[ 0]] ^
- MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^
- MDS2[Q1[Q0[Q0[Q0[i ]^key[26]]^key[18]]^key[10]]^key[ 2]] ^
- MDS3[Q1[Q1[Q0[Q1[i ]^key[27]]^key[19]]^key[11]]^key[ 3]];
+ MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^
+ MDS2[Q1[Q0[Q0[Q0[i ]^key[26]]^key[18]]^key[10]]^key[ 2]] ^
+ MDS3[Q1[Q1[Q0[Q1[i ]^key[27]]^key[19]]^key[11]]^key[ 3]];
uint32_t Y = MDS0[Q0[Q0[Q1[Q1[i+1]^key[28]]^key[20]]^key[12]]^key[ 4]] ^
- MDS1[Q0[Q1[Q1[Q0[i+1]^key[29]]^key[21]]^key[13]]^key[ 5]] ^
- MDS2[Q1[Q0[Q0[Q0[i+1]^key[30]]^key[22]]^key[14]]^key[ 6]] ^
- MDS3[Q1[Q1[Q0[Q1[i+1]^key[31]]^key[23]]^key[15]]^key[ 7]];
+ MDS1[Q0[Q1[Q1[Q0[i+1]^key[29]]^key[21]]^key[13]]^key[ 5]] ^
+ MDS2[Q1[Q0[Q0[Q0[i+1]^key[30]]^key[22]]^key[14]]^key[ 6]] ^
+ MDS3[Q1[Q1[Q0[Q1[i+1]^key[31]]^key[23]]^key[15]]^key[ 7]];
Y = rotl<8>(Y);
X += Y; Y += X;