aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2016-08-10 13:51:25 -0400
committer Jack Lloyd <[email protected]> 2016-11-26 12:34:15 -0500
commit 3bc46d79c4509cbf871f762e39a366e95e8342ce (patch)
tree 80359e3d3955412ae027393ec72db51456d5f575
parent 5372d0b499ad317ab3776c9ac92df866cc6a1e84 (diff)
Add Cilk/OpenMP support
-rwxr-xr-x configure.py 20
-rw-r--r-- src/build-data/buildh.in 46
-rw-r--r-- src/build-data/cc/clang.txt 2
-rw-r--r-- src/build-data/cc/gcc.txt 3
-rw-r--r-- src/lib/block/aes/aes.cpp 48
-rw-r--r-- src/lib/block/aes/aes_ssse3/aes_ssse3.cpp 4
-rw-r--r-- src/lib/block/blowfish/blowfish.cpp 22
-rw-r--r-- src/lib/block/camellia/camellia.cpp 22
-rw-r--r-- src/lib/block/cast/cast128.cpp 22
-rw-r--r-- src/lib/block/des/des.cpp 30
-rw-r--r-- src/lib/block/idea/idea.cpp 8
-rw-r--r-- src/lib/block/serpent/serpent.cpp 81
-rw-r--r-- src/lib/block/threefish/threefish.cpp 34
-rw-r--r-- src/lib/block/twofish/twofish.cpp 42
-rw-r--r-- src/lib/block/xtea/xtea.cpp 142
15 files changed, 275 insertions, 251 deletions
diff --git a/configure.py b/configure.py
index 0fabeed3e..789377129 100755
--- a/configure.py
+++ b/configure.py
@@ -323,6 +323,11 @@ def process_command_line(args):
build_group.add_option('--with-external-includedir', metavar='DIR', default='',
help='use DIR for external includes')
+ build_group.add_option('--with-openmp', default=False, action='store_true',
+ help='enable use of OpenMP')
+ build_group.add_option('--with-cilkplus', default=False, action='store_true',
+ help='enable use of Cilk Plus')
+
link_methods = ['symlink', 'hardlink', 'copy']
build_group.add_option('--link-method', default=None, metavar='METHOD',
choices=link_methods,
@@ -850,6 +855,11 @@ class ArchInfo(object):
if options.with_valgrind:
macros.append('HAS_VALGRIND')
+ if options.with_openmp:
+ macros.append('TARGET_HAS_OPENMP')
+ if options.with_cilkplus:
+ macros.append('TARGET_HAS_CILKPLUS')
+
return macros
class CompilerInfo(object):
@@ -953,6 +963,16 @@ class CompilerInfo(object):
raise Exception('No sanitizer handling for %s' % (self.basename))
abi_link.append(self.sanitizer_flags)
+ if options.with_openmp:
+ if 'openmp' not in self.mach_abi_linking:
+ raise Exception('No support for OpenMP for %s' % (self.basename))
+ abi_link.append(self.mach_abi_linking['openmp'])
+
+ if options.with_cilkplus:
+ if 'cilkplus' not in self.mach_abi_linking:
+ raise Exception('No support for Cilk Plus for %s' % (self.basename))
+ abi_link.append(self.mach_abi_linking['cilkplus'])
+
abi_flags = ' '.join(sorted(abi_link))
if options.cc_abi_flags != '':
diff --git a/src/build-data/buildh.in b/src/build-data/buildh.in
index 56b70e060..0702d1416 100644
--- a/src/build-data/buildh.in
+++ b/src/build-data/buildh.in
@@ -264,6 +264,52 @@ Each read generates 32 bits of output
#define BOTAN_NOEXCEPT noexcept
#endif
+#if !defined(BOTAN_PARALLEL_FOR)
+/* Keyword written in place of `for` on loops that may run in parallel; a definition made before this point is kept. */
+#if defined(BOTAN_TARGET_HAS_CILKPLUS)
+ #define BOTAN_PARALLEL_FOR _Cilk_for
+#elif defined(BOTAN_TARGET_HAS_OPENMP)
+ #define BOTAN_PARALLEL_FOR _Pragma("omp parallel for") for
+#else
+ #define BOTAN_PARALLEL_FOR for
+#endif
+/* Cilk Plus is checked first, then OpenMP; with neither enabled this is an ordinary for. */
+#endif
+
+#if !defined(BOTAN_PARALLEL_SIMD_FOR)
+/* Keyword written in place of `for` on loops that may be vectorized; a definition made before this point is kept. */
+#if defined(BOTAN_TARGET_HAS_CILKPLUS)
+ #define BOTAN_PARALLEL_SIMD_FOR _Pragma("simd") for
+#elif defined(BOTAN_TARGET_HAS_OPENMP)
+ #define BOTAN_PARALLEL_SIMD_FOR _Pragma("omp simd") for
+#elif defined(BOTAN_TARGET_COMPILER_IS_GCC)
+ #define BOTAN_PARALLEL_SIMD_FOR _Pragma("GCC ivdep") for
+#else
+ #define BOTAN_PARALLEL_SIMD_FOR for
+#endif
+/* Every branch must define BOTAN_PARALLEL_SIMD_FOR itself, otherwise users of this macro fail to compile on that branch. */
+#endif
+
+#if !defined(BOTAN_PARALLEL_SPAWN)
+/* Expands to _Cilk_spawn when Cilk Plus is enabled and to nothing otherwise; a definition made before this point is kept. */
+#if defined(BOTAN_TARGET_HAS_CILKPLUS)
+ #define BOTAN_PARALLEL_SPAWN _Cilk_spawn
+#else
+ #define BOTAN_PARALLEL_SPAWN
+#endif
+
+#endif
+
+#if !defined(BOTAN_PARALLEL_SYNC)
+/* Expands to _Cilk_sync when Cilk Plus is enabled, otherwise to BOTAN_FORCE_SEMICOLON (defined outside this hunk). */
+#if defined(BOTAN_TARGET_HAS_CILKPLUS)
+ #define BOTAN_PARALLEL_SYNC _Cilk_sync
+#else
+ #define BOTAN_PARALLEL_SYNC BOTAN_FORCE_SEMICOLON
+#endif
+
+#endif
+
/*
* Module availability definitions
*/
diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt
index c4a85658f..055315c3b 100644
--- a/src/build-data/cc/clang.txt
+++ b/src/build-data/cc/clang.txt
@@ -73,6 +73,8 @@ ivybridge -> "-march=core-avx-i"
<mach_abi_linking>
all -> "-pthread"
+openmp -> "-fopenmp"
+
x86_32 -> "-m32"
x86_64 -> "-m64"
ppc64 -> "-m64"
diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt
index b88454ce6..0a53e15c1 100644
--- a/src/build-data/cc/gcc.txt
+++ b/src/build-data/cc/gcc.txt
@@ -120,6 +120,9 @@ all_x86_64 -> "-momit-leaf-frame-pointer"
<mach_abi_linking>
all -> "-pthread -fstack-protector"
+cilkplus -> "-fcilkplus"
+openmp -> "-fopenmp"
+
mips64 -> "-mabi=64"
s390 -> "-m31"
s390x -> "-m64"
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp
index 39f5bd0db..8c7000135 100644
--- a/src/lib/block/aes/aes.cpp
+++ b/src/lib/block/aes/aes.cpp
@@ -168,12 +168,15 @@ void aes_encrypt_n(const byte in[], byte out[],
}
Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0];
- u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1];
- u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2];
- u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3];
+ u32bit T0, T1, T2, T3;
+ load_be(in + 16*i, T0, T1, T2, T3);
+
+ T0 ^= EK[0];
+ T1 ^= EK[1];
+ T2 ^= EK[2];
+ T3 ^= EK[3];
T0 ^= Z;
@@ -226,25 +229,22 @@ void aes_encrypt_n(const byte in[], byte out[],
TE[get_byte(2, T1) + 512] ^ TE[get_byte(3, T2) + 768];
}
- out[ 0] = SE[get_byte(0, B0)] ^ ME[0];
- out[ 1] = SE[get_byte(1, B1)] ^ ME[1];
- out[ 2] = SE[get_byte(2, B2)] ^ ME[2];
- out[ 3] = SE[get_byte(3, B3)] ^ ME[3];
- out[ 4] = SE[get_byte(0, B1)] ^ ME[4];
- out[ 5] = SE[get_byte(1, B2)] ^ ME[5];
- out[ 6] = SE[get_byte(2, B3)] ^ ME[6];
- out[ 7] = SE[get_byte(3, B0)] ^ ME[7];
- out[ 8] = SE[get_byte(0, B2)] ^ ME[8];
- out[ 9] = SE[get_byte(1, B3)] ^ ME[9];
- out[10] = SE[get_byte(2, B0)] ^ ME[10];
- out[11] = SE[get_byte(3, B1)] ^ ME[11];
- out[12] = SE[get_byte(0, B3)] ^ ME[12];
- out[13] = SE[get_byte(1, B0)] ^ ME[13];
- out[14] = SE[get_byte(2, B1)] ^ ME[14];
- out[15] = SE[get_byte(3, B2)] ^ ME[15];
-
- in += 16;
- out += 16;
+ out[16*i+ 0] = SE[get_byte(0, B0)] ^ ME[0];
+ out[16*i+ 1] = SE[get_byte(1, B1)] ^ ME[1];
+ out[16*i+ 2] = SE[get_byte(2, B2)] ^ ME[2];
+ out[16*i+ 3] = SE[get_byte(3, B3)] ^ ME[3];
+ out[16*i+ 4] = SE[get_byte(0, B1)] ^ ME[4];
+ out[16*i+ 5] = SE[get_byte(1, B2)] ^ ME[5];
+ out[16*i+ 6] = SE[get_byte(2, B3)] ^ ME[6];
+ out[16*i+ 7] = SE[get_byte(3, B0)] ^ ME[7];
+ out[16*i+ 8] = SE[get_byte(0, B2)] ^ ME[8];
+ out[16*i+ 9] = SE[get_byte(1, B3)] ^ ME[9];
+ out[16*i+10] = SE[get_byte(2, B0)] ^ ME[10];
+ out[16*i+11] = SE[get_byte(3, B1)] ^ ME[11];
+ out[16*i+12] = SE[get_byte(0, B3)] ^ ME[12];
+ out[16*i+13] = SE[get_byte(1, B0)] ^ ME[13];
+ out[16*i+14] = SE[get_byte(2, B1)] ^ ME[14];
+ out[16*i+15] = SE[get_byte(3, B2)] ^ ME[15];
}
}
diff --git a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp
index ef24795bb..d8c7e7314 100644
--- a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp
+++ b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp
@@ -355,7 +355,7 @@ void AES_128::ssse3_encrypt_n(const byte in[], byte out[], size_t blocks) const
CT::poison(in, blocks * block_size());
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
__m128i B = _mm_loadu_si128(in_mm + i);
_mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
@@ -378,7 +378,7 @@ void AES_128::ssse3_decrypt_n(const byte in[], byte out[], size_t blocks) const
CT::poison(in, blocks * block_size());
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
__m128i B = _mm_loadu_si128(in_mm + i);
_mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
diff --git a/src/lib/block/blowfish/blowfish.cpp b/src/lib/block/blowfish/blowfish.cpp
index 7a06cf797..69d345baa 100644
--- a/src/lib/block/blowfish/blowfish.cpp
+++ b/src/lib/block/blowfish/blowfish.cpp
@@ -202,10 +202,10 @@ void Blowfish::encrypt_n(const byte in[], byte out[], size_t blocks) const
const u32bit* S3 = &m_S[512];
const u32bit* S4 = &m_S[768];
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*i, L, R);
for(size_t j = 0; j != 16; j += 2)
{
@@ -220,10 +220,7 @@ void Blowfish::encrypt_n(const byte in[], byte out[], size_t blocks) const
L ^= m_P[16]; R ^= m_P[17];
- store_be(out, R, L);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*i, R, L);
}
}
@@ -237,10 +234,10 @@ void Blowfish::decrypt_n(const byte in[], byte out[], size_t blocks) const
const u32bit* S3 = &m_S[512];
const u32bit* S4 = &m_S[768];
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*i, L, R);
for(size_t j = 17; j != 1; j -= 2)
{
@@ -255,10 +252,7 @@ void Blowfish::decrypt_n(const byte in[], byte out[], size_t blocks) const
L ^= m_P[1]; R ^= m_P[0];
- store_be(out, R, L);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*i, R, L);
}
}
diff --git a/src/lib/block/camellia/camellia.cpp b/src/lib/block/camellia/camellia.cpp
index ac5d57d4e..5ac13b9ab 100644
--- a/src/lib/block/camellia/camellia.cpp
+++ b/src/lib/block/camellia/camellia.cpp
@@ -645,10 +645,10 @@ inline u64bit FLINV(u64bit v, u64bit K)
void encrypt(const byte in[], byte out[], size_t blocks,
const secure_vector<u64bit>& SK, const size_t rounds)
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u64bit D1 = load_be<u64bit>(in, 0);
- u64bit D2 = load_be<u64bit>(in, 1);
+ u64bit D1, D2;
+ load_be(in + 16*i, D1, D2);
const u64bit* K = SK.data();
@@ -676,10 +676,7 @@ void encrypt(const byte in[], byte out[], size_t blocks,
D2 ^= *K++;
D1 ^= *K++;
- store_be(out, D2, D1);
-
- in += 16;
- out += 16;
+ store_be(out + 16*i, D2, D1);
}
}
@@ -689,10 +686,10 @@ void encrypt(const byte in[], byte out[], size_t blocks,
void decrypt(const byte in[], byte out[], size_t blocks,
const secure_vector<u64bit>& SK, const size_t rounds)
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u64bit D1 = load_be<u64bit>(in, 0);
- u64bit D2 = load_be<u64bit>(in, 1);
+ u64bit D1, D2;
+ load_be(in + 16*i, D1, D2);
const u64bit* K = &SK[SK.size()-1];
@@ -720,10 +717,7 @@ void decrypt(const byte in[], byte out[], size_t blocks,
D1 ^= *K--;
D2 ^= *K;
- store_be(out, D2, D1);
-
- in += 16;
- out += 16;
+ store_be(out + 16*i, D2, D1);
}
}
diff --git a/src/lib/block/cast/cast128.cpp b/src/lib/block/cast/cast128.cpp
index 53f7d4611..96c4f45a7 100644
--- a/src/lib/block/cast/cast128.cpp
+++ b/src/lib/block/cast/cast128.cpp
@@ -50,10 +50,10 @@ inline void R3(u32bit& L, u32bit R, u32bit MK, byte RK)
*/
void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*i, L, R);
R1(L, R, m_MK[ 0], m_RK[ 0]);
R2(R, L, m_MK[ 1], m_RK[ 1]);
@@ -72,10 +72,7 @@ void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const
R3(L, R, m_MK[14], m_RK[14]);
R1(R, L, m_MK[15], m_RK[15]);
- store_be(out, R, L);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*i, R, L);
}
}
@@ -84,10 +81,10 @@ void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const
*/
void CAST_128::decrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*i, L, R);
R1(L, R, m_MK[15], m_RK[15]);
R3(R, L, m_MK[14], m_RK[14]);
@@ -106,10 +103,7 @@ void CAST_128::decrypt_n(const byte in[], byte out[], size_t blocks) const
R2(L, R, m_MK[ 1], m_RK[ 1]);
R1(R, L, m_MK[ 0], m_RK[ 0]);
- store_be(out, R, L);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*i, R, L);
}
}
diff --git a/src/lib/block/des/des.cpp b/src/lib/block/des/des.cpp
index 88671df8d..a55c43ec7 100644
--- a/src/lib/block/des/des.cpp
+++ b/src/lib/block/des/des.cpp
@@ -144,12 +144,12 @@ void des_decrypt(u32bit& L, u32bit& R,
*/
void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ for(size_t i = 0; i < blocks; ++i)
{
- u64bit T = (DES_IPTAB1[in[0]] ) | (DES_IPTAB1[in[1]] << 1) |
- (DES_IPTAB1[in[2]] << 2) | (DES_IPTAB1[in[3]] << 3) |
- (DES_IPTAB1[in[4]] << 4) | (DES_IPTAB1[in[5]] << 5) |
- (DES_IPTAB1[in[6]] << 6) | (DES_IPTAB2[in[7]] );
+ u64bit T = (DES_IPTAB1[in[8*i+0]] ) | (DES_IPTAB1[in[8*i+1]] << 1) |
+ (DES_IPTAB1[in[8*i+2]] << 2) | (DES_IPTAB1[in[8*i+3]] << 3) |
+ (DES_IPTAB1[in[8*i+4]] << 4) | (DES_IPTAB1[in[8*i+5]] << 5) |
+ (DES_IPTAB1[in[8*i+6]] << 6) | (DES_IPTAB2[in[8*i+7]] );
u32bit L = static_cast<u32bit>(T >> 32);
u32bit R = static_cast<u32bit>(T);
@@ -162,10 +162,7 @@ void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const
(DES_FPTAB1[get_byte(2, R)] ) | (DES_FPTAB2[get_byte(3, R)] );
T = rotate_left(T, 32);
- store_be(T, out);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(T, out + 8*i);
}
}
@@ -174,12 +171,12 @@ void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const
*/
void DES::decrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ for(size_t i = 0; i < blocks; ++i)
{
- u64bit T = (DES_IPTAB1[in[0]] ) | (DES_IPTAB1[in[1]] << 1) |
- (DES_IPTAB1[in[2]] << 2) | (DES_IPTAB1[in[3]] << 3) |
- (DES_IPTAB1[in[4]] << 4) | (DES_IPTAB1[in[5]] << 5) |
- (DES_IPTAB1[in[6]] << 6) | (DES_IPTAB2[in[7]] );
+ u64bit T = (DES_IPTAB1[in[BLOCK_SIZE*i+0]] ) | (DES_IPTAB1[in[BLOCK_SIZE*i+1]] << 1) |
+ (DES_IPTAB1[in[BLOCK_SIZE*i+2]] << 2) | (DES_IPTAB1[in[BLOCK_SIZE*i+3]] << 3) |
+ (DES_IPTAB1[in[BLOCK_SIZE*i+4]] << 4) | (DES_IPTAB1[in[BLOCK_SIZE*i+5]] << 5) |
+ (DES_IPTAB1[in[BLOCK_SIZE*i+6]] << 6) | (DES_IPTAB2[in[BLOCK_SIZE*i+7]] );
u32bit L = static_cast<u32bit>(T >> 32);
u32bit R = static_cast<u32bit>(T);
@@ -193,10 +190,7 @@ void DES::decrypt_n(const byte in[], byte out[], size_t blocks) const
T = rotate_left(T, 32);
- store_be(T, out);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(T, out + BLOCK_SIZE*i);
}
}
diff --git a/src/lib/block/idea/idea.cpp b/src/lib/block/idea/idea.cpp
index 85cc5e757..1fe25d599 100644
--- a/src/lib/block/idea/idea.cpp
+++ b/src/lib/block/idea/idea.cpp
@@ -67,12 +67,10 @@ void idea_op(const byte in[], byte out[], size_t blocks, const u16bit K[52])
CT::poison(out, blocks * 8);
CT::poison(K, 52);
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u16bit X1 = load_be<u16bit>(in + BLOCK_SIZE*i, 0);
- u16bit X2 = load_be<u16bit>(in + BLOCK_SIZE*i, 1);
- u16bit X3 = load_be<u16bit>(in + BLOCK_SIZE*i, 2);
- u16bit X4 = load_be<u16bit>(in + BLOCK_SIZE*i, 3);
+ u16bit X1, X2, X3, X4;
+ load_be(in + BLOCK_SIZE*i, X1, X2, X3, X4);
for(size_t j = 0; j != 8; ++j)
{
diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp
index 07088211d..a1326b888 100644
--- a/src/lib/block/serpent/serpent.cpp
+++ b/src/lib/block/serpent/serpent.cpp
@@ -70,12 +70,10 @@ void Serpent::encrypt_n(const byte in[], byte out[], size_t blocks) const
}
#endif
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_SIMD_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit B0 = load_le<u32bit>(in, 0);
- u32bit B1 = load_le<u32bit>(in, 1);
- u32bit B2 = load_le<u32bit>(in, 2);
- u32bit B3 = load_le<u32bit>(in, 3);
+ u32bit B0, B1, B2, B3;
+ load_le(in + 16*i, B0, B1, B2, B3);
key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -110,10 +108,7 @@ void Serpent::encrypt_n(const byte in[], byte out[], size_t blocks) const
key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
- store_le(out, B0, B1, B2, B3);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_le(out + 16*i, B0, B1, B2, B3);
}
}
@@ -135,12 +130,10 @@ void Serpent::decrypt_n(const byte in[], byte out[], size_t blocks) const
}
#endif
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_SIMD_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit B0 = load_le<u32bit>(in, 0);
- u32bit B1 = load_le<u32bit>(in, 1);
- u32bit B2 = load_le<u32bit>(in, 2);
- u32bit B3 = load_le<u32bit>(in, 3);
+ u32bit B0, B1, B2, B3;
+ load_le(in + 16*i, B0, B1, B2, B3);
key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
@@ -175,10 +168,7 @@ void Serpent::decrypt_n(const byte in[], byte out[], size_t blocks) const
i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
- store_le(out, B0, B1, B2, B3);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_le(out + 16*i, B0, B1, B2, B3);
}
}
@@ -205,24 +195,47 @@ void Serpent::key_schedule(const byte key[], size_t length)
W[i] = rotate_left(wi, 11);
}
- SBoxE4(W[ 8],W[ 9],W[ 10],W[ 11]); SBoxE3(W[ 12],W[ 13],W[ 14],W[ 15]);
- SBoxE2(W[ 16],W[ 17],W[ 18],W[ 19]); SBoxE1(W[ 20],W[ 21],W[ 22],W[ 23]);
- SBoxE8(W[ 24],W[ 25],W[ 26],W[ 27]); SBoxE7(W[ 28],W[ 29],W[ 30],W[ 31]);
- SBoxE6(W[ 32],W[ 33],W[ 34],W[ 35]); SBoxE5(W[ 36],W[ 37],W[ 38],W[ 39]);
- SBoxE4(W[ 40],W[ 41],W[ 42],W[ 43]); SBoxE3(W[ 44],W[ 45],W[ 46],W[ 47]);
- SBoxE2(W[ 48],W[ 49],W[ 50],W[ 51]); SBoxE1(W[ 52],W[ 53],W[ 54],W[ 55]);
- SBoxE8(W[ 56],W[ 57],W[ 58],W[ 59]); SBoxE7(W[ 60],W[ 61],W[ 62],W[ 63]);
- SBoxE6(W[ 64],W[ 65],W[ 66],W[ 67]); SBoxE5(W[ 68],W[ 69],W[ 70],W[ 71]);
- SBoxE4(W[ 72],W[ 73],W[ 74],W[ 75]); SBoxE3(W[ 76],W[ 77],W[ 78],W[ 79]);
- SBoxE2(W[ 80],W[ 81],W[ 82],W[ 83]); SBoxE1(W[ 84],W[ 85],W[ 86],W[ 87]);
- SBoxE8(W[ 88],W[ 89],W[ 90],W[ 91]); SBoxE7(W[ 92],W[ 93],W[ 94],W[ 95]);
- SBoxE6(W[ 96],W[ 97],W[ 98],W[ 99]); SBoxE5(W[100],W[101],W[102],W[103]);
- SBoxE4(W[104],W[105],W[106],W[107]); SBoxE3(W[108],W[109],W[110],W[111]);
- SBoxE2(W[112],W[113],W[114],W[115]); SBoxE1(W[116],W[117],W[118],W[119]);
- SBoxE8(W[120],W[121],W[122],W[123]); SBoxE7(W[124],W[125],W[126],W[127]);
- SBoxE6(W[128],W[129],W[130],W[131]); SBoxE5(W[132],W[133],W[134],W[135]);
+ SBoxE1(W[ 20],W[ 21],W[ 22],W[ 23]);
+ SBoxE1(W[ 52],W[ 53],W[ 54],W[ 55]);
+ SBoxE1(W[ 84],W[ 85],W[ 86],W[ 87]);
+ SBoxE1(W[116],W[117],W[118],W[119]);
+
+ SBoxE2(W[ 16],W[ 17],W[ 18],W[ 19]);
+ SBoxE2(W[ 48],W[ 49],W[ 50],W[ 51]);
+ SBoxE2(W[ 80],W[ 81],W[ 82],W[ 83]);
+ SBoxE2(W[112],W[113],W[114],W[115]);
+
+ SBoxE3(W[ 12],W[ 13],W[ 14],W[ 15]);
+ SBoxE3(W[ 44],W[ 45],W[ 46],W[ 47]);
+ SBoxE3(W[ 76],W[ 77],W[ 78],W[ 79]);
+ SBoxE3(W[108],W[109],W[110],W[111]);
+
+ SBoxE4(W[ 8],W[ 9],W[ 10],W[ 11]);
+ SBoxE4(W[ 40],W[ 41],W[ 42],W[ 43]);
+ SBoxE4(W[ 72],W[ 73],W[ 74],W[ 75]);
+ SBoxE4(W[104],W[105],W[106],W[107]);
SBoxE4(W[136],W[137],W[138],W[139]);
+ SBoxE5(W[ 36],W[ 37],W[ 38],W[ 39]);
+ SBoxE5(W[ 68],W[ 69],W[ 70],W[ 71]);
+ SBoxE5(W[100],W[101],W[102],W[103]);
+ SBoxE5(W[132],W[133],W[134],W[135]);
+
+ SBoxE6(W[ 32],W[ 33],W[ 34],W[ 35]);
+ SBoxE6(W[ 64],W[ 65],W[ 66],W[ 67]);
+ SBoxE6(W[ 96],W[ 97],W[ 98],W[ 99]);
+ SBoxE6(W[128],W[129],W[130],W[131]);
+
+ SBoxE7(W[ 28],W[ 29],W[ 30],W[ 31]);
+ SBoxE7(W[ 60],W[ 61],W[ 62],W[ 63]);
+ SBoxE7(W[ 92],W[ 93],W[ 94],W[ 95]);
+ SBoxE7(W[124],W[125],W[126],W[127]);
+
+ SBoxE8(W[ 24],W[ 25],W[ 26],W[ 27]);
+ SBoxE8(W[ 56],W[ 57],W[ 58],W[ 59]);
+ SBoxE8(W[ 88],W[ 89],W[ 90],W[ 91]);
+ SBoxE8(W[120],W[121],W[122],W[123]);
+
m_round_key.assign(W.begin() + 8, W.end());
}
diff --git a/src/lib/block/threefish/threefish.cpp b/src/lib/block/threefish/threefish.cpp
index f592021fb..2acdef020 100644
--- a/src/lib/block/threefish/threefish.cpp
+++ b/src/lib/block/threefish/threefish.cpp
@@ -122,16 +122,10 @@ void Threefish_512::encrypt_n(const byte in[], byte out[], size_t blocks) const
}
#endif
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u64bit X0 = load_le<u64bit>(in, 0);
- u64bit X1 = load_le<u64bit>(in, 1);
- u64bit X2 = load_le<u64bit>(in, 2);
- u64bit X3 = load_le<u64bit>(in, 3);
- u64bit X4 = load_le<u64bit>(in, 4);
- u64bit X5 = load_le<u64bit>(in, 5);
- u64bit X6 = load_le<u64bit>(in, 6);
- u64bit X7 = load_le<u64bit>(in, 7);
+ u64bit X0, X1, X2, X3, X4, X5, X6, X7;
+ load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7);
THREEFISH_INJECT_KEY(0);
@@ -145,10 +139,7 @@ void Threefish_512::encrypt_n(const byte in[], byte out[], size_t blocks) const
THREEFISH_ENC_8_ROUNDS(15,16);
THREEFISH_ENC_8_ROUNDS(17,18);
- store_le(out, X0, X1, X2, X3, X4, X5, X6, X7);
-
- in += 64;
- out += 64;
+ store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7);
}
}
@@ -211,16 +202,10 @@ void Threefish_512::decrypt_n(const byte in[], byte out[], size_t blocks) const
THREEFISH_INJECT_KEY(R2); \
} while(0)
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u64bit X0 = load_le<u64bit>(in, 0);
- u64bit X1 = load_le<u64bit>(in, 1);
- u64bit X2 = load_le<u64bit>(in, 2);
- u64bit X3 = load_le<u64bit>(in, 3);
- u64bit X4 = load_le<u64bit>(in, 4);
- u64bit X5 = load_le<u64bit>(in, 5);
- u64bit X6 = load_le<u64bit>(in, 6);
- u64bit X7 = load_le<u64bit>(in, 7);
+ u64bit X0, X1, X2, X3, X4, X5, X6, X7;
+ load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7);
THREEFISH_INJECT_KEY(18);
@@ -234,10 +219,7 @@ void Threefish_512::decrypt_n(const byte in[], byte out[], size_t blocks) const
THREEFISH_DEC_8_ROUNDS(3,2);
THREEFISH_DEC_8_ROUNDS(1,0);
- store_le(out, X0, X1, X2, X3, X4, X5, X6, X7);
-
- in += 64;
- out += 64;
+ store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7);
}
#undef THREEFISH_DEC_8_ROUNDS
diff --git a/src/lib/block/twofish/twofish.cpp b/src/lib/block/twofish/twofish.cpp
index 336d73a03..a98ae8e70 100644
--- a/src/lib/block/twofish/twofish.cpp
+++ b/src/lib/block/twofish/twofish.cpp
@@ -19,12 +19,15 @@ namespace Botan {
*/
void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit A = load_le<u32bit>(in, 0) ^ m_RK[0];
- u32bit B = load_le<u32bit>(in, 1) ^ m_RK[1];
- u32bit C = load_le<u32bit>(in, 2) ^ m_RK[2];
- u32bit D = load_le<u32bit>(in, 3) ^ m_RK[3];
+ u32bit A, B, C, D;
+ load_le(in + BLOCK_SIZE*i, A, B, C, D);
+
+ A ^= m_RK[0];
+ B ^= m_RK[1];
+ C ^= m_RK[2];
+ D ^= m_RK[3];
for(size_t j = 0; j != 16; j += 2)
{
@@ -58,10 +61,7 @@ void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const
A ^= m_RK[6];
B ^= m_RK[7];
- store_le(out, C, D, A, B);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_le(out + BLOCK_SIZE*i, C, D, A, B);
}
}
@@ -70,12 +70,15 @@ void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const
*/
void Twofish::decrypt_n(const byte in[], byte out[], size_t blocks) const
{
- for(size_t i = 0; i != blocks; ++i)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
{
- u32bit A = load_le<u32bit>(in, 0) ^ m_RK[4];
- u32bit B = load_le<u32bit>(in, 1) ^ m_RK[5];
- u32bit C = load_le<u32bit>(in, 2) ^ m_RK[6];
- u32bit D = load_le<u32bit>(in, 3) ^ m_RK[7];
+ u32bit A, B, C, D;
+ load_le(in + BLOCK_SIZE*i, A, B, C, D);
+
+ A ^= m_RK[4];
+ B ^= m_RK[5];
+ C ^= m_RK[6];
+ D ^= m_RK[7];
for(size_t j = 0; j != 16; j += 2)
{
@@ -109,10 +112,7 @@ void Twofish::decrypt_n(const byte in[], byte out[], size_t blocks) const
A ^= m_RK[2];
B ^= m_RK[3];
- store_le(out, C, D, A, B);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_le(out + BLOCK_SIZE*i, C, D, A, B);
}
}
@@ -139,7 +139,7 @@ void Twofish::key_schedule(const byte key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[i]^S[ 3]]^S[ 7]];
}
- for(size_t i = 0; i != 40; i += 2)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
{
u32bit X = MDS0[Q0[Q0[i ]^key[ 8]]^key[ 0]] ^
MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^
@@ -166,7 +166,7 @@ void Twofish::key_schedule(const byte key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[Q0[i]^S[ 3]]^S[ 7]]^S[11]];
}
- for(size_t i = 0; i != 40; i += 2)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
{
u32bit X = MDS0[Q0[Q0[Q1[i ]^key[16]]^key[ 8]]^key[ 0]] ^
MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^
@@ -193,7 +193,7 @@ void Twofish::key_schedule(const byte key[], size_t length)
m_SB[768+i] = MDS3[Q1[Q1[Q0[Q1[i]^S[ 3]]^S[ 7]]^S[11]]^S[15]];
}
- for(size_t i = 0; i != 40; i += 2)
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2)
{
u32bit X = MDS0[Q0[Q0[Q1[Q1[i ]^key[24]]^key[16]]^key[ 8]]^key[ 0]] ^
MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^
diff --git a/src/lib/block/xtea/xtea.cpp b/src/lib/block/xtea/xtea.cpp
index 333406d9b..4e5ca7e7c 100644
--- a/src/lib/block/xtea/xtea.cpp
+++ b/src/lib/block/xtea/xtea.cpp
@@ -1,6 +1,6 @@
/*
* XTEA
-* (C) 1999-2009 Jack Lloyd
+* (C) 1999-2009,2016 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
@@ -10,80 +10,49 @@
namespace Botan {
-namespace {
-
-void xtea_encrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
- {
- u32bit L0, R0, L1, R1, L2, R2, L3, R3;
- load_be(in, L0, R0, L1, R1, L2, R2, L3, R3);
-
- for(size_t i = 0; i != 32; ++i)
- {
- L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*i];
- L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*i];
- L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*i];
- L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*i];
-
- R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*i+1];
- R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*i+1];
- R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*i+1];
- R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*i+1];
- }
-
- store_be(out, L0, R0, L1, R1, L2, R2, L3, R3);
- }
-
-void xtea_decrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
- {
- u32bit L0, R0, L1, R1, L2, R2, L3, R3;
- load_be(in, L0, R0, L1, R1, L2, R2, L3, R3);
-
- for(size_t i = 0; i != 32; ++i)
- {
- R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*i];
- R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*i];
- R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*i];
- R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*i];
-
- L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*i];
- L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*i];
- L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*i];
- L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*i];
- }
-
- store_be(out, L0, R0, L1, R1, L2, R2, L3, R3);
- }
-
-}
-
/*
* XTEA Encryption
*/
void XTEA::encrypt_n(const byte in[], byte out[], size_t blocks) const
{
- while(blocks >= 4)
- {
- xtea_encrypt_4(in, out, &(this->m_EK[0]));
- in += 4 * BLOCK_SIZE;
- out += 4 * BLOCK_SIZE;
- blocks -= 4;
- }
+ const u32bit* EK = &m_EK[0]; // pointer to the first element of m_EK, used by both loops below
- for(size_t i = 0; i != blocks; ++i)
+ const size_t blocks4 = blocks / 4; // number of complete groups of four blocks
+ const size_t blocks_left = blocks % 4; // blocks remaining after the groups of four
+
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks4; i++) // iteration i covers the 4*BLOCK_SIZE bytes at offset 4*BLOCK_SIZE*i
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L0, R0, L1, R1, L2, R2, L3, R3;
+ load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
- for(size_t j = 0; j != 32; ++j)
+ for(size_t r = 0; r != 32; ++r) // r counts the 32 rounds; each round reads EK[2*r] and EK[2*r+1]
{
- L += (((R << 4) ^ (R >> 5)) + R) ^ m_EK[2*j];
- R += (((L << 4) ^ (L >> 5)) + L) ^ m_EK[2*j+1];
+ L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r];
+ L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r];
+ L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r];
+ L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r];
+
+ R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1];
+ R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1];
+ R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1];
+ R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1];
}
- store_be(out, L, R);
+ store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+ }
+
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i) // leftover blocks, one per iteration, starting at block index 4*blocks4
+ {
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*(4*blocks4+i), L, R);
+
+ for(size_t r = 0; r != 32; ++r)
+ {
+ L += (((R << 4) ^ (R >> 5)) + R) ^ EK[2*r];
+ R += (((L << 4) ^ (L >> 5)) + L) ^ EK[2*r+1];
+ }
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*(4*blocks4+i), L, R); // offsets are computed from i; in and out are no longer advanced
}
}
@@ -92,29 +61,44 @@ void XTEA::encrypt_n(const byte in[], byte out[], size_t blocks) const
*/
void XTEA::decrypt_n(const byte in[], byte out[], size_t blocks) const
{
- while(blocks >= 4)
- {
- xtea_decrypt_4(in, out, &(this->m_EK[0]));
- in += 4 * BLOCK_SIZE;
- out += 4 * BLOCK_SIZE;
- blocks -= 4;
- }
+ const u32bit* EK = &m_EK[0]; // pointer to the first element of m_EK, used by both loops below
- for(size_t i = 0; i != blocks; ++i)
+ const size_t blocks4 = blocks / 4; // number of complete groups of four blocks
+ const size_t blocks_left = blocks % 4; // blocks remaining after the groups of four
+
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks4; i++) // iteration i covers the 4*BLOCK_SIZE bytes at offset 4*BLOCK_SIZE*i
{
- u32bit L = load_be<u32bit>(in, 0);
- u32bit R = load_be<u32bit>(in, 1);
+ u32bit L0, R0, L1, R1, L2, R2, L3, R3;
+ load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
- for(size_t j = 0; j != 32; ++j)
+ for(size_t r = 0; r != 32; ++r) // r counts the 32 rounds; EK is read from index 63 downwards
{
- R -= (((L << 4) ^ (L >> 5)) + L) ^ m_EK[63 - 2*j];
- L -= (((R << 4) ^ (R >> 5)) + R) ^ m_EK[62 - 2*j];
+ R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*r];
+ R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*r];
+ R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*r];
+ R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*r];
+
+ L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*r];
+ L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*r];
+ L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*r];
+ L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*r];
}
- store_be(out, L, R);
+ store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+ }
+
+ BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i) // leftover blocks, one per iteration, starting at block index 4*blocks4
+ {
+ u32bit L, R;
+ load_be(in + BLOCK_SIZE*(4*blocks4+i), L, R);
+
+ for(size_t r = 0; r != 32; ++r)
+ {
+ R -= (((L << 4) ^ (L >> 5)) + L) ^ EK[63 - 2*r]; // read through the local EK alias, matching the four-block loop above
+ L -= (((R << 4) ^ (R >> 5)) + R) ^ EK[62 - 2*r];
+ }
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
+ store_be(out + BLOCK_SIZE*(4*blocks4+i), L, R); // offsets are computed from i; in and out are no longer advanced
}
}