diff options
author | Jack Lloyd <[email protected]> | 2016-08-10 13:51:25 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2016-11-26 12:34:15 -0500 |
commit | 3bc46d79c4509cbf871f762e39a366e95e8342ce (patch) | |
tree | 80359e3d3955412ae027393ec72db51456d5f575 | |
parent | 5372d0b499ad317ab3776c9ac92df866cc6a1e84 (diff) |
Add Cilk/OpenMP support
-rwxr-xr-x | configure.py | 20 | ||||
-rw-r--r-- | src/build-data/buildh.in | 46 | ||||
-rw-r--r-- | src/build-data/cc/clang.txt | 2 | ||||
-rw-r--r-- | src/build-data/cc/gcc.txt | 3 | ||||
-rw-r--r-- | src/lib/block/aes/aes.cpp | 48 | ||||
-rw-r--r-- | src/lib/block/aes/aes_ssse3/aes_ssse3.cpp | 4 | ||||
-rw-r--r-- | src/lib/block/blowfish/blowfish.cpp | 22 | ||||
-rw-r--r-- | src/lib/block/camellia/camellia.cpp | 22 | ||||
-rw-r--r-- | src/lib/block/cast/cast128.cpp | 22 | ||||
-rw-r--r-- | src/lib/block/des/des.cpp | 30 | ||||
-rw-r--r-- | src/lib/block/idea/idea.cpp | 8 | ||||
-rw-r--r-- | src/lib/block/serpent/serpent.cpp | 81 | ||||
-rw-r--r-- | src/lib/block/threefish/threefish.cpp | 34 | ||||
-rw-r--r-- | src/lib/block/twofish/twofish.cpp | 42 | ||||
-rw-r--r-- | src/lib/block/xtea/xtea.cpp | 142 |
15 files changed, 275 insertions, 251 deletions
diff --git a/configure.py b/configure.py index 0fabeed3e..789377129 100755 --- a/configure.py +++ b/configure.py @@ -323,6 +323,11 @@ def process_command_line(args): build_group.add_option('--with-external-includedir', metavar='DIR', default='', help='use DIR for external includes') + build_group.add_option('--with-openmp', default=False, action='store_true', + help='enable use of OpenMP') + build_group.add_option('--with-cilkplus', default=False, action='store_true', + help='enable use of Cilk Plus') + link_methods = ['symlink', 'hardlink', 'copy'] build_group.add_option('--link-method', default=None, metavar='METHOD', choices=link_methods, @@ -850,6 +855,11 @@ class ArchInfo(object): if options.with_valgrind: macros.append('HAS_VALGRIND') + if options.with_openmp: + macros.append('TARGET_HAS_OPENMP') + if options.with_cilkplus: + macros.append('TARGET_HAS_CILKPLUS') + return macros class CompilerInfo(object): @@ -953,6 +963,16 @@ class CompilerInfo(object): raise Exception('No sanitizer handling for %s' % (self.basename)) abi_link.append(self.sanitizer_flags) + if options.with_openmp: + if 'openmp' not in self.mach_abi_linking: + raise Exception('No support for OpenMP for %s' % (self.basename)) + abi_link.append(self.mach_abi_linking['openmp']) + + if options.with_cilkplus: + if 'cilkplus' not in self.mach_abi_linking: + raise Exception('No support for Cilk Plus for %s' % (self.basename)) + abi_link.append(self.mach_abi_linking['cilkplus']) + abi_flags = ' '.join(sorted(abi_link)) if options.cc_abi_flags != '': diff --git a/src/build-data/buildh.in b/src/build-data/buildh.in index 56b70e060..0702d1416 100644 --- a/src/build-data/buildh.in +++ b/src/build-data/buildh.in @@ -264,6 +264,52 @@ Each read generates 32 bits of output #define BOTAN_NOEXCEPT noexcept #endif +#if !defined(BOTAN_PARALLEL_FOR) + +#if defined(BOTAN_TARGET_HAS_CILKPLUS) + #define BOTAN_PARALLEL_FOR _Cilk_for +#elif defined(BOTAN_TARGET_HAS_OPENMP) + #define BOTAN_PARALLEL_FOR _Pragma("omp parallel for") for +#else + #define BOTAN_PARALLEL_FOR for +#endif + +#endif + +#if !defined(BOTAN_PARALLEL_SIMD_FOR) + +#if defined(BOTAN_TARGET_HAS_CILKPLUS) + #define BOTAN_PARALLEL_SIMD_FOR _Pragma("simd") for +#elif defined(BOTAN_TARGET_HAS_OPENMP) + #define BOTAN_PARALLEL_SIMD_FOR _Pragma("omp simd") for +#elif defined(BOTAN_TARGET_COMPILER_IS_GCC) + #define BOTAN_PARALLEL_FOR _Pragma("GCC ivdep") for +#else + #define BOTAN_PARALLEL_SIMD_FOR for +#endif + +#endif + +#if !defined(BOTAN_PARALLEL_SPAWN) + +#if defined(BOTAN_TARGET_HAS_CILKPLUS) + #define BOTAN_PARALLEL_SPAWN _Cilk_spawn +#else + #define BOTAN_PARALLEL_SPAWN +#endif + +#endif + +#if !defined(BOTAN_PARALLEL_SYNC) + +#if defined(BOTAN_TARGET_HAS_CILKPLUS) + #define BOTAN_PARALLEL_SYNC _Cilk_sync +#else + #define BOTAN_PARALLEL_SYNC BOTAN_FORCE_SEMICOLON +#endif + +#endif + /* * Module availability definitions */ diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt index c4a85658f..055315c3b 100644 --- a/src/build-data/cc/clang.txt +++ b/src/build-data/cc/clang.txt @@ -73,6 +73,8 @@ ivybridge -> "-march=core-avx-i" <mach_abi_linking> all -> "-pthread" +openmp -> "-fopenmp" + x86_32 -> "-m32" x86_64 -> "-m64" ppc64 -> "-m64" diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index b88454ce6..0a53e15c1 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -120,6 +120,9 @@ all_x86_64 -> "-momit-leaf-frame-pointer" <mach_abi_linking> all -> "-pthread -fstack-protector" +cilkplus -> "-fcilkplus" +openmp -> "-fopenmp" + mips64 -> "-mabi=64" s390 -> "-m31" s390x -> "-m64" diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp index 39f5bd0db..8c7000135 100644 --- a/src/lib/block/aes/aes.cpp +++ b/src/lib/block/aes/aes.cpp @@ -168,12 +168,15 @@ void aes_encrypt_n(const byte in[], byte out[], } Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit T0 = load_be<u32bit>(in, 0) ^ EK[0]; - u32bit T1 = load_be<u32bit>(in, 1) ^ EK[1]; - u32bit T2 = load_be<u32bit>(in, 2) ^ EK[2]; - u32bit T3 = load_be<u32bit>(in, 3) ^ EK[3]; + u32bit T0, T1, T2, T3; + load_be(in + 16*i, T0, T1, T2, T3); + + T0 ^= EK[0]; + T1 ^= EK[1]; + T2 ^= EK[2]; + T3 ^= EK[3]; T0 ^= Z; @@ -226,25 +229,22 @@ void aes_encrypt_n(const byte in[], byte out[], TE[get_byte(2, T1) + 512] ^ TE[get_byte(3, T2) + 768]; } - out[ 0] = SE[get_byte(0, B0)] ^ ME[0]; - out[ 1] = SE[get_byte(1, B1)] ^ ME[1]; - out[ 2] = SE[get_byte(2, B2)] ^ ME[2]; - out[ 3] = SE[get_byte(3, B3)] ^ ME[3]; - out[ 4] = SE[get_byte(0, B1)] ^ ME[4]; - out[ 5] = SE[get_byte(1, B2)] ^ ME[5]; - out[ 6] = SE[get_byte(2, B3)] ^ ME[6]; - out[ 7] = SE[get_byte(3, B0)] ^ ME[7]; - out[ 8] = SE[get_byte(0, B2)] ^ ME[8]; - out[ 9] = SE[get_byte(1, B3)] ^ ME[9]; - out[10] = SE[get_byte(2, B0)] ^ ME[10]; - out[11] = SE[get_byte(3, B1)] ^ ME[11]; - out[12] = SE[get_byte(0, B3)] ^ ME[12]; - out[13] = SE[get_byte(1, B0)] ^ ME[13]; - out[14] = SE[get_byte(2, B1)] ^ ME[14]; - out[15] = SE[get_byte(3, B2)] ^ ME[15]; - - in += 16; - out += 16; + out[16*i+ 0] = SE[get_byte(0, B0)] ^ ME[0]; + out[16*i+ 1] = SE[get_byte(1, B1)] ^ ME[1]; + out[16*i+ 2] = SE[get_byte(2, B2)] ^ ME[2]; + out[16*i+ 3] = SE[get_byte(3, B3)] ^ ME[3]; + out[16*i+ 4] = SE[get_byte(0, B1)] ^ ME[4]; + out[16*i+ 5] = SE[get_byte(1, B2)] ^ ME[5]; + out[16*i+ 6] = SE[get_byte(2, B3)] ^ ME[6]; + out[16*i+ 7] = SE[get_byte(3, B0)] ^ ME[7]; + out[16*i+ 8] = SE[get_byte(0, B2)] ^ ME[8]; + out[16*i+ 9] = SE[get_byte(1, B3)] ^ ME[9]; + out[16*i+10] = SE[get_byte(2, B0)] ^ ME[10]; + out[16*i+11] = SE[get_byte(3, B1)] ^ ME[11]; + out[16*i+12] = SE[get_byte(0, B3)] ^ ME[12]; + out[16*i+13] = SE[get_byte(1, B0)] ^ ME[13]; + out[16*i+14] = SE[get_byte(2, B1)] ^ ME[14]; + out[16*i+15] = SE[get_byte(3, B2)] ^ ME[15]; } } diff --git a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp index ef24795bb..d8c7e7314 100644 --- a/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp +++ b/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp @@ -355,7 +355,7 @@ void AES_128::ssse3_encrypt_n(const byte in[], byte out[], size_t blocks) const CT::poison(in, blocks * block_size()); - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { __m128i B = _mm_loadu_si128(in_mm + i); _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10)); @@ -378,7 +378,7 @@ void AES_128::ssse3_decrypt_n(const byte in[], byte out[], size_t blocks) const CT::poison(in, blocks * block_size()); - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { __m128i B = _mm_loadu_si128(in_mm + i); _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10)); diff --git a/src/lib/block/blowfish/blowfish.cpp b/src/lib/block/blowfish/blowfish.cpp index 7a06cf797..69d345baa 100644 --- a/src/lib/block/blowfish/blowfish.cpp +++ b/src/lib/block/blowfish/blowfish.cpp @@ -202,10 +202,10 @@ void Blowfish::encrypt_n(const byte in[], byte out[], size_t blocks) const const u32bit* S3 = &m_S[512]; const u32bit* S4 = &m_S[768]; - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L, R; + load_be(in + BLOCK_SIZE*i, L, R); for(size_t j = 0; j != 16; j += 2) { @@ -220,10 +220,7 @@ void Blowfish::encrypt_n(const byte in[], byte out[], size_t blocks) const L ^= m_P[16]; R ^= m_P[17]; - store_be(out, R, L); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*i, R, L); } } @@ -237,10 +234,10 @@ void Blowfish::decrypt_n(const byte in[], byte out[], size_t blocks) const const u32bit* S3 = &m_S[512]; const u32bit* S4 = &m_S[768]; - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L, R; + load_be(in + BLOCK_SIZE*i, L, R); for(size_t j = 17; j != 1; j -= 2) { @@ -255,10 +252,7 @@ void Blowfish::decrypt_n(const byte in[], byte out[], size_t blocks) const L ^= m_P[1]; R ^= m_P[0]; - store_be(out, R, L); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*i, R, L); } } diff --git a/src/lib/block/camellia/camellia.cpp b/src/lib/block/camellia/camellia.cpp index ac5d57d4e..5ac13b9ab 100644 --- a/src/lib/block/camellia/camellia.cpp +++ b/src/lib/block/camellia/camellia.cpp @@ -645,10 +645,10 @@ inline u64bit FLINV(u64bit v, u64bit K) void encrypt(const byte in[], byte out[], size_t blocks, const secure_vector<u64bit>& SK, const size_t rounds) { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u64bit D1 = load_be<u64bit>(in, 0); - u64bit D2 = load_be<u64bit>(in, 1); + u64bit D1, D2; + load_be(in + 16*i, D1, D2); const u64bit* K = SK.data(); @@ -676,10 +676,7 @@ void encrypt(const byte in[], byte out[], size_t blocks, D2 ^= *K++; D1 ^= *K++; - store_be(out, D2, D1); - - in += 16; - out += 16; + store_be(out + 16*i, D2, D1); } } @@ -689,10 +686,10 @@ void encrypt(const byte in[], byte out[], size_t blocks, void decrypt(const byte in[], byte out[], size_t blocks, const secure_vector<u64bit>& SK, const size_t rounds) { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u64bit D1 = load_be<u64bit>(in, 0); - u64bit D2 = load_be<u64bit>(in, 1); + u64bit D1, D2; + load_be(in + 16*i, D1, D2); const u64bit* K = &SK[SK.size()-1]; @@ -720,10 +717,7 @@ void decrypt(const byte in[], byte out[], size_t blocks, D1 ^= *K--; D2 ^= *K; - store_be(out, D2, D1); - - in += 16; - out += 16; + store_be(out + 16*i, D2, D1); } } diff --git a/src/lib/block/cast/cast128.cpp b/src/lib/block/cast/cast128.cpp index 53f7d4611..96c4f45a7 100644 --- a/src/lib/block/cast/cast128.cpp +++ b/src/lib/block/cast/cast128.cpp @@ -50,10 +50,10 @@ inline void R3(u32bit& L, u32bit R, u32bit MK, byte RK) */ void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L, R; + load_be(in + BLOCK_SIZE*i, L, R); R1(L, R, m_MK[ 0], m_RK[ 0]); R2(R, L, m_MK[ 1], m_RK[ 1]); @@ -72,10 +72,7 @@ void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const R3(L, R, m_MK[14], m_RK[14]); R1(R, L, m_MK[15], m_RK[15]); - store_be(out, R, L); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*i, R, L); } } @@ -84,10 +81,10 @@ void CAST_128::encrypt_n(const byte in[], byte out[], size_t blocks) const */ void CAST_128::decrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L, R; + load_be(in + BLOCK_SIZE*i, L, R); R1(L, R, m_MK[15], m_RK[15]); R3(R, L, m_MK[14], m_RK[14]); @@ -106,10 +103,7 @@ void CAST_128::decrypt_n(const byte in[], byte out[], size_t blocks) const R2(L, R, m_MK[ 1], m_RK[ 1]); R1(R, L, m_MK[ 0], m_RK[ 0]); - store_be(out, R, L); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*i, R, L); } } diff --git a/src/lib/block/des/des.cpp b/src/lib/block/des/des.cpp index 88671df8d..a55c43ec7 100644 --- a/src/lib/block/des/des.cpp +++ b/src/lib/block/des/des.cpp @@ -144,12 +144,12 @@ void des_decrypt(u32bit& L, u32bit& R, */ void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + for(size_t i = 0; i < blocks; ++i) { - u64bit T = (DES_IPTAB1[in[0]] ) | (DES_IPTAB1[in[1]] << 1) | - (DES_IPTAB1[in[2]] << 2) | (DES_IPTAB1[in[3]] << 3) | - (DES_IPTAB1[in[4]] << 4) | (DES_IPTAB1[in[5]] << 5) | - (DES_IPTAB1[in[6]] << 6) | (DES_IPTAB2[in[7]] ); + u64bit T = (DES_IPTAB1[in[8*i+0]] ) | (DES_IPTAB1[in[8*i+1]] << 1) | + (DES_IPTAB1[in[8*i+2]] << 2) | (DES_IPTAB1[in[8*i+3]] << 3) | + (DES_IPTAB1[in[8*i+4]] << 4) | (DES_IPTAB1[in[8*i+5]] << 5) | + (DES_IPTAB1[in[8*i+6]] << 6) | (DES_IPTAB2[in[8*i+7]] ); u32bit L = static_cast<u32bit>(T >> 32); u32bit R = static_cast<u32bit>(T); @@ -162,10 +162,7 @@ void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const (DES_FPTAB1[get_byte(2, R)] ) | (DES_FPTAB2[get_byte(3, R)] ); T = rotate_left(T, 32); - store_be(T, out); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(T, out + 8*i); } } @@ -174,12 +171,12 @@ void DES::encrypt_n(const byte in[], byte out[], size_t blocks) const */ void DES::decrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + for(size_t i = 0; i < blocks; ++i) { - u64bit T = (DES_IPTAB1[in[0]] ) | (DES_IPTAB1[in[1]] << 1) | - (DES_IPTAB1[in[2]] << 2) | (DES_IPTAB1[in[3]] << 3) | - (DES_IPTAB1[in[4]] << 4) | (DES_IPTAB1[in[5]] << 5) | - (DES_IPTAB1[in[6]] << 6) | (DES_IPTAB2[in[7]] ); + u64bit T = (DES_IPTAB1[in[BLOCK_SIZE*i+0]] ) | (DES_IPTAB1[in[BLOCK_SIZE*i+1]] << 1) | + (DES_IPTAB1[in[BLOCK_SIZE*i+2]] << 2) | (DES_IPTAB1[in[BLOCK_SIZE*i+3]] << 3) | + (DES_IPTAB1[in[BLOCK_SIZE*i+4]] << 4) | (DES_IPTAB1[in[BLOCK_SIZE*i+5]] << 5) | + (DES_IPTAB1[in[BLOCK_SIZE*i+6]] << 6) | (DES_IPTAB2[in[BLOCK_SIZE*i+7]] ); u32bit L = static_cast<u32bit>(T >> 32); u32bit R = static_cast<u32bit>(T); @@ -193,10 +190,7 @@ void DES::decrypt_n(const byte in[], byte out[], size_t blocks) const T = rotate_left(T, 32); - store_be(T, out); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(T, out + BLOCK_SIZE*i); } } diff --git a/src/lib/block/idea/idea.cpp b/src/lib/block/idea/idea.cpp index 85cc5e757..1fe25d599 100644 --- a/src/lib/block/idea/idea.cpp +++ b/src/lib/block/idea/idea.cpp @@ -67,12 +67,10 @@ void idea_op(const byte in[], byte out[], size_t blocks, const u16bit K[52]) CT::poison(out, blocks * 8); CT::poison(K, 52); - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u16bit X1 = load_be<u16bit>(in + BLOCK_SIZE*i, 0); - u16bit X2 = load_be<u16bit>(in + BLOCK_SIZE*i, 1); - u16bit X3 = load_be<u16bit>(in + BLOCK_SIZE*i, 2); - u16bit X4 = load_be<u16bit>(in + BLOCK_SIZE*i, 3); + u16bit X1, X2, X3, X4; + load_be(in + BLOCK_SIZE*i, X1, X2, X3, X4); for(size_t j = 0; j != 8; ++j) { diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp index 07088211d..a1326b888 100644 --- a/src/lib/block/serpent/serpent.cpp +++ b/src/lib/block/serpent/serpent.cpp @@ -70,12 +70,10 @@ void Serpent::encrypt_n(const byte in[], byte out[], size_t blocks) const } #endif - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_SIMD_FOR(size_t i = 0; i < blocks; ++i) { - u32bit B0 = load_le<u32bit>(in, 0); - u32bit B1 = load_le<u32bit>(in, 1); - u32bit B2 = load_le<u32bit>(in, 2); - u32bit B3 = load_le<u32bit>(in, 3); + u32bit B0, B1, B2, B3; + load_le(in + 16*i, B0, B1, B2, B3); key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); @@ -110,10 +108,7 @@ void Serpent::encrypt_n(const byte in[], byte out[], size_t blocks) const key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); - store_le(out, B0, B1, B2, B3); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_le(out + 16*i, B0, B1, B2, B3); } } @@ -135,12 +130,10 @@ void Serpent::decrypt_n(const byte in[], byte out[], size_t blocks) const } #endif - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_SIMD_FOR(size_t i = 0; i < blocks; ++i) { - u32bit B0 = load_le<u32bit>(in, 0); - u32bit B1 = load_le<u32bit>(in, 1); - u32bit B2 = load_le<u32bit>(in, 2); - u32bit B3 = load_le<u32bit>(in, 3); + u32bit B0, B1, B2, B3; + load_le(in + 16*i, B0, B1, B2, B3); key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); @@ -175,10 +168,7 @@ void Serpent::decrypt_n(const byte in[], byte out[], size_t blocks) const i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); - store_le(out, B0, B1, B2, B3); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_le(out + 16*i, B0, B1, B2, B3); } } @@ -205,24 +195,47 @@ void Serpent::key_schedule(const byte key[], size_t length) W[i] = rotate_left(wi, 11); } - SBoxE4(W[ 8],W[ 9],W[ 10],W[ 11]); SBoxE3(W[ 12],W[ 13],W[ 14],W[ 15]); - SBoxE2(W[ 16],W[ 17],W[ 18],W[ 19]); SBoxE1(W[ 20],W[ 21],W[ 22],W[ 23]); - SBoxE8(W[ 24],W[ 25],W[ 26],W[ 27]); SBoxE7(W[ 28],W[ 29],W[ 30],W[ 31]); - SBoxE6(W[ 32],W[ 33],W[ 34],W[ 35]); SBoxE5(W[ 36],W[ 37],W[ 38],W[ 39]); - SBoxE4(W[ 40],W[ 41],W[ 42],W[ 43]); SBoxE3(W[ 44],W[ 45],W[ 46],W[ 47]); - SBoxE2(W[ 48],W[ 49],W[ 50],W[ 51]); SBoxE1(W[ 52],W[ 53],W[ 54],W[ 55]); - SBoxE8(W[ 56],W[ 57],W[ 58],W[ 59]); SBoxE7(W[ 60],W[ 61],W[ 62],W[ 63]); - SBoxE6(W[ 64],W[ 65],W[ 66],W[ 67]); SBoxE5(W[ 68],W[ 69],W[ 70],W[ 71]); - SBoxE4(W[ 72],W[ 73],W[ 74],W[ 75]); SBoxE3(W[ 76],W[ 77],W[ 78],W[ 79]); - SBoxE2(W[ 80],W[ 81],W[ 82],W[ 83]); SBoxE1(W[ 84],W[ 85],W[ 86],W[ 87]); - SBoxE8(W[ 88],W[ 89],W[ 90],W[ 91]); SBoxE7(W[ 92],W[ 93],W[ 94],W[ 95]); - SBoxE6(W[ 96],W[ 97],W[ 98],W[ 99]); SBoxE5(W[100],W[101],W[102],W[103]); - SBoxE4(W[104],W[105],W[106],W[107]); SBoxE3(W[108],W[109],W[110],W[111]); - SBoxE2(W[112],W[113],W[114],W[115]); SBoxE1(W[116],W[117],W[118],W[119]); - SBoxE8(W[120],W[121],W[122],W[123]); SBoxE7(W[124],W[125],W[126],W[127]); - SBoxE6(W[128],W[129],W[130],W[131]); SBoxE5(W[132],W[133],W[134],W[135]); + SBoxE1(W[ 20],W[ 21],W[ 22],W[ 23]); + SBoxE1(W[ 52],W[ 53],W[ 54],W[ 55]); + SBoxE1(W[ 84],W[ 85],W[ 86],W[ 87]); + SBoxE1(W[116],W[117],W[118],W[119]); + + SBoxE2(W[ 16],W[ 17],W[ 18],W[ 19]); + SBoxE2(W[ 48],W[ 49],W[ 50],W[ 51]); + SBoxE2(W[ 80],W[ 81],W[ 82],W[ 83]); + SBoxE2(W[112],W[113],W[114],W[115]); + + SBoxE3(W[ 12],W[ 13],W[ 14],W[ 15]); + SBoxE3(W[ 44],W[ 45],W[ 46],W[ 47]); + SBoxE3(W[ 76],W[ 77],W[ 78],W[ 79]); + SBoxE3(W[108],W[109],W[110],W[111]); + + SBoxE4(W[ 8],W[ 9],W[ 10],W[ 11]); + SBoxE4(W[ 40],W[ 41],W[ 42],W[ 43]); + SBoxE4(W[ 72],W[ 73],W[ 74],W[ 75]); + SBoxE4(W[104],W[105],W[106],W[107]); SBoxE4(W[136],W[137],W[138],W[139]); + SBoxE5(W[ 36],W[ 37],W[ 38],W[ 39]); + SBoxE5(W[ 68],W[ 69],W[ 70],W[ 71]); + SBoxE5(W[100],W[101],W[102],W[103]); + SBoxE5(W[132],W[133],W[134],W[135]); + + SBoxE6(W[ 32],W[ 33],W[ 34],W[ 35]); + SBoxE6(W[ 64],W[ 65],W[ 66],W[ 67]); + SBoxE6(W[ 96],W[ 97],W[ 98],W[ 99]); + SBoxE6(W[128],W[129],W[130],W[131]); + + SBoxE7(W[ 28],W[ 29],W[ 30],W[ 31]); + SBoxE7(W[ 60],W[ 61],W[ 62],W[ 63]); + SBoxE7(W[ 92],W[ 93],W[ 94],W[ 95]); + SBoxE7(W[124],W[125],W[126],W[127]); + + SBoxE8(W[ 24],W[ 25],W[ 26],W[ 27]); + SBoxE8(W[ 56],W[ 57],W[ 58],W[ 59]); + SBoxE8(W[ 88],W[ 89],W[ 90],W[ 91]); + SBoxE8(W[120],W[121],W[122],W[123]); + m_round_key.assign(W.begin() + 8, W.end()); } diff --git a/src/lib/block/threefish/threefish.cpp b/src/lib/block/threefish/threefish.cpp index f592021fb..2acdef020 100644 --- a/src/lib/block/threefish/threefish.cpp +++ b/src/lib/block/threefish/threefish.cpp @@ -122,16 +122,10 @@ void Threefish_512::encrypt_n(const byte in[], byte out[], size_t blocks) const } #endif - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u64bit X0 = load_le<u64bit>(in, 0); - u64bit X1 = load_le<u64bit>(in, 1); - u64bit X2 = load_le<u64bit>(in, 2); - u64bit X3 = load_le<u64bit>(in, 3); - u64bit X4 = load_le<u64bit>(in, 4); - u64bit X5 = load_le<u64bit>(in, 5); - u64bit X6 = load_le<u64bit>(in, 6); - u64bit X7 = load_le<u64bit>(in, 7); + u64bit X0, X1, X2, X3, X4, X5, X6, X7; + load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); THREEFISH_INJECT_KEY(0); @@ -145,10 +139,7 @@ void Threefish_512::encrypt_n(const byte in[], byte out[], size_t blocks) const THREEFISH_ENC_8_ROUNDS(15,16); THREEFISH_ENC_8_ROUNDS(17,18); - store_le(out, X0, X1, X2, X3, X4, X5, X6, X7); - - in += 64; - out += 64; + store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); } } @@ -211,16 +202,10 @@ void Threefish_512::decrypt_n(const byte in[], byte out[], size_t blocks) const THREEFISH_INJECT_KEY(R2); \ } while(0) - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u64bit X0 = load_le<u64bit>(in, 0); - u64bit X1 = load_le<u64bit>(in, 1); - u64bit X2 = load_le<u64bit>(in, 2); - u64bit X3 = load_le<u64bit>(in, 3); - u64bit X4 = load_le<u64bit>(in, 4); - u64bit X5 = load_le<u64bit>(in, 5); - u64bit X6 = load_le<u64bit>(in, 6); - u64bit X7 = load_le<u64bit>(in, 7); + u64bit X0, X1, X2, X3, X4, X5, X6, X7; + load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); THREEFISH_INJECT_KEY(18); @@ -234,10 +219,7 @@ void Threefish_512::decrypt_n(const byte in[], byte out[], size_t blocks) const THREEFISH_DEC_8_ROUNDS(3,2); THREEFISH_DEC_8_ROUNDS(1,0); - store_le(out, X0, X1, X2, X3, X4, X5, X6, X7); - - in += 64; - out += 64; + store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); } #undef THREEFISH_DEC_8_ROUNDS diff --git a/src/lib/block/twofish/twofish.cpp b/src/lib/block/twofish/twofish.cpp index 336d73a03..a98ae8e70 100644 --- a/src/lib/block/twofish/twofish.cpp +++ b/src/lib/block/twofish/twofish.cpp @@ -19,12 +19,15 @@ namespace Botan { */ void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit A = load_le<u32bit>(in, 0) ^ m_RK[0]; - u32bit B = load_le<u32bit>(in, 1) ^ m_RK[1]; - u32bit C = load_le<u32bit>(in, 2) ^ m_RK[2]; - u32bit D = load_le<u32bit>(in, 3) ^ m_RK[3]; + u32bit A, B, C, D; + load_le(in + BLOCK_SIZE*i, A, B, C, D); + + A ^= m_RK[0]; + B ^= m_RK[1]; + C ^= m_RK[2]; + D ^= m_RK[3]; for(size_t j = 0; j != 16; j += 2) { @@ -58,10 +61,7 @@ void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const A ^= m_RK[6]; B ^= m_RK[7]; - store_le(out, C, D, A, B); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_le(out + BLOCK_SIZE*i, C, D, A, B); } } @@ -70,12 +70,15 @@ void Twofish::encrypt_n(const byte in[], byte out[], size_t blocks) const */ void Twofish::decrypt_n(const byte in[], byte out[], size_t blocks) const { - for(size_t i = 0; i != blocks; ++i) + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { - u32bit A = load_le<u32bit>(in, 0) ^ m_RK[4]; - u32bit B = load_le<u32bit>(in, 1) ^ m_RK[5]; - u32bit C = load_le<u32bit>(in, 2) ^ m_RK[6]; - u32bit D = load_le<u32bit>(in, 3) ^ m_RK[7]; + u32bit A, B, C, D; + load_le(in + BLOCK_SIZE*i, A, B, C, D); + + A ^= m_RK[4]; + B ^= m_RK[5]; + C ^= m_RK[6]; + D ^= m_RK[7]; for(size_t j = 0; j != 16; j += 2) { @@ -109,10 +112,7 @@ void Twofish::decrypt_n(const byte in[], byte out[], size_t blocks) const A ^= m_RK[2]; B ^= m_RK[3]; - store_le(out, C, D, A, B); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_le(out + BLOCK_SIZE*i, C, D, A, B); } } @@ -139,7 +139,7 @@ void Twofish::key_schedule(const byte key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[i]^S[ 3]]^S[ 7]]; } - for(size_t i = 0; i != 40; i += 2) + BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) { u32bit X = MDS0[Q0[Q0[i ]^key[ 8]]^key[ 0]] ^ MDS1[Q0[Q1[i ]^key[ 9]]^key[ 1]] ^ @@ -166,7 +166,7 @@ void Twofish::key_schedule(const byte key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[Q0[i]^S[ 3]]^S[ 7]]^S[11]]; } - for(size_t i = 0; i != 40; i += 2) + BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) { u32bit X = MDS0[Q0[Q0[Q1[i ]^key[16]]^key[ 8]]^key[ 0]] ^ MDS1[Q0[Q1[Q1[i ]^key[17]]^key[ 9]]^key[ 1]] ^ @@ -193,7 +193,7 @@ void Twofish::key_schedule(const byte key[], size_t length) m_SB[768+i] = MDS3[Q1[Q1[Q0[Q1[i]^S[ 3]]^S[ 7]]^S[11]]^S[15]]; } - for(size_t i = 0; i != 40; i += 2) + BOTAN_PARALLEL_FOR(size_t i = 0; i < 40; i += 2) { u32bit X = MDS0[Q0[Q0[Q1[Q1[i ]^key[24]]^key[16]]^key[ 8]]^key[ 0]] ^ MDS1[Q0[Q1[Q1[Q0[i ]^key[25]]^key[17]]^key[ 9]]^key[ 1]] ^ diff --git a/src/lib/block/xtea/xtea.cpp b/src/lib/block/xtea/xtea.cpp index 333406d9b..4e5ca7e7c 100644 --- a/src/lib/block/xtea/xtea.cpp +++ b/src/lib/block/xtea/xtea.cpp @@ -1,6 +1,6 @@ /* * XTEA -* (C) 1999-2009 Jack Lloyd +* (C) 1999-2009,2016 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ @@ -10,80 +10,49 @@ namespace Botan { -namespace { - -void xtea_encrypt_4(const byte in[32], byte out[32], const u32bit EK[64]) - { - u32bit L0, R0, L1, R1, L2, R2, L3, R3; - load_be(in, L0, R0, L1, R1, L2, R2, L3, R3); - - for(size_t i = 0; i != 32; ++i) - { - L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*i]; - L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*i]; - L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*i]; - L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*i]; - - R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*i+1]; - R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*i+1]; - R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*i+1]; - R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*i+1]; - } - - store_be(out, L0, R0, L1, R1, L2, R2, L3, R3); - } - -void xtea_decrypt_4(const byte in[32], byte out[32], const u32bit EK[64]) - { - u32bit L0, R0, L1, R1, L2, R2, L3, R3; - load_be(in, L0, R0, L1, R1, L2, R2, L3, R3); - - for(size_t i = 0; i != 32; ++i) - { - R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*i]; - R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*i]; - R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*i]; - R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*i]; - - L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*i]; - L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*i]; - L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*i]; - L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*i]; - } - - store_be(out, L0, R0, L1, R1, L2, R2, L3, R3); - } - -} - /* * XTEA Encryption */ void XTEA::encrypt_n(const byte in[], byte out[], size_t blocks) const { - while(blocks >= 4) - { - xtea_encrypt_4(in, out, &(this->m_EK[0])); - in += 4 * BLOCK_SIZE; - out += 4 * BLOCK_SIZE; - blocks -= 4; - } + const u32bit* EK = &m_EK[0]; - for(size_t i = 0; i != blocks; ++i) + const size_t blocks4 = blocks / 4; + const size_t blocks_left = blocks % 4; + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks4; i++) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L0, R0, L1, R1, L2, R2, L3, R3; + load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); - for(size_t j = 0; j != 32; ++j) + for(size_t r = 0; r != 32; ++r) { - L += (((R << 4) ^ (R >> 5)) + R) ^ m_EK[2*j]; - R += (((L << 4) ^ (L >> 5)) + L) ^ m_EK[2*j+1]; + L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r]; + L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r]; + L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r]; + L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r]; + + R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1]; + R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1]; + R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1]; + R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1]; } - store_be(out, L, R); + store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + } + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i) + { + u32bit L, R; + load_be(in + BLOCK_SIZE*(4*blocks4+i), L, R); + + for(size_t r = 0; r != 32; ++r) + { + L += (((R << 4) ^ (R >> 5)) + R) ^ EK[2*r]; + R += (((L << 4) ^ (L >> 5)) + L) ^ EK[2*r+1]; + } - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*(4*blocks4+i), L, R); } } @@ -92,29 +61,44 @@ void XTEA::encrypt_n(const byte in[], byte out[], size_t blocks) const */ void XTEA::decrypt_n(const byte in[], byte out[], size_t blocks) const { - while(blocks >= 4) - { - xtea_decrypt_4(in, out, &(this->m_EK[0])); - in += 4 * BLOCK_SIZE; - out += 4 * BLOCK_SIZE; - blocks -= 4; - } + const u32bit* EK = &m_EK[0]; - for(size_t i = 0; i != blocks; ++i) + const size_t blocks4 = blocks / 4; + const size_t blocks_left = blocks % 4; + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks4; i++) { - u32bit L = load_be<u32bit>(in, 0); - u32bit R = load_be<u32bit>(in, 1); + u32bit L0, R0, L1, R1, L2, R2, L3, R3; + load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); - for(size_t j = 0; j != 32; ++j) + for(size_t r = 0; r != 32; ++r) { - R -= (((L << 4) ^ (L >> 5)) + L) ^ m_EK[63 - 2*j]; - L -= (((R << 4) ^ (R >> 5)) + R) ^ m_EK[62 - 2*j]; + R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*r]; + R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*r]; + R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*r]; + R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*r]; + + L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*r]; + L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*r]; + L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*r]; + L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*r]; } - store_be(out, L, R); + store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + } + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i) + { + u32bit L, R; + load_be(in + BLOCK_SIZE*(4*blocks4+i), L, R); + + for(size_t r = 0; r != 32; ++r) + { + R -= (((L << 4) ^ (L >> 5)) + L) ^ m_EK[63 - 2*r]; + L -= (((R << 4) ^ (R >> 5)) + R) ^ m_EK[62 - 2*r]; + } - in += BLOCK_SIZE; - out += BLOCK_SIZE; + store_be(out + BLOCK_SIZE*(4*blocks4+i), L, R); } } |