aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/block/threefish_avx2
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2016-08-09 17:04:47 -0400
committerJack Lloyd <[email protected]>2016-08-10 03:03:10 -0400
commit80cdc3d6822f43dd2f1e2a0b60eb792a02121919 (patch)
tree9c881cee425c4b9edcbd6f68e815110fe5b6de5f /src/lib/block/threefish_avx2
parent3f4e00b7d856d9176d0332c5eb65b4afa406544f (diff)
Threefish-512 AVX2 optimizations
Remove loop variable R, instead derive from macro param constant Support 2 block parallel decrypt, improves raw perf from 456 MB/s to 710 MB/s for decrypt. Switch to alternate key schedule for encrypt. Uses 3 ymm registers instead of 9 at the cost of more computation. Not much faster on Skylake, unclear if this is worthwhile.
Diffstat (limited to 'src/lib/block/threefish_avx2')
-rw-r--r--src/lib/block/threefish_avx2/threefish_avx2.cpp241
1 files changed, 165 insertions, 76 deletions
diff --git a/src/lib/block/threefish_avx2/threefish_avx2.cpp b/src/lib/block/threefish_avx2/threefish_avx2.cpp
index bed98fafa..9b808a221 100644
--- a/src/lib/block/threefish_avx2/threefish_avx2.cpp
+++ b/src/lib/block/threefish_avx2/threefish_avx2.cpp
@@ -1,6 +1,6 @@
/*
* Threefish-512 using AVX2
-* (C) 2013 Jack Lloyd
+* (C) 2013,2016 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
@@ -34,6 +34,41 @@ inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
X1 = _mm256_unpackhi_epi64(T0, T1);
}
+inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
+ {
+ /*
+ Behold. The key schedule progresses like so. The values
+ loop back to the originals after the rounds are complete.
+
+ R0 R1 R2
+ K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
+ K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
+ K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)
+
+ K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
+ K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
+ K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)
+
+ K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
+ K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
+ K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)
+
+ To compute the values for the next round:
+ X0 is X2 from the last round
+ X1 becomes (X0[4],X1[1:3])
+ X2 becomes (X1[4],X2[1:3])
+
+ Uses 3 permutes and 2 blends, is there a faster way?
+ */
+ __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
+ __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
+ __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));
+
+ R0 = _mm256_blend_epi32(T1, T0, 0xC0);
+ R1 = _mm256_blend_epi32(T2, T1, 0xC0);
+ }
+
+
}
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const
@@ -81,10 +116,9 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
X0 = _mm256_add_epi64(X0, K0); \
X1 = _mm256_add_epi64(X1, K1); \
- X1 = _mm256_add_epi64(X1, R); \
+ X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \
X0 = _mm256_add_epi64(X0, T0); \
X1 = _mm256_add_epi64(X1, T1); \
- R = _mm256_add_epi64(R, ONE); \
} while(0)
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
@@ -95,64 +129,52 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
X2 = _mm256_add_epi64(X2, K0); \
X1 = _mm256_add_epi64(X1, K1); \
X3 = _mm256_add_epi64(X3, K1); \
- T1 = _mm256_add_epi64(T1, R); \
+ T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
X0 = _mm256_add_epi64(X0, T0); \
X2 = _mm256_add_epi64(X2, T0); \
X1 = _mm256_add_epi64(X1, T1); \
X3 = _mm256_add_epi64(X3, T1); \
- R = _mm256_add_epi64(R, ONE); \
} while(0)
-#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
+#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
do { \
+ rotate_keys(K1, K2, K0); \
THREEFISH_ROUND(X0, X1, ROTATE_1); \
THREEFISH_ROUND(X0, X1, ROTATE_2); \
THREEFISH_ROUND(X0, X1, ROTATE_3); \
THREEFISH_ROUND(X0, X1, ROTATE_4); \
- THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
+ THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \
\
THREEFISH_ROUND(X0, X1, ROTATE_5); \
THREEFISH_ROUND(X0, X1, ROTATE_6); \
THREEFISH_ROUND(X0, X1, ROTATE_7); \
THREEFISH_ROUND(X0, X1, ROTATE_8); \
- THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \
+ THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \
} while(0)
-#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
+#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
do { \
+ rotate_keys(K1, K2, K0); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
- THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \
\
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
- THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0); \
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \
} while(0)
- /*
- v1.0 key schedule: 9 ymm registers (only need 2 or 3)
- (0,1,2,3),(4,5,6,7) [8]
- then mutating with vpermq
- */
- const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
- const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
- const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
- const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
- const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
- const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
- const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
- const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
- const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
-
- const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
+ __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
+ __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
+ __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
__m256i* out_mm = reinterpret_cast<__m256i*>(out);
-
+
while(blocks >= 2)
{
__m256i X0 = _mm256_loadu_si256(in_mm++);
@@ -162,24 +184,20 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
- __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
-
interleave_epi64(X0, X1);
interleave_epi64(X2, X3);
- THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);
-
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);
-
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
- THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);
deinterleave_epi64(X0, X1);
deinterleave_epi64(X2, X3);
@@ -191,7 +209,7 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
blocks -= 2;
}
-
+
for(size_t i = 0; i != blocks; ++i)
{
__m256i X0 = _mm256_loadu_si256(in_mm++);
@@ -199,23 +217,19 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
- __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
-
interleave_epi64(X0, X1);
- THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);
-
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
-
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
+ THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
- THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
+ THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);
deinterleave_epi64(X0, X1);
@@ -226,7 +240,7 @@ void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) c
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
-#undef THREEFISH_ENC_2_8_ROUNDS
+#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
}
@@ -255,21 +269,35 @@ void Threefish_512_AVX2::decrypt_n(const byte in[], byte out[], size_t blocks) c
X0 = _mm256_sub_epi64(X0, X1); \
} while(0)
+#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) \
+ do { \
+ const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
+ X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \
+ X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
+ X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
+ X1 = _mm256_xor_si256(X1, X0); \
+ X3 = _mm256_xor_si256(X3, X2); \
+ X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
+ X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
+ X0 = _mm256_sub_epi64(X0, X1); \
+ X2 = _mm256_sub_epi64(X2, X3); \
+ } while(0)
+
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
do { \
const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
X0 = _mm256_sub_epi64(X0, K0); \
X1 = _mm256_sub_epi64(X1, K1); \
- X1 = _mm256_sub_epi64(X1, R); \
- R = _mm256_sub_epi64(R, ONE); \
+ X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); \
X0 = _mm256_sub_epi64(X0, T0); \
X1 = _mm256_sub_epi64(X1, T1); \
} while(0)
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
do { \
- THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \
+ THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \
THREEFISH_ROUND(X0, X1, ROTATE_8); \
THREEFISH_ROUND(X0, X1, ROTATE_7); \
THREEFISH_ROUND(X0, X1, ROTATE_6); \
@@ -282,6 +310,36 @@ void Threefish_512_AVX2::decrypt_n(const byte in[], byte out[], size_t blocks) c
THREEFISH_ROUND(X0, X1, ROTATE_1); \
} while(0)
+#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
+ do { \
+ const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
+ __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
+ X0 = _mm256_sub_epi64(X0, K0); \
+ X2 = _mm256_sub_epi64(X2, K0); \
+ X1 = _mm256_sub_epi64(X1, K1); \
+ X3 = _mm256_sub_epi64(X3, K1); \
+ T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
+ X0 = _mm256_sub_epi64(X0, T0); \
+ X2 = _mm256_sub_epi64(X2, T0); \
+ X1 = _mm256_sub_epi64(X1, T1); \
+ X3 = _mm256_sub_epi64(X3, T1); \
+ } while(0)
+
+#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
+ do { \
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
+ \
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
+ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
+ } while(0)
+
/*
v1.0 key schedule: 9 ymm registers (only need 2 or 3)
(0,1,2,3),(4,5,6,7) [8]
@@ -297,33 +355,64 @@ void Threefish_512_AVX2::decrypt_n(const byte in[], byte out[], size_t blocks) c
const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
- const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
-
const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
__m256i* out_mm = reinterpret_cast<__m256i*>(out);
- for(size_t i = 0; i != blocks; ++i)
+ while(blocks >= 2)
{
__m256i X0 = _mm256_loadu_si256(in_mm++);
__m256i X1 = _mm256_loadu_si256(in_mm++);
+ __m256i X2 = _mm256_loadu_si256(in_mm++);
+ __m256i X3 = _mm256_loadu_si256(in_mm++);
const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
- __m256i R = _mm256_set_epi64x(18, 0, 0, 0);
+ interleave_epi64(X0, X1);
+ interleave_epi64(X2, X3);
+
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
+ THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);
+
+ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);
+
+ deinterleave_epi64(X0, X1);
+ deinterleave_epi64(X2, X3);
+
+ _mm256_storeu_si256(out_mm++, X0);
+ _mm256_storeu_si256(out_mm++, X1);
+ _mm256_storeu_si256(out_mm++, X2);
+ _mm256_storeu_si256(out_mm++, X3);
+
+ blocks -= 2;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m256i X0 = _mm256_loadu_si256(in_mm++);
+ __m256i X1 = _mm256_loadu_si256(in_mm++);
+
+ const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
interleave_epi64(X0, X1);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
- THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
-
- THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
+ THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);
+
+ THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);
deinterleave_epi64(X0, X1);