aboutsummaryrefslogtreecommitdiffstats
path: root/src/block
diff options
context:
space:
mode:
authorlloyd <[email protected]>2013-12-24 22:24:54 +0000
committerlloyd <[email protected]>2013-12-24 22:24:54 +0000
commitbb9ca87f6bd9383cf27de115281cc9ea54284d3a (patch)
tree944c0e69b3387f04d60c1290c15ae7bbe719e5fe /src/block
parent32233249f38b7fcbce53699ab23c899352e17449 (diff)
Add Threefish-512 in AVX2
Tested using Intel's SDE, but no idea how fast it will be on hardware.
Diffstat (limited to 'src/block')
-rw-r--r--src/block/threefish/info.txt2
-rw-r--r--src/block/threefish/threefish.cpp72
-rw-r--r--src/block/threefish/threefish.h7
-rw-r--r--src/block/threefish_avx2/info.txt3
-rw-r--r--src/block/threefish_avx2/threefish_avx2.cpp146
-rw-r--r--src/block/threefish_avx2/threefish_avx2.h26
6 files changed, 226 insertions, 30 deletions
diff --git a/src/block/threefish/info.txt b/src/block/threefish/info.txt
index 91fc7bfb1..b352a49df 100644
--- a/src/block/threefish/info.txt
+++ b/src/block/threefish/info.txt
@@ -1 +1 @@
-define THREEFISH 20131209
+define THREEFISH_512 20131220
diff --git a/src/block/threefish/threefish.cpp b/src/block/threefish/threefish.cpp
index 6fb65a37e..ef639f03a 100644
--- a/src/block/threefish/threefish.cpp
+++ b/src/block/threefish/threefish.cpp
@@ -25,21 +25,22 @@ secure_vector<byte> Threefish_512::start(const byte tweak[], size_t tweak_len)
return secure_vector<byte>();
}
-void Threefish_512::update(secure_vector<byte>& blocks, size_t offset)
+void Threefish_512::full_inplace_update(byte* buf, size_t sz)
{
- byte* buf = &blocks[offset];
- size_t sz = blocks.size() - offset;
-
- BOTAN_ASSERT(sz % update_granularity() == 0, "Block sized input");
-
- BOTAN_ASSERT(m_T.size() == 3, "Tweak was set");
-
-#define THREEFISH_ROUND(I1,I2,I3,I4,I5,I6,I7,I8,ROT1,ROT2,ROT3,ROT4) \
+#define THREEFISH_ROUND(X0,X1,X2,X3,X4,X5,X6,X7,ROT1,ROT2,ROT3,ROT4) \
do { \
- X##I1 += X##I2; X##I2 = rotate_left(X##I2, ROT1) ^ X##I1; \
- X##I3 += X##I4; X##I4 = rotate_left(X##I4, ROT2) ^ X##I3; \
- X##I5 += X##I6; X##I6 = rotate_left(X##I6, ROT3) ^ X##I5; \
- X##I7 += X##I8; X##I8 = rotate_left(X##I8, ROT4) ^ X##I7; \
+ X0 += X4; \
+ X1 += X5; \
+ X2 += X6; \
+ X3 += X7; \
+ X4 = rotate_left(X4, ROT1); \
+ X5 = rotate_left(X5, ROT2); \
+ X6 = rotate_left(X6, ROT3); \
+ X7 = rotate_left(X7, ROT4); \
+ X4 ^= X0; \
+ X5 ^= X1; \
+ X6 ^= X2; \
+ X7 ^= X3; \
} while(0)
#define THREEFISH_INJECT_KEY(r) \
@@ -54,21 +55,21 @@ void Threefish_512::update(secure_vector<byte>& blocks, size_t offset)
X7 += m_K[(r+7) % 9] + (r); \
} while(0)
-#define THREEFISH_8_ROUNDS(R1,R2) \
- do { \
- THREEFISH_ROUND(0,1,2,3,4,5,6,7, 46,36,19,37); \
- THREEFISH_ROUND(2,1,4,7,6,5,0,3, 33,27,14,42); \
- THREEFISH_ROUND(4,1,6,3,0,5,2,7, 17,49,36,39); \
- THREEFISH_ROUND(6,1,0,7,2,5,4,3, 44, 9,54,56); \
- \
- THREEFISH_INJECT_KEY(R1); \
- \
- THREEFISH_ROUND(0,1,2,3,4,5,6,7, 39,30,34,24); \
- THREEFISH_ROUND(2,1,4,7,6,5,0,3, 13,50,10,17); \
- THREEFISH_ROUND(4,1,6,3,0,5,2,7, 25,29,39,43); \
- THREEFISH_ROUND(6,1,0,7,2,5,4,3, 8,35,56,22); \
- \
- THREEFISH_INJECT_KEY(R2); \
+#define THREEFISH_8_ROUNDS(R1,R2) \
+ do { \
+ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 46,36,19,37); \
+ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 33,27,14,42); \
+ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 17,49,36,39); \
+ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 44, 9,54,56); \
+ \
+ THREEFISH_INJECT_KEY(R1); \
+ \
+ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 39,30,34,24); \
+ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 13,50,10,17); \
+ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 25,29,39,43); \
+ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 8,35,56,22); \
+ \
+ THREEFISH_INJECT_KEY(R2); \
} while(0)
while(sz)
@@ -105,6 +106,17 @@ void Threefish_512::update(secure_vector<byte>& blocks, size_t offset)
#undef THREEFISH_ROUND
}
+void Threefish_512::update(secure_vector<byte>& buf, size_t offset)
+ {
+ BOTAN_ASSERT(m_K.size() == 9, "Key was set");
+ BOTAN_ASSERT(m_T.size() == 3, "Tweak was set");
+
+ const size_t sz = buf.size() - offset;
+ BOTAN_ASSERT(sz % block_size() == 0, "Block sized input");
+
+ full_inplace_update(&buf[offset], sz);
+ }
+
Key_Length_Specification Threefish_512::key_spec() const
{
return Key_Length_Specification(64);
@@ -138,7 +150,7 @@ size_t Threefish_512::output_length(size_t input_length) const
size_t Threefish_512::update_granularity() const
{
- return 64; // single block
+ return block_size();
}
size_t Threefish_512::minimum_final_size() const
@@ -148,6 +160,7 @@ size_t Threefish_512::minimum_final_size() const
size_t Threefish_512::default_nonce_length() const
{
+ // todo: define encoding for smaller nonces
return 16;
}
@@ -159,6 +172,7 @@ bool Threefish_512::valid_nonce_length(size_t nonce_len) const
void Threefish_512::clear()
{
zeroise(m_K);
+ zeroise(m_T);
}
}
diff --git a/src/block/threefish/threefish.h b/src/block/threefish/threefish.h
index b7806a93d..0e13221cc 100644
--- a/src/block/threefish/threefish.h
+++ b/src/block/threefish/threefish.h
@@ -40,6 +40,13 @@ class BOTAN_DLL Threefish_512 : public Transformation
void clear();
+ size_t block_size() const { return 64; }
+
+ protected:
+ const secure_vector<u64bit>& get_T() const { return m_T; }
+ const secure_vector<u64bit>& get_K() const { return m_K; }
+
+ virtual void full_inplace_update(byte* buf, size_t sz);
private:
void key_schedule(const byte key[], size_t key_len) override;
diff --git a/src/block/threefish_avx2/info.txt b/src/block/threefish_avx2/info.txt
new file mode 100644
index 000000000..3f62629b9
--- /dev/null
+++ b/src/block/threefish_avx2/info.txt
@@ -0,0 +1,3 @@
+define THREEFISH_512_AVX2 20131224
+
+need_isa avx2
diff --git a/src/block/threefish_avx2/threefish_avx2.cpp b/src/block/threefish_avx2/threefish_avx2.cpp
new file mode 100644
index 000000000..f77cfee47
--- /dev/null
+++ b/src/block/threefish_avx2/threefish_avx2.cpp
@@ -0,0 +1,146 @@
+/*
+* Threefish-512
+* (C) 2013 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#include <botan/threefish_avx2.h>
+#include <immintrin.h>
+
+namespace Botan {
+
+namespace {
+
+inline void interleave_epi64(__m256i& X0, __m256i& X1)
+ {
+ // interleave X0 and X1 qwords
+ // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
+
+ const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
+ const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);
+
+ X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
+ X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
+ }
+
+inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
+ {
+ const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
+ const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));
+
+ X0 = _mm256_unpacklo_epi64(T0, T1);
+ X1 = _mm256_unpackhi_epi64(T0, T1);
+ }
+
+}
+
+void Threefish_512_AVX2::full_inplace_update(byte* buf, size_t sz)
+ {
+ const u64bit* K = &get_K()[0];
+ const u64bit* T_64 = &get_T()[0];
+
+ const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
+ const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
+ const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
+ const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
+
+ const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
+ const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
+ const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
+ const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
+
+#define THREEFISH_ROUND(X0,X1,SHL) \
+ do { \
+ const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
+ X0 = _mm256_add_epi64(X0, X1); \
+ X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
+ X1 = _mm256_xor_si256(X1, X0); \
+ X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
+ } while(0)
+
+#define THREEFISH_INJECT_KEY(r, K0, K1, T0I, T1I) \
+ do { \
+ const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
+ const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
+ X0 = _mm256_add_epi64(X0, K0); \
+ X1 = _mm256_add_epi64(X1, K1); \
+ X1 = _mm256_add_epi64(X1, R); \
+ R = _mm256_add_epi64(R, ONE); \
+ X0 = _mm256_add_epi64(X0, T0); \
+ X1 = _mm256_add_epi64(X1, T1); \
+ } while(0)
+
+#define THREEFISH_8_ROUNDS(R1, R2, K1, K2, K3, T0, T1, T2) \
+ do { \
+ THREEFISH_ROUND(X0, X1, ROTATE_1); \
+ THREEFISH_ROUND(X0, X1, ROTATE_2); \
+ THREEFISH_ROUND(X0, X1, ROTATE_3); \
+ THREEFISH_ROUND(X0, X1, ROTATE_4); \
+ \
+ THREEFISH_INJECT_KEY(R1, K1, K2, T0, T1); \
+ \
+ THREEFISH_ROUND(X0,X1, ROTATE_5); \
+ THREEFISH_ROUND(X0,X1, ROTATE_6); \
+ THREEFISH_ROUND(X0,X1, ROTATE_7); \
+ THREEFISH_ROUND(X0,X1, ROTATE_8); \
+ \
+ THREEFISH_INJECT_KEY(R2, K2, K3, T2, T0); \
+ } while(0)
+
+ /*
+ v1.0 key schedule: 9 ymm registers (only need 2 or 3)
+ (0,1,2,3),(4,5,6,7) [8]
+ then mutating with vpermq
+ */
+ const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
+ const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
+ const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
+ const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
+ const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
+ const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
+ const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
+ const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
+ const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
+
+ const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
+
+ while(sz >= 64)
+ {
+ __m256i* buf_mm = reinterpret_cast<__m256i*>(buf);
+
+ __m256i X0 = _mm256_loadu_si256(buf_mm);
+ __m256i X1 = _mm256_loadu_si256(buf_mm + 1);
+
+ __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
+
+ __m256i R = _mm256_set1_epi64x(0);
+
+ interleave_epi64(X0, X1);
+
+ THREEFISH_INJECT_KEY(0, K0, K1, 2, 3);
+
+ THREEFISH_8_ROUNDS( 1, 2, K1,K2,K3, 1, 2, 3);
+ THREEFISH_8_ROUNDS( 3, 4, K3,K4,K5, 2, 3, 1);
+ THREEFISH_8_ROUNDS( 5, 6, K5,K6,K7, 3, 1, 2);
+
+ THREEFISH_8_ROUNDS( 7, 8, K7,K8,K0, 1, 2, 3);
+ THREEFISH_8_ROUNDS( 9, 10, K0,K1,K2, 2, 3, 1);
+ THREEFISH_8_ROUNDS(11, 12, K2,K3,K4, 3, 1, 2);
+
+ THREEFISH_8_ROUNDS(13, 14, K4,K5,K6, 1, 2, 3);
+ THREEFISH_8_ROUNDS(15, 16, K6,K7,K8, 2, 3, 1);
+ THREEFISH_8_ROUNDS(17, 18, K8,K0,K1, 3, 1, 2);
+
+ deinterleave_epi64(X0, X1);
+
+ _mm256_storeu_si256(buf_mm, X0);
+ _mm256_storeu_si256(buf_mm + 1, X1);
+
+ buf += 64;
+ sz -= 64;
+ }
+ }
+
+}
diff --git a/src/block/threefish_avx2/threefish_avx2.h b/src/block/threefish_avx2/threefish_avx2.h
new file mode 100644
index 000000000..f9a0b666f
--- /dev/null
+++ b/src/block/threefish_avx2/threefish_avx2.h
@@ -0,0 +1,26 @@
+/*
+* Threefish-512 in AVX2
+* (C) 2013 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_THREEFISH_AVX2_H__
+#define BOTAN_THREEFISH_AVX2_H__
+
+#include <botan/threefish.h>
+
+namespace Botan {
+
+/**
+* Threefish-512
+*/
+class BOTAN_DLL Threefish_512_AVX2 : public Threefish_512
+ {
+ private:
+ void full_inplace_update(byte* buf, size_t sz) override;
+ };
+
+}
+
+#endif