aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2016-09-05 13:24:31 -0400
committer Jack Lloyd <[email protected]> 2016-09-05 13:24:31 -0400
commit 5178ba78da3a7cbf63165504886e27f3184862cf (patch)
tree 5bc8c7beff516d9e79966eb0b04f8afe41ce958f /src
parent 743320d7007cc66f3ebeebeaafb7b42f1a0362ff (diff)
parent ac3d1ea17c0dfc0c279a6715a110c738ee615b3f (diff)
Merge GH #616 ChaCha SSE2 optimizations
Diffstat (limited to 'src')
-rw-r--r-- src/cli/speed.cpp 5
-rw-r--r-- src/lib/stream/chacha/chacha.cpp 160
-rw-r--r-- src/lib/stream/chacha/chacha.h 8
-rw-r--r-- src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp 259
-rw-r--r-- src/lib/stream/chacha/chacha_sse2/info.txt 3
-rw-r--r-- src/tests/data/stream/chacha.vec 8
6 files changed, 371 insertions, 72 deletions
diff --git a/src/cli/speed.cpp b/src/cli/speed.cpp
index 222a98d3f..c1f3a91e8 100644
--- a/src/cli/speed.cpp
+++ b/src/cli/speed.cpp
@@ -521,10 +521,11 @@ class Speed final : public Command
Timer encrypt_timer(cipher.name(), provider, "encrypt", buffer.size());
+ const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
+ cipher.set_key(key);
+
while(encrypt_timer.under(runtime))
{
- const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
- cipher.set_key(key);
encrypt_timer.run([&] { cipher.encipher(buffer); });
}
diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp
index 40da93029..c35363112 100644
--- a/src/lib/stream/chacha/chacha.cpp
+++ b/src/lib/stream/chacha/chacha.cpp
@@ -7,6 +7,7 @@
#include <botan/chacha.h>
#include <botan/loadstor.h>
+#include <botan/cpuid.h>
namespace Botan {
@@ -16,58 +17,87 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds)
throw Invalid_Argument("ChaCha only supports 8, 12 or 20 rounds");
}
-namespace {
-
-void chacha(byte output[64], const u32bit input[16], size_t rounds)
+/*
+* Generate four consecutive 64-byte ChaCha blocks from the state in input
+* and write them to output (4*64 bytes). On return the block counter held
+* in input[12] (low word) and input[13] (high word) has been advanced by
+* four. Uses the SSE2 implementation when it is compiled in and the CPU
+* reports SSE2 support, otherwise the portable code below.
+*/
+void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds)
{
BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
- u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
- x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
- x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
- x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
-
-#define CHACHA_QUARTER_ROUND(a, b, c, d) \
- do { \
- a += b; d ^= a; d = rotate_left(d, 16); \
- c += d; b ^= c; b = rotate_left(b, 12); \
- a += b; d ^= a; d = rotate_left(d, 8); \
- c += d; b ^= c; b = rotate_left(b, 7); \
- } while(0)
-
- for(size_t i = 0; i != rounds / 2; ++i)
+#if defined(BOTAN_HAS_CHACHA_SSE2)
+ if(CPUID::has_sse2())
{
- CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
- CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
- CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
- CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
-
- CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
- CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
- CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
- CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+ return ChaCha::chacha_sse2_x4(output, input, rounds);
}
+#endif
+
+ // Portable path: one block per iteration (TODO interleave rounds)
+ for(size_t i = 0; i != 4; ++i)
+ {
+ u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
+ x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
+ x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
+ x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+#define CHACHA_QUARTER_ROUND(a, b, c, d) \
+ do { \
+ a += b; d ^= a; d = rotate_left(d, 16); \
+ c += d; b ^= c; b = rotate_left(b, 12); \
+ a += b; d ^= a; d = rotate_left(d, 8); \
+ c += d; b ^= c; b = rotate_left(b, 7); \
+ } while(0)
+
+ // Each pass is a column round followed by a diagonal round
+ for(size_t r = 0; r != rounds / 2; ++r)
+ {
+ CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
+ CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
+ CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
+ CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
+
+ CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
+ CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
+ CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
+ CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+ }
#undef CHACHA_QUARTER_ROUND
- store_le(x00 + input[ 0], output + 4 * 0);
- store_le(x01 + input[ 1], output + 4 * 1);
- store_le(x02 + input[ 2], output + 4 * 2);
- store_le(x03 + input[ 3], output + 4 * 3);
- store_le(x04 + input[ 4], output + 4 * 4);
- store_le(x05 + input[ 5], output + 4 * 5);
- store_le(x06 + input[ 6], output + 4 * 6);
- store_le(x07 + input[ 7], output + 4 * 7);
- store_le(x08 + input[ 8], output + 4 * 8);
- store_le(x09 + input[ 9], output + 4 * 9);
- store_le(x10 + input[10], output + 4 * 10);
- store_le(x11 + input[11], output + 4 * 11);
- store_le(x12 + input[12], output + 4 * 12);
- store_le(x13 + input[13], output + 4 * 13);
- store_le(x14 + input[14], output + 4 * 14);
- store_le(x15 + input[15], output + 4 * 15);
+ // Feed-forward: add the input state for this block to the mixed words
+ x00 += input[0];
+ x01 += input[1];
+ x02 += input[2];
+ x03 += input[3];
+ x04 += input[4];
+ x05 += input[5];
+ x06 += input[6];
+ x07 += input[7];
+ x08 += input[8];
+ x09 += input[9];
+ x10 += input[10];
+ x11 += input[11];
+ x12 += input[12];
+ x13 += input[13];
+ x14 += input[14];
+ x15 += input[15];
+
+ store_le(x00, output + 64 * i + 4 * 0);
+ store_le(x01, output + 64 * i + 4 * 1);
+ store_le(x02, output + 64 * i + 4 * 2);
+ store_le(x03, output + 64 * i + 4 * 3);
+ store_le(x04, output + 64 * i + 4 * 4);
+ store_le(x05, output + 64 * i + 4 * 5);
+ store_le(x06, output + 64 * i + 4 * 6);
+ store_le(x07, output + 64 * i + 4 * 7);
+ store_le(x08, output + 64 * i + 4 * 8);
+ store_le(x09, output + 64 * i + 4 * 9);
+ store_le(x10, output + 64 * i + 4 * 10);
+ store_le(x11, output + 64 * i + 4 * 11);
+ store_le(x12, output + 64 * i + 4 * 12);
+ store_le(x13, output + 64 * i + 4 * 13);
+ store_le(x14, output + 64 * i + 4 * 14);
+ store_le(x15, output + 64 * i + 4 * 15);
+
+ // Advance the block counter by one; the low word wrapping to zero is
+ // the only case that carries into the high word.
+ input[12]++;
+ input[13] += (input[12] == 0);
+ }
}
-}
/*
* Combine cipher stream with message
@@ -80,11 +110,7 @@ void ChaCha::cipher(const byte in[], byte out[], size_t length)
length -= (m_buffer.size() - m_position);
in += (m_buffer.size() - m_position);
out += (m_buffer.size() - m_position);
- chacha(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -106,8 +132,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
const u32bit* CONSTANTS = (length == 16) ? TAU : SIGMA;
+ // Repeat the key if 128 bits
+ const byte* key2 = (length == 32) ? key + 16 : key;
+
+ m_position = 0;
m_state.resize(16);
- m_buffer.resize(64);
+ m_buffer.resize(4*64);
m_state[0] = CONSTANTS[0];
m_state[1] = CONSTANTS[1];
@@ -119,16 +149,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
m_state[6] = load_le<u32bit>(key, 2);
m_state[7] = load_le<u32bit>(key, 3);
- if(length == 32)
- key += 16;
-
- m_state[8] = load_le<u32bit>(key, 0);
- m_state[9] = load_le<u32bit>(key, 1);
- m_state[10] = load_le<u32bit>(key, 2);
- m_state[11] = load_le<u32bit>(key, 3);
-
- m_position = 0;
+ m_state[8] = load_le<u32bit>(key2, 0);
+ m_state[9] = load_le<u32bit>(key2, 1);
+ m_state[10] = load_le<u32bit>(key2, 2);
+ m_state[11] = load_le<u32bit>(key2, 3);
+ // Default all-zero IV
const byte ZERO[8] = { 0 };
set_iv(ZERO, sizeof(ZERO));
}
@@ -153,10 +179,7 @@ void ChaCha::set_iv(const byte iv[], size_t length)
m_state[15] = load_le<u32bit>(iv, 2);
}
- chacha(m_buffer.data(), m_state.data(), m_rounds);
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -176,12 +199,11 @@ void ChaCha::seek(u64bit offset)
{
if (m_state.size() == 0 && m_buffer.size() == 0)
{
- throw Invalid_State("You have to setup the stream cipher (key and iv)");
+ throw Invalid_State("You have to setup the stream cipher (key and iv)");
}
- m_position = offset % m_buffer.size();
-
- u64bit counter = offset / m_buffer.size();
+ // Find the block offset
+ u64bit counter = offset / 64;
byte out[8];
@@ -190,9 +212,7 @@ void ChaCha::seek(u64bit offset)
m_state[12] = load_le<u32bit>(out, 0);
m_state[13] += load_le<u32bit>(out, 1);
- chacha(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
+ m_position = offset % 64;
}
}
diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h
index f8f42e41d..e4d4ae60e 100644
--- a/src/lib/stream/chacha/chacha.h
+++ b/src/lib/stream/chacha/chacha.h
@@ -47,6 +47,14 @@ class BOTAN_DLL ChaCha final : public StreamCipher
private:
void key_schedule(const byte key[], size_t key_len) override;
+ void incr_state_counter(size_t howmany);
+
+ void chacha_x4(byte output[64*4], u32bit state[16], size_t rounds);
+
+#if defined(BOTAN_HAS_CHACHA_SSE2)
+ void chacha_sse2_x4(byte output[64*4], u32bit state[16], size_t rounds);
+#endif
+
size_t m_rounds;
secure_vector<u32bit> m_state;
secure_vector<byte> m_buffer;
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
new file mode 100644
index 000000000..e39b285b3
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -0,0 +1,259 @@
+/*
+* SSE2 ChaCha
+* (C) 2016 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/chacha.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+/*
+* SSE2 ChaCha producing four consecutive 64-byte blocks at once.
+*
+* Reads the 16-word state from input, writes 4*64 bytes of keystream to
+* output, and on return has advanced the block counter held in input[12]
+* (low word) and input[13] (high word) by four. Each block is kept as four
+* 128-bit rows (rN_0..rN_3 for block N); the four blocks differ only in the
+* counter stored in the low 64 bits of row 3.
+*/
+void ChaCha::chacha_sse2_x4(byte output[64*4], u32bit input[16], size_t rounds)
+ {
+ BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
+
+ const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
+ __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+
+ const __m128i input0 = _mm_loadu_si128(input_mm);
+ const __m128i input1 = _mm_loadu_si128(input_mm + 1);
+ const __m128i input2 = _mm_loadu_si128(input_mm + 2);
+ const __m128i input3 = _mm_loadu_si128(input_mm + 3);
+
+ // Row 3 for blocks 1..3: the 64-bit add bumps words 12/13 as a single
+ // counter, so a wrap of word 12 carries into word 13.
+ const __m128i input3_b1 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 1));
+ const __m128i input3_b2 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 2));
+ const __m128i input3_b3 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 3));
+
+ // TODO: try transposing, which would avoid the permutations each round
+
+#define mm_rotl(r, n) \
+ _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
+
+ __m128i r0_0 = input0;
+ __m128i r0_1 = input1;
+ __m128i r0_2 = input2;
+ __m128i r0_3 = input3;
+
+ __m128i r1_0 = input0;
+ __m128i r1_1 = input1;
+ __m128i r1_2 = input2;
+ __m128i r1_3 = input3_b1;
+
+ __m128i r2_0 = input0;
+ __m128i r2_1 = input1;
+ __m128i r2_2 = input2;
+ __m128i r2_3 = input3_b2;
+
+ __m128i r3_0 = input0;
+ __m128i r3_1 = input1;
+ __m128i r3_2 = input2;
+ __m128i r3_3 = input3_b3;
+
+ for(size_t r = 0; r != rounds / 2; ++r)
+ {
+ // Column round on all four blocks
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 16);
+ r1_3 = mm_rotl(r1_3, 16);
+ r2_3 = mm_rotl(r2_3, 16);
+ r3_3 = mm_rotl(r3_3, 16);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 12);
+ r1_1 = mm_rotl(r1_1, 12);
+ r2_1 = mm_rotl(r2_1, 12);
+ r3_1 = mm_rotl(r3_1, 12);
+
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 8);
+ r1_3 = mm_rotl(r1_3, 8);
+ r2_3 = mm_rotl(r2_3, 8);
+ r3_3 = mm_rotl(r3_3, 8);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 7);
+ r1_1 = mm_rotl(r1_1, 7);
+ r2_1 = mm_rotl(r2_1, 7);
+ r3_1 = mm_rotl(r3_1, 7);
+
+ // Rotate rows 1..3 so the diagonals line up as columns
+ r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ // Diagonal round on all four blocks
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 16);
+ r1_3 = mm_rotl(r1_3, 16);
+ r2_3 = mm_rotl(r2_3, 16);
+ r3_3 = mm_rotl(r3_3, 16);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 12);
+ r1_1 = mm_rotl(r1_1, 12);
+ r2_1 = mm_rotl(r2_1, 12);
+ r3_1 = mm_rotl(r3_1, 12);
+
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 8);
+ r1_3 = mm_rotl(r1_3, 8);
+ r2_3 = mm_rotl(r2_3, 8);
+ r3_3 = mm_rotl(r3_3, 8);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 7);
+ r1_1 = mm_rotl(r1_1, 7);
+ r2_1 = mm_rotl(r2_1, 7);
+ r3_1 = mm_rotl(r3_1, 7);
+
+ // Undo the row rotation to restore column order
+ r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
+ }
+
+ // Feed-forward: add each block's own input state word by word (32-bit
+ // lanes). Row 3 must use the counter-adjusted row computed above; doing
+ // a 64-bit add on the already mixed row would derive the carry into
+ // word 13 from keystream data instead of from the counter.
+ r0_0 = _mm_add_epi32(r0_0, input0);
+ r0_1 = _mm_add_epi32(r0_1, input1);
+ r0_2 = _mm_add_epi32(r0_2, input2);
+ r0_3 = _mm_add_epi32(r0_3, input3);
+
+ r1_0 = _mm_add_epi32(r1_0, input0);
+ r1_1 = _mm_add_epi32(r1_1, input1);
+ r1_2 = _mm_add_epi32(r1_2, input2);
+ r1_3 = _mm_add_epi32(r1_3, input3_b1);
+
+ r2_0 = _mm_add_epi32(r2_0, input0);
+ r2_1 = _mm_add_epi32(r2_1, input1);
+ r2_2 = _mm_add_epi32(r2_2, input2);
+ r2_3 = _mm_add_epi32(r2_3, input3_b2);
+
+ r3_0 = _mm_add_epi32(r3_0, input0);
+ r3_1 = _mm_add_epi32(r3_1, input1);
+ r3_2 = _mm_add_epi32(r3_2, input2);
+ r3_3 = _mm_add_epi32(r3_3, input3_b3);
+
+ _mm_storeu_si128(output_mm + 0, r0_0);
+ _mm_storeu_si128(output_mm + 1, r0_1);
+ _mm_storeu_si128(output_mm + 2, r0_2);
+ _mm_storeu_si128(output_mm + 3, r0_3);
+
+ _mm_storeu_si128(output_mm + 4, r1_0);
+ _mm_storeu_si128(output_mm + 5, r1_1);
+ _mm_storeu_si128(output_mm + 6, r1_2);
+ _mm_storeu_si128(output_mm + 7, r1_3);
+
+ _mm_storeu_si128(output_mm + 8, r2_0);
+ _mm_storeu_si128(output_mm + 9, r2_1);
+ _mm_storeu_si128(output_mm + 10, r2_2);
+ _mm_storeu_si128(output_mm + 11, r2_3);
+
+ _mm_storeu_si128(output_mm + 12, r3_0);
+ _mm_storeu_si128(output_mm + 13, r3_1);
+ _mm_storeu_si128(output_mm + 14, r3_2);
+ _mm_storeu_si128(output_mm + 15, r3_3);
+
+#undef mm_rotl
+
+ // Advance the stored counter past the four blocks just produced; a
+ // result smaller than 4 means the low word wrapped.
+ input[12] += 4;
+ if(input[12] < 4)
+ input[13]++;
+ }
+
+}
diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt
new file mode 100644
index 000000000..965479746
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/info.txt
@@ -0,0 +1,3 @@
+define CHACHA_SSE2 20160831
+
+need_isa sse2
diff --git a/src/tests/data/stream/chacha.vec b/src/tests/data/stream/chacha.vec
index 881513706..830684b2c 100644
--- a/src/tests/data/stream/chacha.vec
+++ b/src/tests/data/stream/chacha.vec
@@ -124,6 +124,14 @@ Nonce = 000000000000000000000002
In = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Out = C2C64D378CD536374AE204B9EF933FCD1A8B2288B3DFA49672AB765B54EE27C78A970E0E955C14F3A88E741B97C286F75F8FC299E8148362FA198A39531BED6D
+# Long output tests generated by DJB ref impl
+
+Key = 0000000000000000000000000000000000000000000000000000000000000000
+Nonce = 000000000000000000000000
+In = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+Out = 76B8E0ADA0F13D90405D6AE55386BD28BDD219B8A08DED1AA836EFCC8B770DC7DA41597C5157488D7724E03FB8D84A376A43B8F41518A11CC387B669B2EE65869F07E7BE5551387A98BA977C732D080DCB0F29A048E3656912C6533E32EE7AED29B721769CE64E43D57133B074D839D531ED1F28510AFB45ACE10A1F4B794D6F2D09A0E663266CE1AE7ED1081968A0758E718E997BD362C6B0C34634A9A0B35D012737681F7B5D0F281E3AFDE458BC1E73D2D313C9CF94C05FF3716240A248F21320A058D7B3566BD520DAAA3ED2BF0AC5B8B120FB852773C3639734B45C91A42DD4CB83F8840D2EEDB158131062AC3F1F2CF8FF6DCD1856E86A1E6C3167167EE5A688742B47C5ADFB59D4DF76FD1DB1E51EE03B1CA9F82ACA173EDB8B7293474EBE980F904D10C916442B4783A0E984860CB6C957B39C38ED8F51CFFAA68A4DE01025A39C504546B9DC1406A7EB28151E5150D7B204BAA719D4F091021217DB5CF1B5C84C4FA71A879610A1A695AC527C5B56774A6B8A21AAE88685868E094CF29EF4090AF7A90CC07E8817AA528763797D3C332B67CA4BC110642C2151EC47EE84CB8C42D85F10E2A8CB18C3B7335F26E8C39A12B1BCC1707177B76138732EEDAAB74DA1410FC055EA068C99E9260ACBE337CF5D3E00E5B3230FFEDB0B990787D0C70E0BFE4198EA6758DD5A61FB5FEC2DF981F31BEFE153F81D17161784DB1C8822D53CD1EE7DB532364828BDF404B040A8DCC522F3D3D99AEC4B8057EDB8500931A2C42D2F0C570847100B5754DAFC5FBDB894BBEF1A2DE1A07F8BA0C4B919301066EDBC056B7B481E7A0C46297BBB589D9DA5B675A6723E152E5E63A4CE034E9E83E58A013AF0E7352FB7908514E3B3D1040D0BB963B3954B636B5FD4BF6D0AADBAF8157D062ACB2418C176A475511B35C3F6218A5668EA5BC6F54B8782F8B340F00AC1BEBA5E62CD632A7CE7809C725608ACA5EFBF7C41F237643F06C0997207171DE867F9D697BF5EA6011ABCCE6C8CDB211394D2C02DD0FB60DB5A2C17AC3DC85878A90BED3809DBB96EAA5426FC8EAE0D2D65C42A479F088648BE2DC801D82A366FDDC0EF234263C0B6417D5F9DA41817B88D68E5E67195C5C1EE3095E821F22524B20BE41CEB590412E41DC648843FA9BFEC7A3DCF61AB0541573316D3FA8151629303FE9741562ED065DB4EBC0050EF558364AE81124A28F5C01313232FBC496DFD8A2568657B686D7214382A1A00903017DDA969878442BA5AFFF6613F553CBB233CE46D9AEE93A7876CF5E9E82912B18CADF0B34327B2E0427ECF66B7CEB7C0918DC47BDFF12A062ADF07133009CE7A5E5C917E0168306109B7CB49653A6D2CAEF005DE783A9A9BFE05381ED1348D94EC65886F9C0B619C52C5533800B16
C836172B95182DBC5EEC042B89E22F11A085B739A3611CD8D836018C4FFF0B86C02ED662D2D2522647A1F09A7B2F9EEA56E7E20B1F06CCDD9CEC37E3B2D20812DF369978636C22646603675804104745D2997E28DF5D8242AAD19C8120CA4142FB6019FCCECF9FADB04ADE03B341E3FC77201B3DC957A8097AB2F615AFF142AB753811D5F32E75BC8825B456555F3D179FFABCF35F6AE61365851F3F681A2E86E8078B064976646186394CB9064767750DAD4E336B8F1D20FE2C13C6248D3D73D4D66D9C8587AC68A7976A3BBB8B5808320607400DBDB1918E3D3B90CFC38C4DDFADE990A213D208FBF7898334F4DEED7E5830FD266751315435AE19BB94F4D3DC92652F243DD1F96F3595AB473D2356D8FA8F6D64CC4F64B12CA99ECDD1962572E6ADD609D9C619AAB678B3FC298BC2F0F81FEB4F0D3EBAD7E850A8BCB52CA467E649DE2DB913BFDA001294C49DC369F7D14CC25C5FA65D4D5AF6A436D22BD2839BE23DD3C57825033FECDCE2DED6C511DBEAF4DF2B4CBB7AF8215BB48A550F57D02750E599298F512B1EC1829722FC10A5ACF9537E392A728455905D3AB4837DECE4B63FDFD5DD07A2B76A8C82566DF1A2167DAE5E125B6AA0E76B9D99CA84664F50EEEA54E449F0E587039137F57543D89205483141C933166B61990A706ACA07F467D22BC34C6552F5BBA91CB1FC21DB51D03DFFF6523A5E1B4285D54C47660EDA1B290E4087B30651B542305A714E98A8233577D2AFB383E402F6B9FD214B194C738886BD2289CC5F997951910994B0A6104092FBC9B385639343CF26C9FAF845E7A98CB1F2C9306E8200185D95DE059F83AD17C4B97F8C62CF6C347DC6EB5F2B1F4BF2DD328130D4500CA39BEBA2D4281A3D8CEB4CB1ECDE378B20029FB6A4C543312E41013915C57016E5DA681944CC277F9C7E75F4A654AB2E5DC646ADA242B6223AACC63674F9702146723360811ADBDF2BB938B595BF4C688A8A844130D9DA3F0EFE3650C2283640B342F8922FB6DD10B8BBE35C7AEBEBA416CB0180FB7D2B171149018F8D880463AC26202C2B72F9A7CF83A917AD26183F8E74CD418E3B63459F7AD59849EE43CAC6DF3BB63FCEEC1ABE8E9E0B64B233A43AAC54F9BA0998D2219B3BACA111940D524B7CF94677D6C557750FA4DB9E1077EEDB5BA6E33C104AE25443C86BF1583353ADDF6FDDD19A4FF491188E3D4878769611B36427C8F4C705CF42338475C3185C123919B79B3A4887243B924509C9A4E7A3FFF0517021E51642D9B4526C28A0CF86FB254BE7EAB18701CA5919B754EC2506ECCC087AC6141B4C3A661A3D1A89E0D4DD2DF52CAA5B3402D0026B3C643FA7126E8ED101A94188A048B34AB61E1182D6BE76E2E9E6ACF401443ED0D997DD5AE67346CB1E1897911029
00225E6B955CD7C9E39FC7255021045FE7ECD40E2C68486A4C2FBCDBC53E847790DAFE5B2CBDCA09BF09DE327076C79F2A339A9F942DFA372B41A390EABBBF296383D438466AE6105B5058117B8E406DCAA62A98AC624D30BC8773DE643CCE7578E8D5C57C5718711421E6D04A182F8714E192EDF3935CB2E0380E10C77AA6583832DEB64CB41DC401541504E0420D06379E4830F06125018DB3810A684C4E888B3B88829CF97EC67FC8549703F9EA5D6B8F67CE9E060F765532C323DB034EC700DB819936FBE6F749FD37CE927663F439498C98C5104D69AE9CD8B04444A471F95390873346858625420ED783203F8DB371D766586137459505525CB3EE59A7FD8AC3C1DB3F5F4E0DAB62B43D1C4CD813D998ED83637F922F884A7584835BCDC03E9F1802B449DFA2D249D9ED7F2F9129815CD8D4EC4169EC51EAC449DDFA5BC0F232D47323D4C48AB558576160103820D485877CC2BBAB641D21D94E67B32CEFB4E9CE5DAC84D03ED3228EA3CDDDD33F3915B9E21B435BBED927A539B556FFCDD4C9822FD1CE81ACCA79B6F7050B0C01A3A317F066A1A89C81E4DF8C7EE2E042203BE8C290CF8605C193471953F8FE6055AC546670A4A075F70909246C3E3B92B39CF2F35AB49FE6FDA67072A6E1B82364820477FD522A746EAC07E0398E873790486ABEBB50EC59A9E4129B7CEA87C5182EB43BCDF096EF740EF67482E98165A4EC64EBE9D57C8D8C16CEAA81F3203DFF26B3D5BF01EDC0B0458B6B47846C326C75A822FC8B42774A3B201EF941484631406CBBEAC961CCF42860EBAC578703B485844BB9E01B2D6C2DF72B62B43BADE982B0BDD0501230FC76B820D2F5BD259EC791B33225E06AFC2D996C90E756DBD4257D53EA6D6FF2371330035B57BF54C9DC4FA95D61536AAC11CDB08A664EFB55BC91A4C6F7FBE3EFA8C621FD9F2CB343898979E7BCBD55B55217143493B6DE415A85D681A2405113F3ADA3EDAEE6EB0E78572D2BAF9C43E3FC39B93081CF2FB3F8879E810F417B733927525ACB6D026F6EC46395620547D057A539A391DE7C6F4B7095911C2D3AE075AD4F2C0C96D9F70E48A42CDCDAE542BAE833EB4A976D4F98410B4A3D77857762D1527EC6714A040BAAEC3BEC41BF9CFF00E1CF81CE61E95D97792D7C0DB7A88545F10D9B0A5940457018817725DA257766906FFBC6172B9C4D2D32A14D00C0D1D01E15280074A4A9FD2D21393F078EF55B16CFEA5327993263BFFE8E99E56837B2763ABD221ED85D83F9187AF8B9E928F00DEFF423FFFDADB786E6678A59AF305CDC02546D0F8AB4681ACC1F00069B0C47BBC9F13D12FD9411F8DF532096D53E4B7861839E602FC5DFA0D0B72232DD81D2B0E4B660A7EBA353DA27E66CEAF2D6C7734925247281866A12D67752
A1EDAADD1EA59E4E86E2E85A81A573CD68F6DFB526558D81A8F488F261F355DDAC23F6CAF07D27FDA71D8F3968D4CEEDA89A09DCFDD00C17FA6DB3658CC7AB02C0E5F44B1F526A7DB9269E4DCD1D11B8421C204C07A5DE46E48769579718C69532F1671E552B92EF1451FFBC7A2F412696C6D67EF071C988ACF61F39319CA02DC853247BE1F7F07B3BBD68C901EF36D46868F65D0FA1D1755EFF6C2802212908250207AB65FCE827F0A82F3140DDC692CB5742B3133C541E0FE17718B546AD1FC8CB8A3A5AF69A825A84343C378D7B54F917057A8D026D4A59931FF9E24AB95BCE2BEBDB3A4286E000A4C47E74CF82925DEA07686DD20DE228828705D90638661173E703E712D69B043DCEBAFDC53F132BE3D04CE50D70B80DD118DB00E45DFE953DB87525880BA9F7ABB0FC481A5F3ACFA363AA2153391AEF0DBF680512A0FA37A6C8293865CA95015DEDBF6A21992CE1FEB5F499359E288A3255EF3FBAEA5B01440BD787251CAF362326C7FFAD20086AA7FCF35E95BBF398DB75735E087E0863E016246703946C4E88BCA6FC21446146BD342AAE88AD4249F9A38CA3E4ED0A88AA6FD746F578E5B414CE0F2CE5EC76F87FA3B6321102EF7110B7E6D889B2B85BB11FCFD4AB2B3BD5F92B8A2F66166F0695C0B13F32F97EFAA0CDE4F28E1674BF120E03FA15980873A1C6AB674C085303FDD74832CC1DF5EF787AE11C6E6A9302C1488F36996C1703670786C05324F99D9D2CDA1624DB95B8EAB6F7E10AC5BC985D6536CF667307A577A2555888BD9302CFE704CF72E59FD28B6DAEA343A83531A10A732D65CE93F523A03E5E791EFD5AC502BB0A1F756E93208746240BC7BB1C085728CCC8150D8EA74B33AEBC59567E65B7E2BD83699F607412448D202D948BB111BADD456D68086FF9A5906EA3B2CDA4111D3638391F7A7B153EEA77AB47215D6FE13B350F59F884C6E31AC087239D9145B816424CBA2C8BCB7B3ED7E19638089D91E5C9136D2AEFC8DA165284B42229A70346296A7484648DAAFB9B88994D8823EFBCE9E029ACE51706FC1E6E194B7D8906BB83BE681A96D50F4A66F5AF24AFF5007AA2052277D75E1C6FCB719AE789587E73BFB11029B2AE9E380CD7B4FBD78F0C5318EF61802ECBFE5D91BA28184E96963366BE3D5B063B5B664002AB0682E5820E3F9D30AFD461B3046CFC86F29605458AE94054CC2967CA77F102F97BAE3454BC190726AF88CFFD5F0F05E18D31159EF2A9D9800F48D1B6719617474F1B3594315795822976613CB63A4D6996505F4A4C57F3454EC101F5303F5B84AA8236843FC51D63D8445AD5C9B4F4E15AA0395695CDE980A6D48936146C519FB84AB775647E2CF809135D10B12F7E632654C679F92988B79021746C867A8E05102BB98F8FE7D3D7
C005F690D9CA7C6A175FF26ED82B873D63DF4CBEA0218BAD0CB9A8C4404526F9806E2D4FBF37E84B756561C67C3A5732966DC4F0701063EAA7D3A52A2B5E4A8C9ACDB30EE8B9F5125F76BCF7B22DB897DE9A8A9560C54118F31DCCF49E87527D188FCB8
+
+
# Test seek offset
# Tests got from the original implementation of Daniel J. Bernstein
#