aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2016-09-05 13:24:31 -0400
committer Jack Lloyd <[email protected]> 2016-09-05 13:24:31 -0400
commit 5178ba78da3a7cbf63165504886e27f3184862cf (patch)
tree 5bc8c7beff516d9e79966eb0b04f8afe41ce958f /src
parent 743320d7007cc66f3ebeebeaafb7b42f1a0362ff (diff)
parent ac3d1ea17c0dfc0c279a6715a110c738ee615b3f (diff)
Merge GH #616 ChaCha SSE2 optimizations
Diffstat (limited to 'src')
-rw-r--r-- src/cli/speed.cpp 5
-rw-r--r-- src/lib/stream/chacha/chacha.cpp 160
-rw-r--r-- src/lib/stream/chacha/chacha.h 8
-rw-r--r-- src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp 259
-rw-r--r-- src/lib/stream/chacha/chacha_sse2/info.txt 3
-rw-r--r-- src/tests/data/stream/chacha.vec 8
6 files changed, 371 insertions, 72 deletions
diff --git a/src/cli/speed.cpp b/src/cli/speed.cpp
index 222a98d3f..c1f3a91e8 100644
--- a/src/cli/speed.cpp
+++ b/src/cli/speed.cpp
@@ -521,10 +521,11 @@ class Speed final : public Command
Timer encrypt_timer(cipher.name(), provider, "encrypt", buffer.size());
+ const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
+ cipher.set_key(key);
+
while(encrypt_timer.under(runtime))
{
- const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
- cipher.set_key(key);
encrypt_timer.run([&] { cipher.encipher(buffer); });
}
diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp
index 40da93029..c35363112 100644
--- a/src/lib/stream/chacha/chacha.cpp
+++ b/src/lib/stream/chacha/chacha.cpp
@@ -7,6 +7,7 @@
#include <botan/chacha.h>
#include <botan/loadstor.h>
+#include <botan/cpuid.h>
namespace Botan {
@@ -16,58 +17,87 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds)
throw Invalid_Argument("ChaCha only supports 8, 12 or 20 rounds");
}
-namespace {
-
-void chacha(byte output[64], const u32bit input[16], size_t rounds)
+// Writes four consecutive 64-byte blocks to output and advances the block counter in input[12] (low word) / input[13] (high word) by four
+void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds)
{
BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
- u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
- x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
- x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
- x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
-
-#define CHACHA_QUARTER_ROUND(a, b, c, d) \
- do { \
- a += b; d ^= a; d = rotate_left(d, 16); \
- c += d; b ^= c; b = rotate_left(b, 12); \
- a += b; d ^= a; d = rotate_left(d, 8); \
- c += d; b ^= c; b = rotate_left(b, 7); \
- } while(0)
-
- for(size_t i = 0; i != rounds / 2; ++i)
+#if defined(BOTAN_HAS_CHACHA_SSE2)
+ if(CPUID::has_sse2())
{
- CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
- CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
- CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
- CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
-
- CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
- CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
- CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
- CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+ return ChaCha::chacha_sse2_x4(output, input, rounds);
}
+#endif
+
+ // TODO interleave rounds
+ for(size_t i = 0; i != 4; ++i)
+ {
+ u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
+ x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
+ x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
+ x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+#define CHACHA_QUARTER_ROUND(a, b, c, d) \
+ do { \
+ a += b; d ^= a; d = rotate_left(d, 16); \
+ c += d; b ^= c; b = rotate_left(b, 12); \
+ a += b; d ^= a; d = rotate_left(d, 8); \
+ c += d; b ^= c; b = rotate_left(b, 7); \
+ } while(0)
+
+ for(size_t r = 0; r != rounds / 2; ++r) // one column round + one diagonal round per iteration
+ {
+ CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
+ CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
+ CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
+ CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
+
+ CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
+ CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
+ CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
+ CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+ }
#undef CHACHA_QUARTER_ROUND
- store_le(x00 + input[ 0], output + 4 * 0);
- store_le(x01 + input[ 1], output + 4 * 1);
- store_le(x02 + input[ 2], output + 4 * 2);
- store_le(x03 + input[ 3], output + 4 * 3);
- store_le(x04 + input[ 4], output + 4 * 4);
- store_le(x05 + input[ 5], output + 4 * 5);
- store_le(x06 + input[ 6], output + 4 * 6);
- store_le(x07 + input[ 7], output + 4 * 7);
- store_le(x08 + input[ 8], output + 4 * 8);
- store_le(x09 + input[ 9], output + 4 * 9);
- store_le(x10 + input[10], output + 4 * 10);
- store_le(x11 + input[11], output + 4 * 11);
- store_le(x12 + input[12], output + 4 * 12);
- store_le(x13 + input[13], output + 4 * 13);
- store_le(x14 + input[14], output + 4 * 14);
- store_le(x15 + input[15], output + 4 * 15);
+ x00 += input[0];
+ x01 += input[1];
+ x02 += input[2];
+ x03 += input[3];
+ x04 += input[4];
+ x05 += input[5];
+ x06 += input[6];
+ x07 += input[7];
+ x08 += input[8];
+ x09 += input[9];
+ x10 += input[10];
+ x11 += input[11];
+ x12 += input[12];
+ x13 += input[13];
+ x14 += input[14];
+ x15 += input[15];
+
+ store_le(x00, output + 64 * i + 4 * 0);
+ store_le(x01, output + 64 * i + 4 * 1);
+ store_le(x02, output + 64 * i + 4 * 2);
+ store_le(x03, output + 64 * i + 4 * 3);
+ store_le(x04, output + 64 * i + 4 * 4);
+ store_le(x05, output + 64 * i + 4 * 5);
+ store_le(x06, output + 64 * i + 4 * 6);
+ store_le(x07, output + 64 * i + 4 * 7);
+ store_le(x08, output + 64 * i + 4 * 8);
+ store_le(x09, output + 64 * i + 4 * 9);
+ store_le(x10, output + 64 * i + 4 * 10);
+ store_le(x11, output + 64 * i + 4 * 11);
+ store_le(x12, output + 64 * i + 4 * 12);
+ store_le(x13, output + 64 * i + 4 * 13);
+ store_le(x14, output + 64 * i + 4 * 14);
+ store_le(x15, output + 64 * i + 4 * 15);
+
+ input[12]++;
+ input[13] += (input[12] == 0); // carry only when the low counter word just wrapped to 0
+ }
}
-}
/*
* Combine cipher stream with message
@@ -80,11 +110,7 @@ void ChaCha::cipher(const byte in[], byte out[], size_t length)
length -= (m_buffer.size() - m_position);
in += (m_buffer.size() - m_position);
out += (m_buffer.size() - m_position);
- chacha(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -106,8 +132,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
const u32bit* CONSTANTS = (length == 16) ? TAU : SIGMA;
+ // Repeat the key if 128 bits
+ const byte* key2 = (length == 32) ? key + 16 : key;
+
+ m_position = 0;
m_state.resize(16);
- m_buffer.resize(64);
+ m_buffer.resize(4*64);
m_state[0] = CONSTANTS[0];
m_state[1] = CONSTANTS[1];
@@ -119,16 +149,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
m_state[6] = load_le<u32bit>(key, 2);
m_state[7] = load_le<u32bit>(key, 3);
- if(length == 32)
- key += 16;
-
- m_state[8] = load_le<u32bit>(key, 0);
- m_state[9] = load_le<u32bit>(key, 1);
- m_state[10] = load_le<u32bit>(key, 2);
- m_state[11] = load_le<u32bit>(key, 3);
-
- m_position = 0;
+ m_state[8] = load_le<u32bit>(key2, 0);
+ m_state[9] = load_le<u32bit>(key2, 1);
+ m_state[10] = load_le<u32bit>(key2, 2);
+ m_state[11] = load_le<u32bit>(key2, 3);
+ // Default all-zero IV
const byte ZERO[8] = { 0 };
set_iv(ZERO, sizeof(ZERO));
}
@@ -153,10 +179,7 @@ void ChaCha::set_iv(const byte iv[], size_t length)
m_state[15] = load_le<u32bit>(iv, 2);
}
- chacha(m_buffer.data(), m_state.data(), m_rounds);
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -176,12 +199,11 @@ void ChaCha::seek(u64bit offset)
{
if (m_state.size() == 0 && m_buffer.size() == 0)
{
- throw Invalid_State("You have to setup the stream cipher (key and iv)");
+ throw Invalid_State("You have to setup the stream cipher (key and iv)");
}
- m_position = offset % m_buffer.size();
-
- u64bit counter = offset / m_buffer.size();
+ // Find the block offset
+ u64bit counter = offset / 64;
byte out[8];
@@ -190,9 +212,7 @@ void ChaCha::seek(u64bit offset)
m_state[12] = load_le<u32bit>(out, 0);
m_state[13] += load_le<u32bit>(out, 1);
- chacha(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
+ m_position = offset % 64;
}
}
diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h
index f8f42e41d..e4d4ae60e 100644
--- a/src/lib/stream/chacha/chacha.h
+++ b/src/lib/stream/chacha/chacha.h
@@ -47,6 +47,14 @@ class BOTAN_DLL ChaCha final : public StreamCipher
private:
void key_schedule(const byte key[], size_t key_len) override;
+ void incr_state_counter(size_t howmany);
+
+ void chacha_x4(byte output[64*4], u32bit state[16], size_t rounds);
+
+#if defined(BOTAN_HAS_CHACHA_SSE2)
+ void chacha_sse2_x4(byte output[64*4], u32bit state[16], size_t rounds);
+#endif
+
size_t m_rounds;
secure_vector<u32bit> m_state;
secure_vector<byte> m_buffer;
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
new file mode 100644
index 000000000..e39b285b3
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -0,0 +1,259 @@
+/*
+* SSE2 ChaCha
+* (C) 2016 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/chacha.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+// SSE2 path: computes four consecutive 64-byte blocks into output and advances the 64-bit block counter in input[12..13] by 4
+void ChaCha::chacha_sse2_x4(byte output[64*4], u32bit input[16], size_t rounds)
+ {
+ BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
+
+ const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
+ __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+
+ __m128i input0 = _mm_loadu_si128(input_mm);
+ __m128i input1 = _mm_loadu_si128(input_mm + 1);
+ __m128i input2 = _mm_loadu_si128(input_mm + 2);
+ __m128i input3 = _mm_loadu_si128(input_mm + 3);
+
+ // TODO: try transposing, which would avoid the permutations each round
+
+#define mm_rotl(r, n) \
+ _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
+
+ __m128i r0_0 = input0;
+ __m128i r0_1 = input1;
+ __m128i r0_2 = input2;
+ __m128i r0_3 = input3;
+
+ __m128i r1_0 = input0;
+ __m128i r1_1 = input1;
+ __m128i r1_2 = input2;
+ const __m128i input3_plus1 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 1)); // 64-bit counter + 1
+ __m128i r1_3 = input3_plus1;
+
+ __m128i r2_0 = input0;
+ __m128i r2_1 = input1;
+ __m128i r2_2 = input2;
+ const __m128i input3_plus2 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 2)); // 64-bit counter + 2
+ __m128i r2_3 = input3_plus2;
+
+ __m128i r3_0 = input0;
+ __m128i r3_1 = input1;
+ __m128i r3_2 = input2;
+ const __m128i input3_plus3 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 3)); // 64-bit counter + 3
+ __m128i r3_3 = input3_plus3;
+
+ for(size_t r = 0; r != rounds / 2; ++r)
+ {
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 16);
+ r1_3 = mm_rotl(r1_3, 16);
+ r2_3 = mm_rotl(r2_3, 16);
+ r3_3 = mm_rotl(r3_3, 16);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 12);
+ r1_1 = mm_rotl(r1_1, 12);
+ r2_1 = mm_rotl(r2_1, 12);
+ r3_1 = mm_rotl(r3_1, 12);
+
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 8);
+ r1_3 = mm_rotl(r1_3, 8);
+ r2_3 = mm_rotl(r2_3, 8);
+ r3_3 = mm_rotl(r3_3, 8);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 7);
+ r1_1 = mm_rotl(r1_1, 7);
+ r2_1 = mm_rotl(r2_1, 7);
+ r3_1 = mm_rotl(r3_1, 7);
+
+ r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
+ r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 16);
+ r1_3 = mm_rotl(r1_3, 16);
+ r2_3 = mm_rotl(r2_3, 16);
+ r3_3 = mm_rotl(r3_3, 16);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 12);
+ r1_1 = mm_rotl(r1_1, 12);
+ r2_1 = mm_rotl(r2_1, 12);
+ r3_1 = mm_rotl(r3_1, 12);
+
+ r0_0 = _mm_add_epi32(r0_0, r0_1);
+ r1_0 = _mm_add_epi32(r1_0, r1_1);
+ r2_0 = _mm_add_epi32(r2_0, r2_1);
+ r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+ r0_3 = _mm_xor_si128(r0_3, r0_0);
+ r1_3 = _mm_xor_si128(r1_3, r1_0);
+ r2_3 = _mm_xor_si128(r2_3, r2_0);
+ r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+ r0_3 = mm_rotl(r0_3, 8);
+ r1_3 = mm_rotl(r1_3, 8);
+ r2_3 = mm_rotl(r2_3, 8);
+ r3_3 = mm_rotl(r3_3, 8);
+
+ r0_2 = _mm_add_epi32(r0_2, r0_3);
+ r1_2 = _mm_add_epi32(r1_2, r1_3);
+ r2_2 = _mm_add_epi32(r2_2, r2_3);
+ r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+ r0_1 = _mm_xor_si128(r0_1, r0_2);
+ r1_1 = _mm_xor_si128(r1_1, r1_2);
+ r2_1 = _mm_xor_si128(r2_1, r2_2);
+ r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+ r0_1 = mm_rotl(r0_1, 7);
+ r1_1 = mm_rotl(r1_1, 7);
+ r2_1 = mm_rotl(r2_1, 7);
+ r3_1 = mm_rotl(r3_1, 7);
+
+ r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+ r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
+ r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
+ }
+
+ r0_0 = _mm_add_epi32(r0_0, input0);
+ r0_1 = _mm_add_epi32(r0_1, input1);
+ r0_2 = _mm_add_epi32(r0_2, input2);
+ r0_3 = _mm_add_epi32(r0_3, input3);
+
+ r1_0 = _mm_add_epi32(r1_0, input0);
+ r1_1 = _mm_add_epi32(r1_1, input1);
+ r1_2 = _mm_add_epi32(r1_2, input2);
+ // Feed-forward is per 32-bit word: add the counter-adjusted input row, never a 64-bit add on mixed state
+ r1_3 = _mm_add_epi32(r1_3, input3_plus1);
+
+ r2_0 = _mm_add_epi32(r2_0, input0);
+ r2_1 = _mm_add_epi32(r2_1, input1);
+ r2_2 = _mm_add_epi32(r2_2, input2);
+ // Same as above, using the row this block actually started from (counter + 2)
+ r2_3 = _mm_add_epi32(r2_3, input3_plus2);
+
+ r3_0 = _mm_add_epi32(r3_0, input0);
+ r3_1 = _mm_add_epi32(r3_1, input1);
+ r3_2 = _mm_add_epi32(r3_2, input2);
+ // Same as above, using the row this block actually started from (counter + 3)
+ r3_3 = _mm_add_epi32(r3_3, input3_plus3);
+
+ _mm_storeu_si128(output_mm + 0, r0_0);
+ _mm_storeu_si128(output_mm + 1, r0_1);
+ _mm_storeu_si128(output_mm + 2, r0_2);
+ _mm_storeu_si128(output_mm + 3, r0_3);
+
+ _mm_storeu_si128(output_mm + 4, r1_0);
+ _mm_storeu_si128(output_mm + 5, r1_1);
+ _mm_storeu_si128(output_mm + 6, r1_2);
+ _mm_storeu_si128(output_mm + 7, r1_3);
+
+ _mm_storeu_si128(output_mm + 8, r2_0);
+ _mm_storeu_si128(output_mm + 9, r2_1);
+ _mm_storeu_si128(output_mm + 10, r2_2);
+ _mm_storeu_si128(output_mm + 11, r2_3);
+
+ _mm_storeu_si128(output_mm + 12, r3_0);
+ _mm_storeu_si128(output_mm + 13, r3_1);
+ _mm_storeu_si128(output_mm + 14, r3_2);
+ _mm_storeu_si128(output_mm + 15, r3_3);
+
+#undef mm_rotl
+
+ input[12] += 4;
+ if(input[12] < 4)
+ input[13]++;
+ }
+
+}
diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt
new file mode 100644
index 000000000..965479746
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/info.txt
@@ -0,0 +1,3 @@
+define CHACHA_SSE2 20160831
+
+need_isa sse2
diff --git a/src/tests/data/stream/chacha.vec b/src/tests/data/stream/chacha.vec
index 881513706..830684b2c 100644
--- a/src/tests/data/stream/chacha.vec
+++ b/src/tests/data/stream/chacha.vec
@@ -124,6 +124,14 @@ Nonce = 000000000000000000000002
In = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Out = C2C64D378CD536374AE204B9EF933FCD1A8B2288B3DFA49672AB765B54EE27C78A970E0E955C14F3A88E741B97C286F75F8FC299E8148362FA198A39531BED6D
+# Long output tests generated by DJB ref impl
+
+Key = 0000000000000000000000000000000000000000000000000000000000000000
+Nonce = 000000000000000000000000
+In
+Out = 
+
+
# Test seek offset
# Tests got from the original implementation of Daniel J. Bernstein
#