diff options
author | Jack Lloyd <[email protected]> | 2016-09-05 13:24:31 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2016-09-05 13:24:31 -0400 |
commit | 5178ba78da3a7cbf63165504886e27f3184862cf (patch) | |
tree | 5bc8c7beff516d9e79966eb0b04f8afe41ce958f /src | |
parent | 743320d7007cc66f3ebeebeaafb7b42f1a0362ff (diff) | |
parent | ac3d1ea17c0dfc0c279a6715a110c738ee615b3f (diff) |
Merge GH #616 ChaCha SSE2 optimizations
Diffstat (limited to 'src')
-rw-r--r-- | src/cli/speed.cpp | 5 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.cpp | 160 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.h | 8 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 259 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/info.txt | 3 | ||||
-rw-r--r-- | src/tests/data/stream/chacha.vec | 8 |
6 files changed, 371 insertions, 72 deletions
diff --git a/src/cli/speed.cpp b/src/cli/speed.cpp index 222a98d3f..c1f3a91e8 100644 --- a/src/cli/speed.cpp +++ b/src/cli/speed.cpp @@ -521,10 +521,11 @@ class Speed final : public Command Timer encrypt_timer(cipher.name(), provider, "encrypt", buffer.size()); + const Botan::SymmetricKey key(rng(), cipher.maximum_keylength()); + cipher.set_key(key); + while(encrypt_timer.under(runtime)) { - const Botan::SymmetricKey key(rng(), cipher.maximum_keylength()); - cipher.set_key(key); encrypt_timer.run([&] { cipher.encipher(buffer); }); } diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp index 40da93029..c35363112 100644 --- a/src/lib/stream/chacha/chacha.cpp +++ b/src/lib/stream/chacha/chacha.cpp @@ -7,6 +7,7 @@ #include <botan/chacha.h> #include <botan/loadstor.h> +#include <botan/cpuid.h> namespace Botan { @@ -16,58 +17,87 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds) throw Invalid_Argument("ChaCha only supports 8, 12 or 20 rounds"); } -namespace { - -void chacha(byte output[64], const u32bit input[16], size_t rounds) +//static +void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds) { BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3], - x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7], - x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11], - x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15]; - -#define CHACHA_QUARTER_ROUND(a, b, c, d) \ - do { \ - a += b; d ^= a; d = rotate_left(d, 16); \ - c += d; b ^= c; b = rotate_left(b, 12); \ - a += b; d ^= a; d = rotate_left(d, 8); \ - c += d; b ^= c; b = rotate_left(b, 7); \ - } while(0) - - for(size_t i = 0; i != rounds / 2; ++i) +#if defined(BOTAN_HAS_CHACHA_SSE2) + if(CPUID::has_sse2()) { - CHACHA_QUARTER_ROUND(x00, x04, x08, x12); - CHACHA_QUARTER_ROUND(x01, x05, x09, x13); - CHACHA_QUARTER_ROUND(x02, x06, x10, x14); - CHACHA_QUARTER_ROUND(x03, 
x07, x11, x15); - - CHACHA_QUARTER_ROUND(x00, x05, x10, x15); - CHACHA_QUARTER_ROUND(x01, x06, x11, x12); - CHACHA_QUARTER_ROUND(x02, x07, x08, x13); - CHACHA_QUARTER_ROUND(x03, x04, x09, x14); + return ChaCha::chacha_sse2_x4(output, input, rounds); } +#endif + + // TODO interleave rounds + for(size_t i = 0; i != 4; ++i) + { + u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3], + x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7], + x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11], + x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15]; + +#define CHACHA_QUARTER_ROUND(a, b, c, d) \ + do { \ + a += b; d ^= a; d = rotate_left(d, 16); \ + c += d; b ^= c; b = rotate_left(b, 12); \ + a += b; d ^= a; d = rotate_left(d, 8); \ + c += d; b ^= c; b = rotate_left(b, 7); \ + } while(0) + + for(size_t i = 0; i != rounds / 2; ++i) + { + CHACHA_QUARTER_ROUND(x00, x04, x08, x12); + CHACHA_QUARTER_ROUND(x01, x05, x09, x13); + CHACHA_QUARTER_ROUND(x02, x06, x10, x14); + CHACHA_QUARTER_ROUND(x03, x07, x11, x15); + + CHACHA_QUARTER_ROUND(x00, x05, x10, x15); + CHACHA_QUARTER_ROUND(x01, x06, x11, x12); + CHACHA_QUARTER_ROUND(x02, x07, x08, x13); + CHACHA_QUARTER_ROUND(x03, x04, x09, x14); + } #undef CHACHA_QUARTER_ROUND - store_le(x00 + input[ 0], output + 4 * 0); - store_le(x01 + input[ 1], output + 4 * 1); - store_le(x02 + input[ 2], output + 4 * 2); - store_le(x03 + input[ 3], output + 4 * 3); - store_le(x04 + input[ 4], output + 4 * 4); - store_le(x05 + input[ 5], output + 4 * 5); - store_le(x06 + input[ 6], output + 4 * 6); - store_le(x07 + input[ 7], output + 4 * 7); - store_le(x08 + input[ 8], output + 4 * 8); - store_le(x09 + input[ 9], output + 4 * 9); - store_le(x10 + input[10], output + 4 * 10); - store_le(x11 + input[11], output + 4 * 11); - store_le(x12 + input[12], output + 4 * 12); - store_le(x13 + input[13], output + 4 * 13); - store_le(x14 + input[14], output + 4 * 14); - store_le(x15 + input[15], 
output + 4 * 15); + x00 += input[0]; + x01 += input[1]; + x02 += input[2]; + x03 += input[3]; + x04 += input[4]; + x05 += input[5]; + x06 += input[6]; + x07 += input[7]; + x08 += input[8]; + x09 += input[9]; + x10 += input[10]; + x11 += input[11]; + x12 += input[12]; + x13 += input[13]; + x14 += input[14]; + x15 += input[15]; + + store_le(x00, output + 64 * i + 4 * 0); + store_le(x01, output + 64 * i + 4 * 1); + store_le(x02, output + 64 * i + 4 * 2); + store_le(x03, output + 64 * i + 4 * 3); + store_le(x04, output + 64 * i + 4 * 4); + store_le(x05, output + 64 * i + 4 * 5); + store_le(x06, output + 64 * i + 4 * 6); + store_le(x07, output + 64 * i + 4 * 7); + store_le(x08, output + 64 * i + 4 * 8); + store_le(x09, output + 64 * i + 4 * 9); + store_le(x10, output + 64 * i + 4 * 10); + store_le(x11, output + 64 * i + 4 * 11); + store_le(x12, output + 64 * i + 4 * 12); + store_le(x13, output + 64 * i + 4 * 13); + store_le(x14, output + 64 * i + 4 * 14); + store_le(x15, output + 64 * i + 4 * 15); + + input[12]++; + input[13] += (input[12] == 0); // carry into the high counter word when the low word wraps + } } -} /* * Combine cipher stream with message @@ -80,11 +110,7 @@ void ChaCha::cipher(const byte in[], byte out[], size_t length) length -= (m_buffer.size() - m_position); in += (m_buffer.size() - m_position); out += (m_buffer.size() - m_position); - chacha(m_buffer.data(), m_state.data(), m_rounds); - - ++m_state[12]; - m_state[13] += (m_state[12] == 0); - + chacha_x4(m_buffer.data(), m_state.data(), m_rounds); m_position = 0; } @@ -106,8 +132,12 @@ void ChaCha::key_schedule(const byte key[], size_t length) const u32bit* CONSTANTS = (length == 16) ? TAU : SIGMA; + // Repeat the key if 128 bits + const byte* key2 = (length == 32) ? 
key + 16 : key; + + m_position = 0; m_state.resize(16); - m_buffer.resize(64); + m_buffer.resize(4*64); m_state[0] = CONSTANTS[0]; m_state[1] = CONSTANTS[1]; @@ -119,16 +149,12 @@ void ChaCha::key_schedule(const byte key[], size_t length) m_state[6] = load_le<u32bit>(key, 2); m_state[7] = load_le<u32bit>(key, 3); - if(length == 32) - key += 16; - - m_state[8] = load_le<u32bit>(key, 0); - m_state[9] = load_le<u32bit>(key, 1); - m_state[10] = load_le<u32bit>(key, 2); - m_state[11] = load_le<u32bit>(key, 3); - - m_position = 0; + m_state[8] = load_le<u32bit>(key2, 0); + m_state[9] = load_le<u32bit>(key2, 1); + m_state[10] = load_le<u32bit>(key2, 2); + m_state[11] = load_le<u32bit>(key2, 3); + // Default all-zero IV const byte ZERO[8] = { 0 }; set_iv(ZERO, sizeof(ZERO)); } @@ -153,10 +179,7 @@ void ChaCha::set_iv(const byte iv[], size_t length) m_state[15] = load_le<u32bit>(iv, 2); } - chacha(m_buffer.data(), m_state.data(), m_rounds); - ++m_state[12]; - m_state[13] += (m_state[12] == 0); - + chacha_x4(m_buffer.data(), m_state.data(), m_rounds); m_position = 0; } @@ -176,12 +199,11 @@ void ChaCha::seek(u64bit offset) { if (m_state.size() == 0 && m_buffer.size() == 0) { - throw Invalid_State("You have to setup the stream cipher (key and iv)"); + throw Invalid_State("You have to setup the stream cipher (key and iv)"); } - m_position = offset % m_buffer.size(); - - u64bit counter = offset / m_buffer.size(); + // Find the block offset + u64bit counter = offset / 64; byte out[8]; @@ -190,9 +212,7 @@ void ChaCha::seek(u64bit offset) m_state[12] = load_le<u32bit>(out, 0); m_state[13] += load_le<u32bit>(out, 1); - chacha(m_buffer.data(), m_state.data(), m_rounds); - - ++m_state[12]; - m_state[13] += (m_state[12] == 0); + chacha_x4(m_buffer.data(), m_state.data(), m_rounds); + m_position = offset % 64; } } diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h index f8f42e41d..e4d4ae60e 100644 --- a/src/lib/stream/chacha/chacha.h +++ 
b/src/lib/stream/chacha/chacha.h @@ -47,6 +47,14 @@ class BOTAN_DLL ChaCha final : public StreamCipher private: void key_schedule(const byte key[], size_t key_len) override; + void incr_state_counter(size_t howmany); + + void chacha_x4(byte output[64*4], u32bit state[16], size_t rounds); + +#if defined(BOTAN_HAS_CHACHA_SSE2) + void chacha_sse2_x4(byte output[64*4], u32bit state[16], size_t rounds); +#endif + size_t m_rounds; secure_vector<u32bit> m_state; secure_vector<byte> m_buffer; diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp new file mode 100644 index 000000000..e39b285b3 --- /dev/null +++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp @@ -0,0 +1,259 @@ +/* +* SSE2 ChaCha +* (C) 2016 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/chacha.h> +#include <emmintrin.h> + +namespace Botan { + +//static +void ChaCha::chacha_sse2_x4(byte output[64], u32bit input[16], size_t rounds) + { + BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); + + const __m128i* input_mm = reinterpret_cast<const __m128i*>(input); + __m128i* output_mm = reinterpret_cast<__m128i*>(output); + + __m128i input0 = _mm_loadu_si128(input_mm); + __m128i input1 = _mm_loadu_si128(input_mm + 1); + __m128i input2 = _mm_loadu_si128(input_mm + 2); + __m128i input3 = _mm_loadu_si128(input_mm + 3); + + // TODO: try transposing, which would avoid the permutations each round + +#define mm_rotl(r, n) \ + _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n)) + + __m128i r0_0 = input0; + __m128i r0_1 = input1; + __m128i r0_2 = input2; + __m128i r0_3 = input3; + + __m128i r1_0 = input0; + __m128i r1_1 = input1; + __m128i r1_2 = input2; + __m128i r1_3 = input3; + r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); + + __m128i r2_0 = input0; + __m128i r2_1 = input1; + __m128i r2_2 = input2; + __m128i r2_3 = input3; + r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); + + 
__m128i r3_0 = input0; + __m128i r3_1 = input1; + __m128i r3_2 = input2; + __m128i r3_3 = input3; + r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); + + for(size_t r = 0; r != rounds / 2; ++r) + { + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = mm_rotl(r0_3, 16); + r1_3 = mm_rotl(r1_3, 16); + r2_3 = mm_rotl(r2_3, 16); + r3_3 = mm_rotl(r3_3, 16); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = mm_rotl(r0_1, 12); + r1_1 = mm_rotl(r1_1, 12); + r2_1 = mm_rotl(r2_1, 12); + r3_1 = mm_rotl(r3_1, 12); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = mm_rotl(r0_3, 8); + r1_3 = mm_rotl(r1_3, 8); + r2_3 = mm_rotl(r2_3, 8); + r3_3 = mm_rotl(r3_3, 8); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = mm_rotl(r0_1, 7); + r1_1 = mm_rotl(r1_1, 7); + r2_1 = mm_rotl(r2_1, 7); + r3_1 = mm_rotl(r3_1, 7); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + 
r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = mm_rotl(r0_3, 16); + r1_3 = mm_rotl(r1_3, 16); + r2_3 = mm_rotl(r2_3, 16); + r3_3 = mm_rotl(r3_3, 16); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = mm_rotl(r0_1, 12); + r1_1 = mm_rotl(r1_1, 12); + r2_1 = mm_rotl(r2_1, 12); + r3_1 = mm_rotl(r3_1, 12); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = mm_rotl(r0_3, 8); + r1_3 = mm_rotl(r1_3, 8); + r2_3 = mm_rotl(r2_3, 8); + r3_3 = mm_rotl(r3_3, 8); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, 
r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = mm_rotl(r0_1, 7); + r1_1 = mm_rotl(r1_1, 7); + r2_1 = mm_rotl(r2_1, 7); + r3_1 = mm_rotl(r3_1, 7); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + r0_0 = _mm_add_epi32(r0_0, input0); + r0_1 = _mm_add_epi32(r0_1, input1); + r0_2 = _mm_add_epi32(r0_2, input2); + r0_3 = _mm_add_epi32(r0_3, input3); + + r1_0 = _mm_add_epi32(r1_0, input0); + r1_1 = _mm_add_epi32(r1_1, input1); + r1_2 = _mm_add_epi32(r1_2, input2); + const __m128i ctr1 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 1)); // feed-forward must add block 1's own initial state (64-bit counter + 1) + r1_3 = _mm_add_epi32(r1_3, ctr1); + + r2_0 = _mm_add_epi32(r2_0, input0); + r2_1 = _mm_add_epi32(r2_1, input1); + r2_2 = _mm_add_epi32(r2_2, input2); + const __m128i ctr2 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 2)); + r2_3 = _mm_add_epi32(r2_3, ctr2); + + r3_0 = _mm_add_epi32(r3_0, input0); + r3_1 = _mm_add_epi32(r3_1, input1); + r3_2 = _mm_add_epi32(r3_2, input2); + const __m128i ctr3 = _mm_add_epi64(input3, _mm_set_epi32(0, 0, 0, 3)); + r3_3 = _mm_add_epi32(r3_3, ctr3); + + _mm_storeu_si128(output_mm + 0, r0_0); + _mm_storeu_si128(output_mm + 1, r0_1); + _mm_storeu_si128(output_mm + 2, r0_2); + _mm_storeu_si128(output_mm + 3, r0_3); + + _mm_storeu_si128(output_mm + 4, r1_0); + _mm_storeu_si128(output_mm + 5, r1_1); + _mm_storeu_si128(output_mm + 6, r1_2); + _mm_storeu_si128(output_mm + 7, r1_3); + + 
_mm_storeu_si128(output_mm + 8, r2_0); + _mm_storeu_si128(output_mm + 9, r2_1); + _mm_storeu_si128(output_mm + 10, r2_2); + _mm_storeu_si128(output_mm + 11, r2_3); + + _mm_storeu_si128(output_mm + 12, r3_0); + _mm_storeu_si128(output_mm + 13, r3_1); + _mm_storeu_si128(output_mm + 14, r3_2); + _mm_storeu_si128(output_mm + 15, r3_3); + +#undef mm_rotl + + input[12] += 4; + if(input[12] < 4) + input[13]++; + } + +} diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt new file mode 100644 index 000000000..965479746 --- /dev/null +++ b/src/lib/stream/chacha/chacha_sse2/info.txt @@ -0,0 +1,3 @@ +define CHACHA_SSE2 20160831 + +need_isa sse2 diff --git a/src/tests/data/stream/chacha.vec b/src/tests/data/stream/chacha.vec index 881513706..830684b2c 100644 --- a/src/tests/data/stream/chacha.vec +++ b/src/tests/data/stream/chacha.vec @@ -124,6 +124,14 @@ Nonce = 000000000000000000000002 In = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 Out = C2C64D378CD536374AE204B9EF933FCD1A8B2288B3DFA49672AB765B54EE27C78A970E0E955C14F3A88E741B97C286F75F8FC299E8148362FA198A39531BED6D +# Long output tests generated by DJB ref impl + +Key = 0000000000000000000000000000000000000000000000000000000000000000 +Nonce = 000000000000000000000000 +In = 
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +Out = 76B8E0ADA0F13D90405D6AE55386BD28BDD219B8A08DED1AA836EFCC8B770DC7DA41597C5157488D7724E03FB8D84A376A43B8F41518A11CC387B669B2EE65869F07E7BE5551387A98BA977C732D080DCB0F29A048E3656912C6533E32EE7AED29B721769CE64E43D57133B074D839D531ED1F28510AFB45ACE10A1F4B794D6F2D09A0E663266CE1AE7ED1081968A0758E718E997BD362C6B0C34634A9A0B35D012737681F7B5D0F281E3AFDE458BC1E73D2D313C9CF94C05FF3716240A248F21320A058D7B3566BD520DAAA3ED2BF0AC5B8B120FB852773C3639734B45C91A42DD4CB83F8840D2EEDB158131062AC3F1F2CF8FF6DCD1856E86A1E6C3167167EE5A688742B47C5ADFB59D4DF76FD1DB1E51EE03B1CA9F82ACA173EDB8B7293474EBE980F904D10C916442B4783A0E984860CB6C957B39C38ED8F51CFFAA68A4DE01025A39C504546B9DC1406A7EB28151E5150D7B204BAA719D4F091021217DB5CF1B5C84C4FA71A879610A1A695AC527C5B56774A6B8A21AAE88685868E094CF29EF4090AF7A90CC07E8817AA528763797D3C332B67CA4BC110642C2151EC47EE84CB8C42D85F10E2A8CB18C3B7335F26E8C39A12B1BCC1707177B76138732EEDAAB74DA1410FC055EA068C99E9260ACBE337CF5D3E00E5B3230FFEDB0B990787D0C70E0BFE4198EA6758DD5A61FB5FEC2DF981F31BEFE153F81D17161784DB1C8822D53CD1EE7DB532364828BDF404B040A8DCC522F3D3D99AEC4B8057EDB8500931A2C42D2F0C570847100B5754DAFC5FBDB894BBEF1A2DE1A07F8BA0C4B919301066EDBC056B7B481E7A0C46297BBB589D9DA5B675A6723E152E5E63A4CE034E9E83E58A013AF0E7352FB7908514E3B3D1040D0BB963B3954B636B5FD4BF6D0AADBAF8157D062ACB2418C176A475511B35C3F6218A5668EA5BC6F54B8782F8B340F00AC1BEBA5E62CD632A7CE7809C725608ACA5EFBF7C41F237643F06C0997207171DE867F9D697BF5EA6011ABCCE6C8CDB211394D2C02DD0FB60DB5A2C17AC3DC85878A90BED3809DBB96EAA5426FC8EAE0D2D65C42A479F088648BE2DC801D82A366FDDC0EF234263C0B6417D5F9DA41817B88D68E5E67195C5C1EE3095E821F22524B20BE41CEB590412E41DC648843FA9BFEC7A3DCF61AB0541573316D3FA8151629303FE9741562ED065DB4EBC0050EF558364AE81124A28F5C01313232FBC496DFD8A2568657B686D7214382A1A00903017DDA969878442BA5AFFF6613F55
3CBB233CE46D9AEE93A7876CF5E9E82912B18CADF0B34327B2E0427ECF66B7CEB7C0918DC47BDFF12A062ADF07133009CE7A5E5C917E0168306109B7CB49653A6D2CAEF005DE783A9A9BFE05381ED1348D94EC65886F9C0B619C52C5533800B16C836172B95182DBC5EEC042B89E22F11A085B739A3611CD8D836018C4FFF0B86C02ED662D2D2522647A1F09A7B2F9EEA56E7E20B1F06CCDD9CEC37E3B2D20812DF369978636C22646603675804104745D2997E28DF5D8242AAD19C8120CA4142FB6019FCCECF9FADB04ADE03B341E3FC77201B3DC957A8097AB2F615AFF142AB753811D5F32E75BC8825B456555F3D179FFABCF35F6AE61365851F3F681A2E86E8078B064976646186394CB9064767750DAD4E336B8F1D20FE2C13C6248D3D73D4D66D9C8587AC68A7976A3BBB8B5808320607400DBDB1918E3D3B90CFC38C4DDFADE990A213D208FBF7898334F4DEED7E5830FD266751315435AE19BB94F4D3DC92652F243DD1F96F3595AB473D2356D8FA8F6D64CC4F64B12CA99ECDD1962572E6ADD609D9C619AAB678B3FC298BC2F0F81FEB4F0D3EBAD7E850A8BCB52CA467E649DE2DB913BFDA001294C49DC369F7D14CC25C5FA65D4D5AF6A436D22BD2839BE23DD3C57825033FECDCE2DED6C511DBEAF4DF2B4CBB7AF8215BB48A550F57D02750E599298F512B1EC1829722FC10A5ACF9537E392A728455905D3AB4837DECE4B63FDFD5DD07A2B76A8C82566DF1A2167DAE5E125B6AA0E76B9D99CA84664F50EEEA54E449F0E587039137F57543D89205483141C933166B61990A706ACA07F467D22BC34C6552F5BBA91CB1FC21DB51D03DFFF6523A5E1B4285D54C47660EDA1B290E4087B30651B542305A714E98A8233577D2AFB383E402F6B9FD214B194C738886BD2289CC5F997951910994B0A6104092FBC9B385639343CF26C9FAF845E7A98CB1F2C9306E8200185D95DE059F83AD17C4B97F8C62CF6C347DC6EB5F2B1F4BF2DD328130D4500CA39BEBA2D4281A3D8CEB4CB1ECDE378B20029FB6A4C543312E41013915C57016E5DA681944CC277F9C7E75F4A654AB2E5DC646ADA242B6223AACC63674F9702146723360811ADBDF2BB938B595BF4C688A8A844130D9DA3F0EFE3650C2283640B342F8922FB6DD10B8BBE35C7AEBEBA416CB0180FB7D2B171149018F8D880463AC26202C2B72F9A7CF83A917AD26183F8E74CD418E3B63459F7AD59849EE43CAC6DF3BB63FCEEC1ABE8E9E0B64B233A43AAC54F9BA0998D2219B3BACA111940D524B7CF94677D6C557750FA4DB9E1077EEDB5BA6E33C104AE25443C86BF1583353ADDF6FDDD19A4FF491188E3D4878769611B36427C8F4C705CF42338475C3185C123919B79B3A4887243B924509C9A4E7A3FFF0517021E51642D
9B4526C28A0CF86FB254BE7EAB18701CA5919B754EC2506ECCC087AC6141B4C3A661A3D1A89E0D4DD2DF52CAA5B3402D0026B3C643FA7126E8ED101A94188A048B34AB61E1182D6BE76E2E9E6ACF401443ED0D997DD5AE67346CB1E189791102900225E6B955CD7C9E39FC7255021045FE7ECD40E2C68486A4C2FBCDBC53E847790DAFE5B2CBDCA09BF09DE327076C79F2A339A9F942DFA372B41A390EABBBF296383D438466AE6105B5058117B8E406DCAA62A98AC624D30BC8773DE643CCE7578E8D5C57C5718711421E6D04A182F8714E192EDF3935CB2E0380E10C77AA6583832DEB64CB41DC401541504E0420D06379E4830F06125018DB3810A684C4E888B3B88829CF97EC67FC8549703F9EA5D6B8F67CE9E060F765532C323DB034EC700DB819936FBE6F749FD37CE927663F439498C98C5104D69AE9CD8B04444A471F95390873346858625420ED783203F8DB371D766586137459505525CB3EE59A7FD8AC3C1DB3F5F4E0DAB62B43D1C4CD813D998ED83637F922F884A7584835BCDC03E9F1802B449DFA2D249D9ED7F2F9129815CD8D4EC4169EC51EAC449DDFA5BC0F232D47323D4C48AB558576160103820D485877CC2BBAB641D21D94E67B32CEFB4E9CE5DAC84D03ED3228EA3CDDDD33F3915B9E21B435BBED927A539B556FFCDD4C9822FD1CE81ACCA79B6F7050B0C01A3A317F066A1A89C81E4DF8C7EE2E042203BE8C290CF8605C193471953F8FE6055AC546670A4A075F70909246C3E3B92B39CF2F35AB49FE6FDA67072A6E1B82364820477FD522A746EAC07E0398E873790486ABEBB50EC59A9E4129B7CEA87C5182EB43BCDF096EF740EF67482E98165A4EC64EBE9D57C8D8C16CEAA81F3203DFF26B3D5BF01EDC0B0458B6B47846C326C75A822FC8B42774A3B201EF941484631406CBBEAC961CCF42860EBAC578703B485844BB9E01B2D6C2DF72B62B43BADE982B0BDD0501230FC76B820D2F5BD259EC791B33225E06AFC2D996C90E756DBD4257D53EA6D6FF2371330035B57BF54C9DC4FA95D61536AAC11CDB08A664EFB55BC91A4C6F7FBE3EFA8C621FD9F2CB343898979E7BCBD55B55217143493B6DE415A85D681A2405113F3ADA3EDAEE6EB0E78572D2BAF9C43E3FC39B93081CF2FB3F8879E810F417B733927525ACB6D026F6EC46395620547D057A539A391DE7C6F4B7095911C2D3AE075AD4F2C0C96D9F70E48A42CDCDAE542BAE833EB4A976D4F98410B4A3D77857762D1527EC6714A040BAAEC3BEC41BF9CFF00E1CF81CE61E95D97792D7C0DB7A88545F10D9B0A5940457018817725DA257766906FFBC6172B9C4D2D32A14D00C0D1D01E15280074A4A9FD2D21393F078EF55B16CFEA5327993263BFFE8E99E56837B2763ABD221ED85D83F9187AF
8B9E928F00DEFF423FFFDADB786E6678A59AF305CDC02546D0F8AB4681ACC1F00069B0C47BBC9F13D12FD9411F8DF532096D53E4B7861839E602FC5DFA0D0B72232DD81D2B0E4B660A7EBA353DA27E66CEAF2D6C7734925247281866A12D67752A1EDAADD1EA59E4E86E2E85A81A573CD68F6DFB526558D81A8F488F261F355DDAC23F6CAF07D27FDA71D8F3968D4CEEDA89A09DCFDD00C17FA6DB3658CC7AB02C0E5F44B1F526A7DB9269E4DCD1D11B8421C204C07A5DE46E48769579718C69532F1671E552B92EF1451FFBC7A2F412696C6D67EF071C988ACF61F39319CA02DC853247BE1F7F07B3BBD68C901EF36D46868F65D0FA1D1755EFF6C2802212908250207AB65FCE827F0A82F3140DDC692CB5742B3133C541E0FE17718B546AD1FC8CB8A3A5AF69A825A84343C378D7B54F917057A8D026D4A59931FF9E24AB95BCE2BEBDB3A4286E000A4C47E74CF82925DEA07686DD20DE228828705D90638661173E703E712D69B043DCEBAFDC53F132BE3D04CE50D70B80DD118DB00E45DFE953DB87525880BA9F7ABB0FC481A5F3ACFA363AA2153391AEF0DBF680512A0FA37A6C8293865CA95015DEDBF6A21992CE1FEB5F499359E288A3255EF3FBAEA5B01440BD787251CAF362326C7FFAD20086AA7FCF35E95BBF398DB75735E087E0863E016246703946C4E88BCA6FC21446146BD342AAE88AD4249F9A38CA3E4ED0A88AA6FD746F578E5B414CE0F2CE5EC76F87FA3B6321102EF7110B7E6D889B2B85BB11FCFD4AB2B3BD5F92B8A2F66166F0695C0B13F32F97EFAA0CDE4F28E1674BF120E03FA15980873A1C6AB674C085303FDD74832CC1DF5EF787AE11C6E6A9302C1488F36996C1703670786C05324F99D9D2CDA1624DB95B8EAB6F7E10AC5BC985D6536CF667307A577A2555888BD9302CFE704CF72E59FD28B6DAEA343A83531A10A732D65CE93F523A03E5E791EFD5AC502BB0A1F756E93208746240BC7BB1C085728CCC8150D8EA74B33AEBC59567E65B7E2BD83699F607412448D202D948BB111BADD456D68086FF9A5906EA3B2CDA4111D3638391F7A7B153EEA77AB47215D6FE13B350F59F884C6E31AC087239D9145B816424CBA2C8BCB7B3ED7E19638089D91E5C9136D2AEFC8DA165284B42229A70346296A7484648DAAFB9B88994D8823EFBCE9E029ACE51706FC1E6E194B7D8906BB83BE681A96D50F4A66F5AF24AFF5007AA2052277D75E1C6FCB719AE789587E73BFB11029B2AE9E380CD7B4FBD78F0C5318EF61802ECBFE5D91BA28184E96963366BE3D5B063B5B664002AB0682E5820E3F9D30AFD461B3046CFC86F29605458AE94054CC2967CA77F102F97BAE3454BC190726AF88CFFD5F0F05E18D31159EF2A9D9800F48D1B6719617474F1B35943157958
22976613CB63A4D6996505F4A4C57F3454EC101F5303F5B84AA8236843FC51D63D8445AD5C9B4F4E15AA0395695CDE980A6D48936146C519FB84AB775647E2CF809135D10B12F7E632654C679F92988B79021746C867A8E05102BB98F8FE7D3D7C005F690D9CA7C6A175FF26ED82B873D63DF4CBEA0218BAD0CB9A8C4404526F9806E2D4FBF37E84B756561C67C3A5732966DC4F0701063EAA7D3A52A2B5E4A8C9ACDB30EE8B9F5125F76BCF7B22DB897DE9A8A9560C54118F31DCCF49E87527D188FCB8 + + # Test seek offset # Tests got from the original implementation of Daniel J. Bernstein # |