author    Jack Lloyd <[email protected]>    2016-09-01 13:19:12 -0400
committer Jack Lloyd <[email protected]>    2016-09-01 13:20:05 -0400
commit    fc4b34d9d23c1afedaa71835c7af8f397c51c56d (patch)
tree      d933804ebcebfa5689aa0f41b012596615f7ec5b /src
parent    e358acf9e3fd74e7dc307a203977652ca3a9a3c9 (diff)
4x interleaved SSE2
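For context: each ChaCha round applies the standard quarter-round (32-bit addition, xor, and rotations by 16, 12, 8, and 7) across the rows of the 4x4 state, and the SSE2 code performs each of those steps on a whole 128-bit row at once. This patch additionally interleaves four independent blocks, so the four copies of every add/xor/rotate have no data dependencies on each other and can overlap in the pipeline. A minimal scalar sketch of the quarter-round being vectorized, for reference only (rotl32 and quarter_round are illustrative names, not from this commit):

#include <cstdint>

// Rotate a 32-bit word left by n bits (n in 1..31)
static inline uint32_t rotl32(uint32_t x, int n)
   {
   return (x << n) | (x >> (32 - n));
   }

// One ChaCha quarter-round; the SSE2 code runs this on whole
// 128-bit rows, and (after this patch) for four blocks at a time.
static inline void quarter_round(uint32_t& a, uint32_t& b,
                                 uint32_t& c, uint32_t& d)
   {
   a += b; d ^= a; d = rotl32(d, 16);
   c += d; b ^= c; b = rotl32(b, 12);
   a += b; d ^= a; d = rotl32(d, 8);
   c += d; b ^= c; b = rotl32(b, 7);
   }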
Diffstat (limited to 'src')
-rw-r--r--  src/cli/speed.cpp                                  |   5
-rw-r--r--  src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp  | 292
2 files changed, 228 insertions, 69 deletions
diff --git a/src/cli/speed.cpp b/src/cli/speed.cpp
index 222a98d3f..c1f3a91e8 100644
--- a/src/cli/speed.cpp
+++ b/src/cli/speed.cpp
@@ -521,10 +521,11 @@ class Speed final : public Command
          Timer encrypt_timer(cipher.name(), provider, "encrypt", buffer.size());
 
+         const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
+         cipher.set_key(key);
+
          while(encrypt_timer.under(runtime))
             {
-            const Botan::SymmetricKey key(rng(), cipher.maximum_keylength());
-            cipher.set_key(key);
             encrypt_timer.run([&] { cipher.encipher(buffer); });
             }
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
index 34376d84c..8c00ce133 100644
--- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -16,81 +16,239 @@ void ChaCha::chacha_sse2_x4(byte output[64], u32bit input[16], size_t rounds)
    BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
 
    const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
+   __m128i* output_mm = reinterpret_cast<__m128i*>(output);
 
-   const __m128i input0 = _mm_loadu_si128(input_mm);
-   const __m128i input1 = _mm_loadu_si128(input_mm + 1);
-   const __m128i input2 = _mm_loadu_si128(input_mm + 2);
-   const __m128i input3 = _mm_loadu_si128(input_mm + 3);
-   // TODO: interleave!
+   __m128i input0 = _mm_loadu_si128(input_mm);
+   __m128i input1 = _mm_loadu_si128(input_mm + 1);
+   __m128i input2 = _mm_loadu_si128(input_mm + 2);
+   __m128i input3 = _mm_loadu_si128(input_mm + 3);
+
+   // TODO: try transposing, which would avoid the permutations each round
 
 #define mm_rotl(r, n) \
    _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
 
-   for(size_t i = 0; i != 4; ++i)
+   __m128i r0_0 = input0;
+   __m128i r0_1 = input1;
+   __m128i r0_2 = input2;
+   __m128i r0_3 = input3;
+
+   __m128i r1_0 = input0;
+   __m128i r1_1 = input1;
+   __m128i r1_2 = input2;
+   __m128i r1_3 = input3;
+   r1_3 = _mm_add_epi64(r0_3, _mm_set_epi64x(0, 1));
+
+   __m128i r2_0 = input0;
+   __m128i r2_1 = input1;
+   __m128i r2_2 = input2;
+   __m128i r2_3 = input3;
+   r2_3 = _mm_add_epi64(r0_3, _mm_set_epi64x(0, 2));
+
+   __m128i r3_0 = input0;
+   __m128i r3_1 = input1;
+   __m128i r3_2 = input2;
+   __m128i r3_3 = input3;
+   r3_3 = _mm_add_epi64(r0_3, _mm_set_epi64x(0, 3));
+
+   for(size_t r = 0; r != rounds / 2; ++r)
       {
-      __m128i r0 = input0;
-      __m128i r1 = input1;
-      __m128i r2 = input2;
-      __m128i r3 = input3;
-
-      r3 = _mm_add_epi64(r3, _mm_set_epi64x(0, i));
-
-      for(size_t r = 0; r != rounds / 2; ++r)
-         {
-         r0 = _mm_add_epi32(r0, r1);
-         r3 = _mm_xor_si128(r3, r0);
-         r3 = mm_rotl(r3, 16);
-
-         r2 = _mm_add_epi32(r2, r3);
-         r1 = _mm_xor_si128(r1, r2);
-         r1 = mm_rotl(r1, 12);
-
-         r0 = _mm_add_epi32(r0, r1);
-         r3 = _mm_xor_si128(r3, r0);
-         r3 = mm_rotl(r3, 8);
-
-         r2 = _mm_add_epi32(r2, r3);
-         r1 = _mm_xor_si128(r1, r2);
-         r1 = mm_rotl(r1, 7);
-
-         r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
-         r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
-         r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
-
-         r0 = _mm_add_epi32(r0, r1);
-         r3 = _mm_xor_si128(r3, r0);
-         r3 = mm_rotl(r3, 16);
-
-         r2 = _mm_add_epi32(r2, r3);
-         r1 = _mm_xor_si128(r1, r2);
-         r1 = mm_rotl(r1, 12);
-
-         r0 = _mm_add_epi32(r0, r1);
-         r3 = _mm_xor_si128(r3, r0);
-         r3 = mm_rotl(r3, 8);
-
-         r2 = _mm_add_epi32(r2, r3);
-         r1 = _mm_xor_si128(r1, r2);
-         r1 = mm_rotl(r1, 7);
-
-         r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
-         r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
-         r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
-         }
-
-      r0 = _mm_add_epi32(r0, input0);
-      r1 = _mm_add_epi32(r1, input1);
-      r2 = _mm_add_epi32(r2, input2);
-      r3 = _mm_add_epi32(r3, input3);
-      r3 = _mm_add_epi64(r3, _mm_set_epi64x(0, i));
-
-      __m128i* output_mm = reinterpret_cast<__m128i*>(output);
-      _mm_storeu_si128(output_mm + 4*i, r0);
-      _mm_storeu_si128(output_mm + 4*i + 1, r1);
-      _mm_storeu_si128(output_mm + 4*i + 2, r2);
-      _mm_storeu_si128(output_mm + 4*i + 3, r3);
+      r0_0 = _mm_add_epi32(r0_0, r0_1);
+      r1_0 = _mm_add_epi32(r1_0, r1_1);
+      r2_0 = _mm_add_epi32(r2_0, r2_1);
+      r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+      r0_3 = _mm_xor_si128(r0_3, r0_0);
+      r1_3 = _mm_xor_si128(r1_3, r1_0);
+      r2_3 = _mm_xor_si128(r2_3, r2_0);
+      r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+      r0_3 = mm_rotl(r0_3, 16);
+      r1_3 = mm_rotl(r1_3, 16);
+      r2_3 = mm_rotl(r2_3, 16);
+      r3_3 = mm_rotl(r3_3, 16);
+
+      r0_2 = _mm_add_epi32(r0_2, r0_3);
+      r1_2 = _mm_add_epi32(r1_2, r1_3);
+      r2_2 = _mm_add_epi32(r2_2, r2_3);
+      r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+      r0_1 = _mm_xor_si128(r0_1, r0_2);
+      r1_1 = _mm_xor_si128(r1_1, r1_2);
+      r2_1 = _mm_xor_si128(r2_1, r2_2);
+      r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+      r0_1 = mm_rotl(r0_1, 12);
+      r1_1 = mm_rotl(r1_1, 12);
+      r2_1 = mm_rotl(r2_1, 12);
+      r3_1 = mm_rotl(r3_1, 12);
+
+      r0_0 = _mm_add_epi32(r0_0, r0_1);
+      r1_0 = _mm_add_epi32(r1_0, r1_1);
+      r2_0 = _mm_add_epi32(r2_0, r2_1);
+      r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+      r0_3 = _mm_xor_si128(r0_3, r0_0);
+      r1_3 = _mm_xor_si128(r1_3, r1_0);
+      r2_3 = _mm_xor_si128(r2_3, r2_0);
+      r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+      r0_3 = mm_rotl(r0_3, 8);
+      r1_3 = mm_rotl(r1_3, 8);
+      r2_3 = mm_rotl(r2_3, 8);
+      r3_3 = mm_rotl(r3_3, 8);
+
+      r0_2 = _mm_add_epi32(r0_2, r0_3);
+      r1_2 = _mm_add_epi32(r1_2, r1_3);
+      r2_2 = _mm_add_epi32(r2_2, r2_3);
+      r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+      r0_1 = _mm_xor_si128(r0_1, r0_2);
+      r1_1 = _mm_xor_si128(r1_1, r1_2);
+      r2_1 = _mm_xor_si128(r2_1, r2_2);
+      r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+      r0_1 = mm_rotl(r0_1, 7);
+      r1_1 = mm_rotl(r1_1, 7);
+      r2_1 = mm_rotl(r2_1, 7);
+      r3_1 = mm_rotl(r3_1, 7);
+
+      r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
+      r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+      r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
+      r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+      r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
+      r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+      r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
+      r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
+
+      r0_0 = _mm_add_epi32(r0_0, r0_1);
+      r1_0 = _mm_add_epi32(r1_0, r1_1);
+      r2_0 = _mm_add_epi32(r2_0, r2_1);
+      r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+      r0_3 = _mm_xor_si128(r0_3, r0_0);
+      r1_3 = _mm_xor_si128(r1_3, r1_0);
+      r2_3 = _mm_xor_si128(r2_3, r2_0);
+      r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+      r0_3 = mm_rotl(r0_3, 16);
+      r1_3 = mm_rotl(r1_3, 16);
+      r2_3 = mm_rotl(r2_3, 16);
+      r3_3 = mm_rotl(r3_3, 16);
+
+      r0_2 = _mm_add_epi32(r0_2, r0_3);
+      r1_2 = _mm_add_epi32(r1_2, r1_3);
+      r2_2 = _mm_add_epi32(r2_2, r2_3);
+      r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+      r0_1 = _mm_xor_si128(r0_1, r0_2);
+      r1_1 = _mm_xor_si128(r1_1, r1_2);
+      r2_1 = _mm_xor_si128(r2_1, r2_2);
+      r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+      r0_1 = mm_rotl(r0_1, 12);
+      r1_1 = mm_rotl(r1_1, 12);
+      r2_1 = mm_rotl(r2_1, 12);
+      r3_1 = mm_rotl(r3_1, 12);
+
+      r0_0 = _mm_add_epi32(r0_0, r0_1);
+      r1_0 = _mm_add_epi32(r1_0, r1_1);
+      r2_0 = _mm_add_epi32(r2_0, r2_1);
+      r3_0 = _mm_add_epi32(r3_0, r3_1);
+
+      r0_3 = _mm_xor_si128(r0_3, r0_0);
+      r1_3 = _mm_xor_si128(r1_3, r1_0);
+      r2_3 = _mm_xor_si128(r2_3, r2_0);
+      r3_3 = _mm_xor_si128(r3_3, r3_0);
+
+      r0_3 = mm_rotl(r0_3, 8);
+      r1_3 = mm_rotl(r1_3, 8);
+      r2_3 = mm_rotl(r2_3, 8);
+      r3_3 = mm_rotl(r3_3, 8);
+
+      r0_2 = _mm_add_epi32(r0_2, r0_3);
+      r1_2 = _mm_add_epi32(r1_2, r1_3);
+      r2_2 = _mm_add_epi32(r2_2, r2_3);
+      r3_2 = _mm_add_epi32(r3_2, r3_3);
+
+      r0_1 = _mm_xor_si128(r0_1, r0_2);
+      r1_1 = _mm_xor_si128(r1_1, r1_2);
+      r2_1 = _mm_xor_si128(r2_1, r2_2);
+      r3_1 = _mm_xor_si128(r3_1, r3_2);
+
+      r0_1 = mm_rotl(r0_1, 7);
+      r1_1 = mm_rotl(r1_1, 7);
+      r2_1 = mm_rotl(r2_1, 7);
+      r3_1 = mm_rotl(r3_1, 7);
+
+      r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
+      r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+      r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
+      r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+      r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
+      r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
+
+      r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
+      r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+      r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
       }
 
+   r0_0 = _mm_add_epi32(r0_0, input0);
+   r0_1 = _mm_add_epi32(r0_1, input1);
+   r0_2 = _mm_add_epi32(r0_2, input2);
+   r0_3 = _mm_add_epi32(r0_3, input3);
+
+   r1_0 = _mm_add_epi32(r1_0, input0);
+   r1_1 = _mm_add_epi32(r1_1, input1);
+   r1_2 = _mm_add_epi32(r1_2, input2);
+   r1_3 = _mm_add_epi32(r1_3, input3);
+   r1_3 = _mm_add_epi64(r1_3, _mm_set_epi64x(0, 1));
+
+   r2_0 = _mm_add_epi32(r2_0, input0);
+   r2_1 = _mm_add_epi32(r2_1, input1);
+   r2_2 = _mm_add_epi32(r2_2, input2);
+   r2_3 = _mm_add_epi32(r2_3, input3);
+   r2_3 = _mm_add_epi64(r2_3, _mm_set_epi64x(0, 2));
+
+   r3_0 = _mm_add_epi32(r3_0, input0);
+   r3_1 = _mm_add_epi32(r3_1, input1);
+   r3_2 = _mm_add_epi32(r3_2, input2);
+   r3_3 = _mm_add_epi32(r3_3, input3);
+   r3_3 = _mm_add_epi64(r3_3, _mm_set_epi64x(0, 3));
+
+   _mm_storeu_si128(output_mm + 0, r0_0);
+   _mm_storeu_si128(output_mm + 1, r0_1);
+   _mm_storeu_si128(output_mm + 2, r0_2);
+   _mm_storeu_si128(output_mm + 3, r0_3);
+
+   _mm_storeu_si128(output_mm + 4, r1_0);
+   _mm_storeu_si128(output_mm + 5, r1_1);
+   _mm_storeu_si128(output_mm + 6, r1_2);
+   _mm_storeu_si128(output_mm + 7, r1_3);
+
+   _mm_storeu_si128(output_mm + 8, r2_0);
+   _mm_storeu_si128(output_mm + 9, r2_1);
+   _mm_storeu_si128(output_mm + 10, r2_2);
+   _mm_storeu_si128(output_mm + 11, r2_3);
+
+   _mm_storeu_si128(output_mm + 12, r3_0);
+   _mm_storeu_si128(output_mm + 13, r3_1);
+   _mm_storeu_si128(output_mm + 14, r3_2);
+   _mm_storeu_si128(output_mm + 15, r3_3);
+
 #undef mm_rotl
    }
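A note on the counter handling above: Botan's ChaCha (the original DJB variant) keeps a 64-bit block counter in state words 12 and 13, which form the low 64-bit lane of the fourth state row, with the nonce in the high lane. Adding N via _mm_add_epi64 with _mm_set_epi64x(0, N) therefore derives block N's counter with correct carry from word 12 into word 13 while leaving the nonce untouched. A standalone sketch of that lane arithmetic (values and names here are illustrative, not from the commit):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main()
   {
   // Fourth ChaCha state row: low 64-bit lane = block counter
   // (words 12-13), high lane = nonce (words 14-15). The counter
   // value 0xFFFFFFFF is chosen so the +2 below carries into word 13.
   __m128i row3 = _mm_set_epi64x(0x1122334455667788ULL, 0xFFFFFFFFULL);

   // The same operation the 4x code uses to offset block N's counter:
   __m128i row3_blk2 = _mm_add_epi64(row3, _mm_set_epi64x(0, 2));

   uint64_t lanes[2];
   _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), row3_blk2);

   // Prints counter=100000001 nonce=1122334455667788
   std::printf("counter=%llx nonce=%llx\n",
               static_cast<unsigned long long>(lanes[0]),
               static_cast<unsigned long long>(lanes[1]));
   return 0;
   }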