diff options
Diffstat (limited to 'src/lib/stream/chacha/chacha_sse2')
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 257 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/info.txt | 5 |
2 files changed, 0 insertions, 262 deletions
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp deleted file mode 100644 index 121c92a3e..000000000 --- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp +++ /dev/null @@ -1,257 +0,0 @@ -/* -* SSE2 ChaCha -* (C) 2016 Jack Lloyd -* -* Botan is released under the Simplified BSD License (see license.txt) -*/ - -#include <botan/chacha.h> -#include <emmintrin.h> - -namespace Botan { - -//static -BOTAN_FUNC_ISA("sse2") -void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) - { - BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - - const __m128i* state_mm = reinterpret_cast<const __m128i*>(state); - __m128i* output_mm = reinterpret_cast<__m128i*>(output); - - __m128i state0 = _mm_loadu_si128(state_mm); - __m128i state1 = _mm_loadu_si128(state_mm + 1); - __m128i state2 = _mm_loadu_si128(state_mm + 2); - __m128i state3 = _mm_loadu_si128(state_mm + 3); - - // TODO: try transposing, which would avoid the permutations each round - -#define mm_rotl(r, n) \ - _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n)) - - __m128i r0_0 = state0; - __m128i r0_1 = state1; - __m128i r0_2 = state2; - __m128i r0_3 = state3; - - __m128i r1_0 = state0; - __m128i r1_1 = state1; - __m128i r1_2 = state2; - __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); - - __m128i r2_0 = state0; - __m128i r2_1 = state1; - __m128i r2_2 = state2; - __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); - - __m128i r3_0 = state0; - __m128i r3_1 = state1; - __m128i r3_2 = state2; - __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); - - for(size_t r = 0; r != rounds / 2; ++r) - { - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 16); - r1_3 = mm_rotl(r1_3, 16); - r2_3 = mm_rotl(r2_3, 16); - r3_3 = mm_rotl(r3_3, 16); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 12); - r1_1 = mm_rotl(r1_1, 12); - r2_1 = mm_rotl(r2_1, 12); - r3_1 = mm_rotl(r3_1, 12); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 8); - r1_3 = mm_rotl(r1_3, 8); - r2_3 = mm_rotl(r2_3, 8); - r3_3 = mm_rotl(r3_3, 8); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 7); - r1_1 = mm_rotl(r1_1, 7); - r2_1 = mm_rotl(r2_1, 7); - r3_1 = mm_rotl(r3_1, 7); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 16); - r1_3 = mm_rotl(r1_3, 16); - r2_3 = mm_rotl(r2_3, 16); - r3_3 = mm_rotl(r3_3, 16); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 12); - r1_1 = mm_rotl(r1_1, 12); - r2_1 = mm_rotl(r2_1, 12); - r3_1 = mm_rotl(r3_1, 12); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 8); - r1_3 = mm_rotl(r1_3, 8); - r2_3 = mm_rotl(r2_3, 8); - r3_3 = mm_rotl(r3_3, 8); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 7); - r1_1 = mm_rotl(r1_1, 7); - r2_1 = mm_rotl(r2_1, 7); - r3_1 = mm_rotl(r3_1, 7); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); - } - - r0_0 = _mm_add_epi32(r0_0, state0); - r0_1 = _mm_add_epi32(r0_1, state1); - r0_2 = _mm_add_epi32(r0_2, state2); - r0_3 = _mm_add_epi32(r0_3, state3); - - r1_0 = _mm_add_epi32(r1_0, state0); - r1_1 = _mm_add_epi32(r1_1, state1); - r1_2 = _mm_add_epi32(r1_2, state2); - r1_3 = _mm_add_epi32(r1_3, state3); - r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); - - r2_0 = _mm_add_epi32(r2_0, state0); - r2_1 = _mm_add_epi32(r2_1, state1); - r2_2 = _mm_add_epi32(r2_2, state2); - r2_3 = _mm_add_epi32(r2_3, state3); - r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); - - r3_0 = _mm_add_epi32(r3_0, state0); - r3_1 = _mm_add_epi32(r3_1, state1); - r3_2 = _mm_add_epi32(r3_2, state2); - r3_3 = _mm_add_epi32(r3_3, state3); - r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); - - _mm_storeu_si128(output_mm + 0, r0_0); - _mm_storeu_si128(output_mm + 1, r0_1); - _mm_storeu_si128(output_mm + 2, r0_2); - _mm_storeu_si128(output_mm + 3, r0_3); - - _mm_storeu_si128(output_mm + 4, r1_0); - _mm_storeu_si128(output_mm + 5, r1_1); - _mm_storeu_si128(output_mm + 6, r1_2); - _mm_storeu_si128(output_mm + 7, r1_3); - - _mm_storeu_si128(output_mm + 8, r2_0); - _mm_storeu_si128(output_mm + 9, r2_1); - _mm_storeu_si128(output_mm + 10, r2_2); - _mm_storeu_si128(output_mm + 11, r2_3); - - _mm_storeu_si128(output_mm + 12, r3_0); - _mm_storeu_si128(output_mm + 13, r3_1); - _mm_storeu_si128(output_mm + 14, r3_2); - _mm_storeu_si128(output_mm + 15, r3_3); - -#undef mm_rotl - - state[12] += 4; - if(state[12] < 4) - state[13]++; - } - -} diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt deleted file mode 100644 index 20e0eb03f..000000000 --- a/src/lib/stream/chacha/chacha_sse2/info.txt +++ /dev/null @@ -1,5 +0,0 @@ -<defines> -CHACHA_SSE2 -> 20160831 -</defines> - -need_isa sse2 |