From 858e3be10396e082901b612ee8c5e18cd3e47286 Mon Sep 17 00:00:00 2001
From: Jack Lloyd
Date: Wed, 31 Aug 2016 17:09:23 -0400
Subject: SSE2 ChaCha

---
 src/lib/stream/chacha/chacha.cpp                  |  17 ++--
 src/lib/stream/chacha/chacha.h                    |   6 ++
 src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 104 ++++++++++++++++++++++
 src/lib/stream/chacha/chacha_sse2/info.txt        |   3 +
 4 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
 create mode 100644 src/lib/stream/chacha/chacha_sse2/info.txt

diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp
index 40da93029..97b6465f9 100644
--- a/src/lib/stream/chacha/chacha.cpp
+++ b/src/lib/stream/chacha/chacha.cpp
@@ -7,6 +7,7 @@
 
 #include <botan/chacha.h>
 #include <botan/loadstor.h>
+#include <botan/cpuid.h>
 
 namespace Botan {
 
@@ -16,12 +17,19 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds)
       throw Invalid_Argument("ChaCha only supports 8, 12 or 20 rounds");
    }
 
-namespace {
-
-void chacha(byte output[64], const u32bit input[16], size_t rounds)
+//static
+void ChaCha::chacha(byte output[64], const u32bit input[16], size_t rounds)
    {
    BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
 
+   #if defined(BOTAN_HAS_CHACHA_SSE2)
+   // Use the SSE2 implementation when the CPU reports SSE2 support at runtime
+   if(CPUID::has_sse2())
+      {
+      return ChaCha::chacha_sse2(output, input, rounds);
+      }
+   #endif
+
    u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
           x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
           x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
@@ -67,7 +75,6 @@ void chacha(byte output[64], const u32bit input[16], size_t rounds)
    store_le(x14 + input[14], output + 4 * 14);
    store_le(x15 + input[15], output + 4 * 15);
    }
-}
 
 /*
 * Combine cipher stream with message
@@ -176,7 +183,7 @@ void ChaCha::seek(u64bit offset)
    {
    if (m_state.size() == 0 && m_buffer.size() == 0)
       {
-         throw Invalid_State("You have to setup the stream cipher (key and iv)");
+      throw Invalid_State("You have to setup the stream cipher (key and iv)");
       }
 
    m_position = offset % m_buffer.size();
diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h
index f8f42e41d..ab28f9563 100644
--- a/src/lib/stream/chacha/chacha.h
+++ b/src/lib/stream/chacha/chacha.h
@@ -47,6 +47,12 @@ class BOTAN_DLL ChaCha final : public StreamCipher
    private:
       void key_schedule(const byte key[], size_t key_len) override;
 
+      static void chacha(byte output[64], const u32bit input[16], size_t rounds);
+
+#if defined(BOTAN_HAS_CHACHA_SSE2)
+      static void chacha_sse2(byte output[64], const u32bit input[16], size_t rounds);
+#endif
+
       size_t m_rounds;
       secure_vector<u32bit> m_state;
       secure_vector<byte> m_buffer;
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
new file mode 100644
index 000000000..aa1ca45ff
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -0,0 +1,104 @@
+/*
+* SSE2 ChaCha
+* (C) 2016 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/chacha.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+/*
+* Compute one 64 byte ChaCha block from the 16 word state using SSE2.
+*
+* The state is viewed as a 4x4 matrix of 32-bit words with one row per
+* 128-bit register, so each add/xor/rotate step updates four quarter
+* rounds at once. The input state is not modified; rounds must be even.
+*/
+//static
+void ChaCha::chacha_sse2(byte output[64], const u32bit input[16], size_t rounds)
+   {
+   BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
+
+   const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
+
+   // Unaligned loads are used, so input need not be 16 byte aligned
+   const __m128i input0 = _mm_loadu_si128(input_mm);
+   const __m128i input1 = _mm_loadu_si128(input_mm + 1);
+   const __m128i input2 = _mm_loadu_si128(input_mm + 2);
+   const __m128i input3 = _mm_loadu_si128(input_mm + 3);
+
+   __m128i r0 = input0;
+   __m128i r1 = input1;
+   __m128i r2 = input2;
+   __m128i r3 = input3;
+
+   // SSE2 has no 32-bit rotate instruction, so build one from two shifts
+#define mm_rotl(r, n) \
+   _mm_or_si128(_mm_slli_epi32((r), (n)), _mm_srli_epi32((r), 32-(n)))
+
+   for(size_t i = 0; i != rounds / 2; ++i)
+      {
+      // Column round
+      r0 = _mm_add_epi32(r0, r1);
+      r3 = _mm_xor_si128(r3, r0);
+      r3 = mm_rotl(r3, 16);
+
+      r2 = _mm_add_epi32(r2, r3);
+      r1 = _mm_xor_si128(r1, r2);
+      r1 = mm_rotl(r1, 12);
+
+      r0 = _mm_add_epi32(r0, r1);
+      r3 = _mm_xor_si128(r3, r0);
+      r3 = mm_rotl(r3, 8);
+
+      r2 = _mm_add_epi32(r2, r3);
+      r1 = _mm_xor_si128(r1, r2);
+      r1 = mm_rotl(r1, 7);
+
+      // Rotate rows 1-3 so that the diagonals line up as columns
+      r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
+      r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+      r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
+
+      // Diagonal round
+      r0 = _mm_add_epi32(r0, r1);
+      r3 = _mm_xor_si128(r3, r0);
+      r3 = mm_rotl(r3, 16);
+
+      r2 = _mm_add_epi32(r2, r3);
+      r1 = _mm_xor_si128(r1, r2);
+      r1 = mm_rotl(r1, 12);
+
+      r0 = _mm_add_epi32(r0, r1);
+      r3 = _mm_xor_si128(r3, r0);
+      r3 = mm_rotl(r3, 8);
+
+      r2 = _mm_add_epi32(r2, r3);
+      r1 = _mm_xor_si128(r1, r2);
+      r1 = mm_rotl(r1, 7);
+
+      // Undo the row rotation
+      r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
+      r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+      r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
+      }
+
+#undef mm_rotl
+
+   // Feed-forward: add the original state to the permuted state
+   r0 = _mm_add_epi32(r0, input0);
+   r1 = _mm_add_epi32(r1, input1);
+   r2 = _mm_add_epi32(r2, input2);
+   r3 = _mm_add_epi32(r3, input3);
+
+   __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+   _mm_storeu_si128(output_mm    , r0);
+   _mm_storeu_si128(output_mm + 1, r1);
+   _mm_storeu_si128(output_mm + 2, r2);
+   _mm_storeu_si128(output_mm + 3, r3);
+   }
+
+}
diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt
new file mode 100644
index 000000000..965479746
--- /dev/null
+++ b/src/lib/stream/chacha/chacha_sse2/info.txt
@@ -0,0 +1,3 @@
+define CHACHA_SSE2 20160831
+
+need_isa sse2
-- 
cgit v1.2.3