diff options
author | Jack Lloyd <[email protected]> | 2018-10-26 09:53:30 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2018-10-26 09:53:30 -0400 |
commit | 2e4d3f1ea0d61f5fb35b3b62aa9832c251f4d5c5 (patch) | |
tree | f4cd5c41ec982bbfbeebd07c938963e2e928c25c | |
parent | 170a1c052cc0dda26b7893107e6ecc037e1811a5 (diff) | |
parent | 21d39026ed5afb71923a6b57b1379e5ec24fb6d8 (diff) |
Merge GH #1719 Add NEON ChaCha
-rw-r--r-- | doc/todo.rst | 1 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.cpp | 16 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.h | 4 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp | 83 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_neon/chacha_neon.cpp | 299 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_neon/info.txt | 5 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 76 |
7 files changed, 403 insertions, 81 deletions
diff --git a/doc/todo.rst b/doc/todo.rst index 6a9c3f271..22cdcc5f3 100644 --- a/doc/todo.rst +++ b/doc/todo.rst @@ -15,7 +15,6 @@ Ciphers, Hashes, PBKDF * Compressed tables for AES * AES using vector permutes for NEON * Camellia using AES-NI -* ChaCha20 using NEON * Poly1305 using AVX2 * ASCON 1.2 (CAESAR) * NORX-64 3.0 (CAESAR) diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp index 0670faa5e..c415d7fec 100644 --- a/src/lib/stream/chacha/chacha.cpp +++ b/src/lib/stream/chacha/chacha.cpp @@ -81,6 +81,13 @@ std::string ChaCha::provider() const } #endif +#if defined(BOTAN_HAS_CHACHA_NEON) + if(CPUID::has_neon()) + { + return "neon"; + } +#endif + return "base"; } @@ -105,6 +112,15 @@ void ChaCha::chacha_x8(uint8_t output[64*8], uint32_t input[16], size_t rounds) } #endif +#if defined(BOTAN_HAS_CHACHA_NEON) + if(CPUID::has_neon()) + { + ChaCha::chacha_neon_x4(output, input, rounds); + ChaCha::chacha_neon_x4(output + 4*64, input, rounds); + return; + } +#endif + // TODO interleave rounds for(size_t i = 0; i != 8; ++i) { diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h index 390c3b788..89deaad52 100644 --- a/src/lib/stream/chacha/chacha.h +++ b/src/lib/stream/chacha/chacha.h @@ -68,6 +68,10 @@ class BOTAN_PUBLIC_API(2,0) ChaCha final : public StreamCipher void chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds); #endif +#if defined(BOTAN_HAS_CHACHA_NEON) + void chacha_neon_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds); +#endif + size_t m_rounds; secure_vector<uint32_t> m_key; secure_vector<uint32_t> m_state; diff --git a/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp b/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp index b8b448ab7..1cc619ef6 100644 --- a/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp +++ b/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp @@ -11,17 +11,17 @@ namespace Botan { //static BOTAN_FUNC_ISA("avx2") -void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t input[16], size_t rounds) +void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds) { BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - const __m128i* input_mm = reinterpret_cast<const __m128i*>(input); + const __m128i* state_mm = reinterpret_cast<const __m128i*>(state); __m256i* output_mm = reinterpret_cast<__m256i*>(output); - const __m256i input0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(input_mm)); - const __m256i input1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(input_mm + 1)); - const __m256i input2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(input_mm + 2)); - const __m256i input3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(input_mm + 3)); + const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm)); + const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1)); + const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2)); + const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3)); const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4); const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5); @@ -36,25 +36,25 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t input[16], size_t rou #define mm_rotl(r, n) \ _mm256_or_si256(_mm256_slli_epi32(r, n), _mm256_srli_epi32(r, 32-n)) - __m256i X0_0 = input0; - __m256i X0_1 = input1; - __m256i X0_2 = input2; - __m256i X0_3 = _mm256_add_epi64(input3, CTR0); + __m256i X0_0 = state0; + __m256i X0_1 = state1; + __m256i X0_2 = state2; + __m256i X0_3 = _mm256_add_epi64(state3, CTR0); - __m256i X1_0 = input0; - __m256i X1_1 = input1; - __m256i X1_2 = input2; - __m256i X1_3 = _mm256_add_epi64(input3, CTR1); + __m256i X1_0 = state0; + __m256i X1_1 = state1; + __m256i X1_2 = state2; + __m256i X1_3 = _mm256_add_epi64(state3, CTR1); - __m256i X2_0 = input0; - __m256i X2_1 = input1; - __m256i X2_2 = input2; - __m256i X2_3 = _mm256_add_epi64(input3, CTR2); + __m256i X2_0 = state0; + __m256i X2_1 = state1; + __m256i X2_2 = state2; + __m256i X2_3 = _mm256_add_epi64(state3, CTR2); - __m256i X3_0 = input0; - __m256i X3_1 = input1; - __m256i X3_2 = input2; - __m256i X3_3 = _mm256_add_epi64(input3, CTR3); + __m256i X3_0 = state0; + __m256i X3_1 = state1; + __m256i X3_2 = state2; + __m256i X3_3 = _mm256_add_epi64(state3, CTR3); for(size_t r = 0; r != rounds / 2; ++r) { @@ -211,28 +211,28 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t input[16], size_t rou X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); } - X0_0 = _mm256_add_epi32(X0_0, input0); - X0_1 = _mm256_add_epi32(X0_1, input1); - X0_2 = _mm256_add_epi32(X0_2, input2); - X0_3 = _mm256_add_epi32(X0_3, input3); + X0_0 = _mm256_add_epi32(X0_0, state0); + X0_1 = _mm256_add_epi32(X0_1, state1); + X0_2 = _mm256_add_epi32(X0_2, state2); + X0_3 = _mm256_add_epi32(X0_3, state3); X0_3 = _mm256_add_epi64(X0_3, CTR0); - X1_0 = _mm256_add_epi32(X1_0, input0); - X1_1 = _mm256_add_epi32(X1_1, input1); - X1_2 = _mm256_add_epi32(X1_2, input2); - X1_3 = _mm256_add_epi32(X1_3, input3); + X1_0 = _mm256_add_epi32(X1_0, state0); + X1_1 = _mm256_add_epi32(X1_1, state1); + X1_2 = _mm256_add_epi32(X1_2, state2); + X1_3 = _mm256_add_epi32(X1_3, state3); X1_3 = _mm256_add_epi64(X1_3, CTR1); - X2_0 = _mm256_add_epi32(X2_0, input0); - X2_1 = _mm256_add_epi32(X2_1, input1); - X2_2 = _mm256_add_epi32(X2_2, input2); - X2_3 = _mm256_add_epi32(X2_3, input3); + X2_0 = _mm256_add_epi32(X2_0, state0); + X2_1 = _mm256_add_epi32(X2_1, state1); + X2_2 = _mm256_add_epi32(X2_2, state2); + X2_3 = _mm256_add_epi32(X2_3, state3); X2_3 = _mm256_add_epi64(X2_3, CTR2); - X3_0 = _mm256_add_epi32(X3_0, input0); - X3_1 = _mm256_add_epi32(X3_1, input1); - X3_2 = _mm256_add_epi32(X3_2, input2); - X3_3 = _mm256_add_epi32(X3_3, input3); + X3_0 = _mm256_add_epi32(X3_0, state0); + X3_1 = _mm256_add_epi32(X3_1, state1); + X3_2 = _mm256_add_epi32(X3_2, state2); + X3_3 = _mm256_add_epi32(X3_3, state3); X3_3 = _mm256_add_epi64(X3_3, CTR3); _mm256_storeu_si256(output_mm , _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); @@ -255,9 +255,8 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t input[16], size_t rou #undef mm_rotl - input[12] += 8; - if(input[12] < 8) - input[13]++; - + state[12] += 8; + if(state[12] < 8) + state[13]++; } } diff --git a/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp b/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp new file mode 100644 index 000000000..eb777a58c --- /dev/null +++ b/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp @@ -0,0 +1,299 @@ +/* +* NEON ChaCha impl originally written by Jeffrey Walton for Crypto++ +* and released as public domain. +* +* Further changes +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/chacha.h> +#include <arm_neon.h> + +namespace Botan { + +namespace { + +template <unsigned int R> +inline uint32x4_t RotateLeft(const uint32x4_t& val) + { + return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R)); + } + +template <unsigned int R> +inline uint32x4_t RotateRight(const uint32x4_t& val) + { + return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R)); + } + +// ChaCha's use of shuffle is really a 4, 8, or 12 byte rotation: +// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x) +// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x) +// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x) +template <unsigned int S> +inline uint32x4_t Shuffle(const uint32x4_t& val) + { + return vextq_u32(val, val, S); + } + +#if defined(BOTAN_TARGET_ARCH_IS_ARM64) + +template <> +inline uint32x4_t RotateLeft<8>(const uint32x4_t& val) + { + const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); + + return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); + } + +template <> +inline uint32x4_t RotateLeft<16>(const uint32x4_t& val) + { + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(val))); + } + +#endif + +} + +//static +void ChaCha::chacha_neon_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) + { + BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); + + const uint32x4_t state0 = vld1q_u32(state + 0*4); + const uint32x4_t state1 = vld1q_u32(state + 1*4); + const uint32x4_t state2 = vld1q_u32(state + 2*4); + const uint32x4_t state3 = vld1q_u32(state + 3*4); + + const uint64x2_t CTRS[3] = { + {1, 0}, {2, 0}, {3, 0} + //{0, 1}, {0, 2}, {0, 3} + }; + + uint32x4_t r0_0 = state0; + uint32x4_t r0_1 = state1; + uint32x4_t r0_2 = state2; + uint32x4_t r0_3 = state3; + + uint32x4_t r1_0 = state0; + uint32x4_t r1_1 = state1; + uint32x4_t r1_2 = state2; + uint32x4_t r1_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[0])); + + uint32x4_t r2_0 = state0; + uint32x4_t r2_1 = state1; + uint32x4_t r2_2 = state2; + uint32x4_t r2_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[1])); + + uint32x4_t r3_0 = state0; + uint32x4_t r3_1 = state1; + uint32x4_t r3_2 = state2; + uint32x4_t r3_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[2])); + + for(size_t i = 0; i != rounds / 2; ++i) + { + r0_0 = vaddq_u32(r0_0, r0_1); + r1_0 = vaddq_u32(r1_0, r1_1); + r2_0 = vaddq_u32(r2_0, r2_1); + r3_0 = vaddq_u32(r3_0, r3_1); + + r0_3 = veorq_u32(r0_3, r0_0); + r1_3 = veorq_u32(r1_3, r1_0); + r2_3 = veorq_u32(r2_3, r2_0); + r3_3 = veorq_u32(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = vaddq_u32(r0_2, r0_3); + r1_2 = vaddq_u32(r1_2, r1_3); + r2_2 = vaddq_u32(r2_2, r2_3); + r3_2 = vaddq_u32(r3_2, r3_3); + + r0_1 = veorq_u32(r0_1, r0_2); + r1_1 = veorq_u32(r1_1, r1_2); + r2_1 = veorq_u32(r2_1, r2_2); + r3_1 = veorq_u32(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = vaddq_u32(r0_0, r0_1); + r1_0 = vaddq_u32(r1_0, r1_1); + r2_0 = vaddq_u32(r2_0, r2_1); + r3_0 = vaddq_u32(r3_0, r3_1); + + r0_3 = veorq_u32(r0_3, r0_0); + r1_3 = veorq_u32(r1_3, r1_0); + r2_3 = veorq_u32(r2_3, r2_0); + r3_3 = veorq_u32(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = vaddq_u32(r0_2, r0_3); + r1_2 = vaddq_u32(r1_2, r1_3); + r2_2 = vaddq_u32(r2_2, r2_3); + r3_2 = vaddq_u32(r3_2, r3_3); + + r0_1 = veorq_u32(r0_1, r0_2); + r1_1 = veorq_u32(r1_1, r1_2); + r2_1 = veorq_u32(r2_1, r2_2); + r3_1 = veorq_u32(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = Shuffle<1>(r0_1); + r0_2 = Shuffle<2>(r0_2); + r0_3 = Shuffle<3>(r0_3); + + r1_1 = Shuffle<1>(r1_1); + r1_2 = Shuffle<2>(r1_2); + r1_3 = Shuffle<3>(r1_3); + + r2_1 = Shuffle<1>(r2_1); + r2_2 = Shuffle<2>(r2_2); + r2_3 = Shuffle<3>(r2_3); + + r3_1 = Shuffle<1>(r3_1); + r3_2 = Shuffle<2>(r3_2); + r3_3 = Shuffle<3>(r3_3); + + r0_0 = vaddq_u32(r0_0, r0_1); + r1_0 = vaddq_u32(r1_0, r1_1); + r2_0 = vaddq_u32(r2_0, r2_1); + r3_0 = vaddq_u32(r3_0, r3_1); + + r0_3 = veorq_u32(r0_3, r0_0); + r1_3 = veorq_u32(r1_3, r1_0); + r2_3 = veorq_u32(r2_3, r2_0); + r3_3 = veorq_u32(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = vaddq_u32(r0_2, r0_3); + r1_2 = vaddq_u32(r1_2, r1_3); + r2_2 = vaddq_u32(r2_2, r2_3); + r3_2 = vaddq_u32(r3_2, r3_3); + + r0_1 = veorq_u32(r0_1, r0_2); + r1_1 = veorq_u32(r1_1, r1_2); + r2_1 = veorq_u32(r2_1, r2_2); + r3_1 = veorq_u32(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = vaddq_u32(r0_0, r0_1); + r1_0 = vaddq_u32(r1_0, r1_1); + r2_0 = vaddq_u32(r2_0, r2_1); + r3_0 = vaddq_u32(r3_0, r3_1); + + r0_3 = veorq_u32(r0_3, r0_0); + r1_3 = veorq_u32(r1_3, r1_0); + r2_3 = veorq_u32(r2_3, r2_0); + r3_3 = veorq_u32(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = vaddq_u32(r0_2, r0_3); + r1_2 = vaddq_u32(r1_2, r1_3); + r2_2 = vaddq_u32(r2_2, r2_3); + r3_2 = vaddq_u32(r3_2, r3_3); + + r0_1 = veorq_u32(r0_1, r0_2); + r1_1 = veorq_u32(r1_1, r1_2); + r2_1 = veorq_u32(r2_1, r2_2); + r3_1 = veorq_u32(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = Shuffle<3>(r0_1); + r0_2 = Shuffle<2>(r0_2); + r0_3 = Shuffle<1>(r0_3); + + r1_1 = Shuffle<3>(r1_1); + r1_2 = Shuffle<2>(r1_2); + r1_3 = Shuffle<1>(r1_3); + + r2_1 = Shuffle<3>(r2_1); + r2_2 = Shuffle<2>(r2_2); + r2_3 = Shuffle<1>(r2_3); + + r3_1 = Shuffle<3>(r3_1); + r3_2 = Shuffle<2>(r3_2); + r3_3 = Shuffle<1>(r3_3); + } + + r0_0 = vaddq_u32(r0_0, state0); + r0_1 = vaddq_u32(r0_1, state1); + r0_2 = vaddq_u32(r0_2, state2); + r0_3 = vaddq_u32(r0_3, state3); + + r1_0 = vaddq_u32(r1_0, state0); + r1_1 = vaddq_u32(r1_1, state1); + r1_2 = vaddq_u32(r1_2, state2); + r1_3 = vaddq_u32(r1_3, state3); + r1_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r1_3), CTRS[0])); + + r2_0 = vaddq_u32(r2_0, state0); + r2_1 = vaddq_u32(r2_1, state1); + r2_2 = vaddq_u32(r2_2, state2); + r2_3 = vaddq_u32(r2_3, state3); + r2_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r2_3), CTRS[1])); + + r3_0 = vaddq_u32(r3_0, state0); + r3_1 = vaddq_u32(r3_1, state1); + r3_2 = vaddq_u32(r3_2, state2); + r3_3 = vaddq_u32(r3_3, state3); + r3_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r3_3), CTRS[2])); + + vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0)); + vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1)); + vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2)); + vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3)); + + vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0)); + vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1)); + vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2)); + vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3)); + + vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0)); + vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1)); + vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2)); + vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3)); + + vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0)); + vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1)); + vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2)); + vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3)); + + state[12] += 4; + if(state[12] < 4) + state[13]++; + } + +} diff --git a/src/lib/stream/chacha/chacha_neon/info.txt b/src/lib/stream/chacha/chacha_neon/info.txt new file mode 100644 index 000000000..118478b7a --- /dev/null +++ b/src/lib/stream/chacha/chacha_neon/info.txt @@ -0,0 +1,5 @@ +<defines> +CHACHA_NEON -> 20181026 +</defines> + +need_isa neon diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp index 9641be67b..121c92a3e 100644 --- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp +++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp @@ -12,41 +12,41 @@ namespace Botan { //static BOTAN_FUNC_ISA("sse2") -void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t input[16], size_t rounds) +void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) { BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - const __m128i* input_mm = reinterpret_cast<const __m128i*>(input); + const __m128i* state_mm = reinterpret_cast<const __m128i*>(state); __m128i* output_mm = reinterpret_cast<__m128i*>(output); - __m128i input0 = _mm_loadu_si128(input_mm); - __m128i input1 = _mm_loadu_si128(input_mm + 1); - __m128i input2 = _mm_loadu_si128(input_mm + 2); - __m128i input3 = _mm_loadu_si128(input_mm + 3); + __m128i state0 = _mm_loadu_si128(state_mm); + __m128i state1 = _mm_loadu_si128(state_mm + 1); + __m128i state2 = _mm_loadu_si128(state_mm + 2); + __m128i state3 = _mm_loadu_si128(state_mm + 3); // TODO: try transposing, which would avoid the permutations each round #define mm_rotl(r, n) \ _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n)) - __m128i r0_0 = input0; - __m128i r0_1 = input1; - __m128i r0_2 = input2; - __m128i r0_3 = input3; + __m128i r0_0 = state0; + __m128i r0_1 = state1; + __m128i r0_2 = state2; + __m128i r0_3 = state3; - __m128i r1_0 = input0; - __m128i r1_1 = input1; - __m128i r1_2 = input2; + __m128i r1_0 = state0; + __m128i r1_1 = state1; + __m128i r1_2 = state2; __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); - __m128i r2_0 = input0; - __m128i r2_1 = input1; - __m128i r2_2 = input2; + __m128i r2_0 = state0; + __m128i r2_1 = state1; + __m128i r2_2 = state2; __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); - __m128i r3_0 = input0; - __m128i r3_1 = input1; - __m128i r3_2 = input2; + __m128i r3_0 = state0; + __m128i r3_1 = state1; + __m128i r3_2 = state2; __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); for(size_t r = 0; r != rounds / 2; ++r) @@ -204,27 +204,27 @@ void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t input[16], size_t rou r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); } - r0_0 = _mm_add_epi32(r0_0, input0); - r0_1 = _mm_add_epi32(r0_1, input1); - r0_2 = _mm_add_epi32(r0_2, input2); - r0_3 = _mm_add_epi32(r0_3, input3); + r0_0 = _mm_add_epi32(r0_0, state0); + r0_1 = _mm_add_epi32(r0_1, state1); + r0_2 = _mm_add_epi32(r0_2, state2); + r0_3 = _mm_add_epi32(r0_3, state3); - r1_0 = _mm_add_epi32(r1_0, input0); - r1_1 = _mm_add_epi32(r1_1, input1); - r1_2 = _mm_add_epi32(r1_2, input2); - r1_3 = _mm_add_epi32(r1_3, input3); + r1_0 = _mm_add_epi32(r1_0, state0); + r1_1 = _mm_add_epi32(r1_1, state1); + r1_2 = _mm_add_epi32(r1_2, state2); + r1_3 = _mm_add_epi32(r1_3, state3); r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); - r2_0 = _mm_add_epi32(r2_0, input0); - r2_1 = _mm_add_epi32(r2_1, input1); - r2_2 = _mm_add_epi32(r2_2, input2); - r2_3 = _mm_add_epi32(r2_3, input3); + r2_0 = _mm_add_epi32(r2_0, state0); + r2_1 = _mm_add_epi32(r2_1, state1); + r2_2 = _mm_add_epi32(r2_2, state2); + r2_3 = _mm_add_epi32(r2_3, state3); r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); - r3_0 = _mm_add_epi32(r3_0, input0); - r3_1 = _mm_add_epi32(r3_1, input1); - r3_2 = _mm_add_epi32(r3_2, input2); - r3_3 = _mm_add_epi32(r3_3, input3); + r3_0 = _mm_add_epi32(r3_0, state0); + r3_1 = _mm_add_epi32(r3_1, state1); + r3_2 = _mm_add_epi32(r3_2, state2); + r3_3 = _mm_add_epi32(r3_3, state3); r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); _mm_storeu_si128(output_mm + 0, r0_0); @@ -249,9 +249,9 @@ void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t input[16], size_t rou #undef mm_rotl - input[12] += 4; - if(input[12] < 4) - input[13]++; + state[12] += 4; + if(state[12] < 4) + state[13]++; } } |