diff options
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/stream/chacha/chacha.cpp | 32 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.h | 8 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp | 26 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_neon/chacha_neon.cpp | 299 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_neon/info.txt | 5 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_simd32/chacha_simd32.cpp | 205 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_simd32/info.txt | 7 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 257 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/info.txt | 5 |
9 files changed, 236 insertions, 608 deletions
diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp index c415d7fec..a1355cbe6 100644 --- a/src/lib/stream/chacha/chacha.cpp +++ b/src/lib/stream/chacha/chacha.cpp @@ -74,17 +74,10 @@ std::string ChaCha::provider() const } #endif -#if defined(BOTAN_HAS_CHACHA_SSE2) - if(CPUID::has_sse2()) +#if defined(BOTAN_HAS_CHACHA_SIMD32) + if(CPUID::has_simd_32()) { - return "sse2"; - } -#endif - -#if defined(BOTAN_HAS_CHACHA_NEON) - if(CPUID::has_neon()) - { - return "neon"; + return "simd32"; } #endif @@ -103,20 +96,11 @@ void ChaCha::chacha_x8(uint8_t output[64*8], uint32_t input[16], size_t rounds) } #endif -#if defined(BOTAN_HAS_CHACHA_SSE2) - if(CPUID::has_sse2()) - { - ChaCha::chacha_sse2_x4(output, input, rounds); - ChaCha::chacha_sse2_x4(output + 4*64, input, rounds); - return; - } -#endif - -#if defined(BOTAN_HAS_CHACHA_NEON) - if(CPUID::has_neon()) +#if defined(BOTAN_HAS_CHACHA_SIMD32) + if(CPUID::has_simd_32()) { - ChaCha::chacha_neon_x4(output, input, rounds); - ChaCha::chacha_neon_x4(output + 4*64, input, rounds); + ChaCha::chacha_simd32_x4(output, input, rounds); + ChaCha::chacha_simd32_x4(output + 4*64, input, rounds); return; } #endif @@ -177,7 +161,7 @@ void ChaCha::chacha_x8(uint8_t output[64*8], uint32_t input[16], size_t rounds) store_le(x15, output + 64 * i + 4 * 15); input[12]++; - input[13] += input[12] < i; // carry? + input[13] += (input[12] == 0); } } diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h index 89deaad52..261d950bd 100644 --- a/src/lib/stream/chacha/chacha.h +++ b/src/lib/stream/chacha/chacha.h @@ -60,18 +60,14 @@ class BOTAN_PUBLIC_API(2,0) ChaCha final : public StreamCipher void chacha_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds); -#if defined(BOTAN_HAS_CHACHA_SSE2) - void chacha_sse2_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds); +#if defined(BOTAN_HAS_CHACHA_SIMD32) + void chacha_simd32_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds); #endif #if defined(BOTAN_HAS_CHACHA_AVX2) void chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds); #endif -#if defined(BOTAN_HAS_CHACHA_NEON) - void chacha_neon_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds); -#endif - size_t m_rounds; secure_vector<uint32_t> m_key; secure_vector<uint32_t> m_state; diff --git a/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp b/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp index 1cc619ef6..df37304c7 100644 --- a/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp +++ b/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp @@ -23,10 +23,12 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rou const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2)); const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3)); - const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4); - const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5); - const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6); - const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7); + const uint32_t C = 0xFFFFFFFF - state[12]; + + const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4); + const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5); + const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6); + const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7); const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); @@ -39,22 +41,22 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rou __m256i X0_0 = state0; __m256i X0_1 = state1; __m256i X0_2 = state2; - __m256i X0_3 = _mm256_add_epi64(state3, CTR0); + __m256i X0_3 = _mm256_add_epi32(state3, CTR0); __m256i X1_0 = state0; __m256i X1_1 = state1; __m256i X1_2 = state2; - __m256i X1_3 = _mm256_add_epi64(state3, CTR1); + __m256i X1_3 = _mm256_add_epi32(state3, CTR1); __m256i X2_0 = state0; __m256i X2_1 = state1; __m256i X2_2 = state2; - __m256i X2_3 = _mm256_add_epi64(state3, CTR2); + __m256i X2_3 = _mm256_add_epi32(state3, CTR2); __m256i X3_0 = state0; __m256i X3_1 = state1; __m256i X3_2 = state2; - __m256i X3_3 = _mm256_add_epi64(state3, CTR3); + __m256i X3_3 = _mm256_add_epi32(state3, CTR3); for(size_t r = 0; r != rounds / 2; ++r) { @@ -215,25 +217,25 @@ void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rou X0_1 = _mm256_add_epi32(X0_1, state1); X0_2 = _mm256_add_epi32(X0_2, state2); X0_3 = _mm256_add_epi32(X0_3, state3); - X0_3 = _mm256_add_epi64(X0_3, CTR0); + X0_3 = _mm256_add_epi32(X0_3, CTR0); X1_0 = _mm256_add_epi32(X1_0, state0); X1_1 = _mm256_add_epi32(X1_1, state1); X1_2 = _mm256_add_epi32(X1_2, state2); X1_3 = _mm256_add_epi32(X1_3, state3); - X1_3 = _mm256_add_epi64(X1_3, CTR1); + X1_3 = _mm256_add_epi32(X1_3, CTR1); X2_0 = _mm256_add_epi32(X2_0, state0); X2_1 = _mm256_add_epi32(X2_1, state1); X2_2 = _mm256_add_epi32(X2_2, state2); X2_3 = _mm256_add_epi32(X2_3, state3); - X2_3 = _mm256_add_epi64(X2_3, CTR2); + X2_3 = _mm256_add_epi32(X2_3, CTR2); X3_0 = _mm256_add_epi32(X3_0, state0); X3_1 = _mm256_add_epi32(X3_1, state1); X3_2 = _mm256_add_epi32(X3_2, state2); X3_3 = _mm256_add_epi32(X3_3, state3); - X3_3 = _mm256_add_epi64(X3_3, CTR3); + X3_3 = _mm256_add_epi32(X3_3, CTR3); _mm256_storeu_si256(output_mm , _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); diff --git a/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp b/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp deleted file mode 100644 index eb777a58c..000000000 --- a/src/lib/stream/chacha/chacha_neon/chacha_neon.cpp +++ /dev/null @@ -1,299 +0,0 @@ -/* -* NEON ChaCha impl originally written by Jeffrey Walton for Crypto++ -* and released as public domain. -* -* Further changes -* (C) 2018 Jack Lloyd -* -* Botan is released under the Simplified BSD License (see license.txt) -*/ - -#include <botan/chacha.h> -#include <arm_neon.h> - -namespace Botan { - -namespace { - -template <unsigned int R> -inline uint32x4_t RotateLeft(const uint32x4_t& val) - { - return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R)); - } - -template <unsigned int R> -inline uint32x4_t RotateRight(const uint32x4_t& val) - { - return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R)); - } - -// ChaCha's use of shuffle is really a 4, 8, or 12 byte rotation: -// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x) -// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x) -// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x) -template <unsigned int S> -inline uint32x4_t Shuffle(const uint32x4_t& val) - { - return vextq_u32(val, val, S); - } - -#if defined(BOTAN_TARGET_ARCH_IS_ARM64) - -template <> -inline uint32x4_t RotateLeft<8>(const uint32x4_t& val) - { - const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; - const uint8x16_t mask = vld1q_u8(maskb); - - return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); - } - -template <> -inline uint32x4_t RotateLeft<16>(const uint32x4_t& val) - { - return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(val))); - } - -#endif - -} - -//static -void ChaCha::chacha_neon_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) - { - BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - - const uint32x4_t state0 = vld1q_u32(state + 0*4); - const uint32x4_t state1 = vld1q_u32(state + 1*4); - const uint32x4_t state2 = vld1q_u32(state + 2*4); - const uint32x4_t state3 = vld1q_u32(state + 3*4); - - const uint64x2_t CTRS[3] = { - {1, 0}, {2, 0}, {3, 0} - //{0, 1}, {0, 2}, {0, 3} - }; - - uint32x4_t r0_0 = state0; - uint32x4_t r0_1 = state1; - uint32x4_t r0_2 = state2; - uint32x4_t r0_3 = state3; - - uint32x4_t r1_0 = state0; - uint32x4_t r1_1 = state1; - uint32x4_t r1_2 = state2; - uint32x4_t r1_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[0])); - - uint32x4_t r2_0 = state0; - uint32x4_t r2_1 = state1; - uint32x4_t r2_2 = state2; - uint32x4_t r2_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[1])); - - uint32x4_t r3_0 = state0; - uint32x4_t r3_1 = state1; - uint32x4_t r3_2 = state2; - uint32x4_t r3_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r0_3), CTRS[2])); - - for(size_t i = 0; i != rounds / 2; ++i) - { - r0_0 = vaddq_u32(r0_0, r0_1); - r1_0 = vaddq_u32(r1_0, r1_1); - r2_0 = vaddq_u32(r2_0, r2_1); - r3_0 = vaddq_u32(r3_0, r3_1); - - r0_3 = veorq_u32(r0_3, r0_0); - r1_3 = veorq_u32(r1_3, r1_0); - r2_3 = veorq_u32(r2_3, r2_0); - r3_3 = veorq_u32(r3_3, r3_0); - - r0_3 = RotateLeft<16>(r0_3); - r1_3 = RotateLeft<16>(r1_3); - r2_3 = RotateLeft<16>(r2_3); - r3_3 = RotateLeft<16>(r3_3); - - r0_2 = vaddq_u32(r0_2, r0_3); - r1_2 = vaddq_u32(r1_2, r1_3); - r2_2 = vaddq_u32(r2_2, r2_3); - r3_2 = vaddq_u32(r3_2, r3_3); - - r0_1 = veorq_u32(r0_1, r0_2); - r1_1 = veorq_u32(r1_1, r1_2); - r2_1 = veorq_u32(r2_1, r2_2); - r3_1 = veorq_u32(r3_1, r3_2); - - r0_1 = RotateLeft<12>(r0_1); - r1_1 = RotateLeft<12>(r1_1); - r2_1 = RotateLeft<12>(r2_1); - r3_1 = RotateLeft<12>(r3_1); - - r0_0 = vaddq_u32(r0_0, r0_1); - r1_0 = vaddq_u32(r1_0, r1_1); - r2_0 = vaddq_u32(r2_0, r2_1); - r3_0 = vaddq_u32(r3_0, r3_1); - - r0_3 = veorq_u32(r0_3, r0_0); - r1_3 = veorq_u32(r1_3, r1_0); - r2_3 = veorq_u32(r2_3, r2_0); - r3_3 = veorq_u32(r3_3, r3_0); - - r0_3 = RotateLeft<8>(r0_3); - r1_3 = RotateLeft<8>(r1_3); - r2_3 = RotateLeft<8>(r2_3); - r3_3 = RotateLeft<8>(r3_3); - - r0_2 = vaddq_u32(r0_2, r0_3); - r1_2 = vaddq_u32(r1_2, r1_3); - r2_2 = vaddq_u32(r2_2, r2_3); - r3_2 = vaddq_u32(r3_2, r3_3); - - r0_1 = veorq_u32(r0_1, r0_2); - r1_1 = veorq_u32(r1_1, r1_2); - r2_1 = veorq_u32(r2_1, r2_2); - r3_1 = veorq_u32(r3_1, r3_2); - - r0_1 = RotateLeft<7>(r0_1); - r1_1 = RotateLeft<7>(r1_1); - r2_1 = RotateLeft<7>(r2_1); - r3_1 = RotateLeft<7>(r3_1); - - r0_1 = Shuffle<1>(r0_1); - r0_2 = Shuffle<2>(r0_2); - r0_3 = Shuffle<3>(r0_3); - - r1_1 = Shuffle<1>(r1_1); - r1_2 = Shuffle<2>(r1_2); - r1_3 = Shuffle<3>(r1_3); - - r2_1 = Shuffle<1>(r2_1); - r2_2 = Shuffle<2>(r2_2); - r2_3 = Shuffle<3>(r2_3); - - r3_1 = Shuffle<1>(r3_1); - r3_2 = Shuffle<2>(r3_2); - r3_3 = Shuffle<3>(r3_3); - - r0_0 = vaddq_u32(r0_0, r0_1); - r1_0 = vaddq_u32(r1_0, r1_1); - r2_0 = vaddq_u32(r2_0, r2_1); - r3_0 = vaddq_u32(r3_0, r3_1); - - r0_3 = veorq_u32(r0_3, r0_0); - r1_3 = veorq_u32(r1_3, r1_0); - r2_3 = veorq_u32(r2_3, r2_0); - r3_3 = veorq_u32(r3_3, r3_0); - - r0_3 = RotateLeft<16>(r0_3); - r1_3 = RotateLeft<16>(r1_3); - r2_3 = RotateLeft<16>(r2_3); - r3_3 = RotateLeft<16>(r3_3); - - r0_2 = vaddq_u32(r0_2, r0_3); - r1_2 = vaddq_u32(r1_2, r1_3); - r2_2 = vaddq_u32(r2_2, r2_3); - r3_2 = vaddq_u32(r3_2, r3_3); - - r0_1 = veorq_u32(r0_1, r0_2); - r1_1 = veorq_u32(r1_1, r1_2); - r2_1 = veorq_u32(r2_1, r2_2); - r3_1 = veorq_u32(r3_1, r3_2); - - r0_1 = RotateLeft<12>(r0_1); - r1_1 = RotateLeft<12>(r1_1); - r2_1 = RotateLeft<12>(r2_1); - r3_1 = RotateLeft<12>(r3_1); - - r0_0 = vaddq_u32(r0_0, r0_1); - r1_0 = vaddq_u32(r1_0, r1_1); - r2_0 = vaddq_u32(r2_0, r2_1); - r3_0 = vaddq_u32(r3_0, r3_1); - - r0_3 = veorq_u32(r0_3, r0_0); - r1_3 = veorq_u32(r1_3, r1_0); - r2_3 = veorq_u32(r2_3, r2_0); - r3_3 = veorq_u32(r3_3, r3_0); - - r0_3 = RotateLeft<8>(r0_3); - r1_3 = RotateLeft<8>(r1_3); - r2_3 = RotateLeft<8>(r2_3); - r3_3 = RotateLeft<8>(r3_3); - - r0_2 = vaddq_u32(r0_2, r0_3); - r1_2 = vaddq_u32(r1_2, r1_3); - r2_2 = vaddq_u32(r2_2, r2_3); - r3_2 = vaddq_u32(r3_2, r3_3); - - r0_1 = veorq_u32(r0_1, r0_2); - r1_1 = veorq_u32(r1_1, r1_2); - r2_1 = veorq_u32(r2_1, r2_2); - r3_1 = veorq_u32(r3_1, r3_2); - - r0_1 = RotateLeft<7>(r0_1); - r1_1 = RotateLeft<7>(r1_1); - r2_1 = RotateLeft<7>(r2_1); - r3_1 = RotateLeft<7>(r3_1); - - r0_1 = Shuffle<3>(r0_1); - r0_2 = Shuffle<2>(r0_2); - r0_3 = Shuffle<1>(r0_3); - - r1_1 = Shuffle<3>(r1_1); - r1_2 = Shuffle<2>(r1_2); - r1_3 = Shuffle<1>(r1_3); - - r2_1 = Shuffle<3>(r2_1); - r2_2 = Shuffle<2>(r2_2); - r2_3 = Shuffle<1>(r2_3); - - r3_1 = Shuffle<3>(r3_1); - r3_2 = Shuffle<2>(r3_2); - r3_3 = Shuffle<1>(r3_3); - } - - r0_0 = vaddq_u32(r0_0, state0); - r0_1 = vaddq_u32(r0_1, state1); - r0_2 = vaddq_u32(r0_2, state2); - r0_3 = vaddq_u32(r0_3, state3); - - r1_0 = vaddq_u32(r1_0, state0); - r1_1 = vaddq_u32(r1_1, state1); - r1_2 = vaddq_u32(r1_2, state2); - r1_3 = vaddq_u32(r1_3, state3); - r1_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r1_3), CTRS[0])); - - r2_0 = vaddq_u32(r2_0, state0); - r2_1 = vaddq_u32(r2_1, state1); - r2_2 = vaddq_u32(r2_2, state2); - r2_3 = vaddq_u32(r2_3, state3); - r2_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r2_3), CTRS[1])); - - r3_0 = vaddq_u32(r3_0, state0); - r3_1 = vaddq_u32(r3_1, state1); - r3_2 = vaddq_u32(r3_2, state2); - r3_3 = vaddq_u32(r3_3, state3); - r3_3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(r3_3), CTRS[2])); - - vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0)); - vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1)); - vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2)); - vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3)); - - vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0)); - vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1)); - vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2)); - vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3)); - - vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0)); - vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1)); - vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2)); - vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3)); - - vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0)); - vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1)); - vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2)); - vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3)); - - state[12] += 4; - if(state[12] < 4) - state[13]++; - } - -} diff --git a/src/lib/stream/chacha/chacha_neon/info.txt b/src/lib/stream/chacha/chacha_neon/info.txt deleted file mode 100644 index 118478b7a..000000000 --- a/src/lib/stream/chacha/chacha_neon/info.txt +++ /dev/null @@ -1,5 +0,0 @@ -<defines> -CHACHA_NEON -> 20181026 -</defines> - -need_isa neon diff --git a/src/lib/stream/chacha/chacha_simd32/chacha_simd32.cpp b/src/lib/stream/chacha/chacha_simd32/chacha_simd32.cpp new file mode 100644 index 000000000..6cd6acd0d --- /dev/null +++ b/src/lib/stream/chacha/chacha_simd32/chacha_simd32.cpp @@ -0,0 +1,205 @@ +/* +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/chacha.h> +#include <botan/internal/simd_32.h> + +namespace Botan { + +//static +void ChaCha::chacha_simd32_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) + { + BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); + const SIMD_4x32 CTR0 = SIMD_4x32(0, 1, 2, 3); + + const uint32_t C = 0xFFFFFFFF - state[12]; + const SIMD_4x32 CTR1 = SIMD_4x32(0, C < 1, C < 2, C < 3); + + SIMD_4x32 R00 = SIMD_4x32::splat(state[ 0]); + SIMD_4x32 R01 = SIMD_4x32::splat(state[ 1]); + SIMD_4x32 R02 = SIMD_4x32::splat(state[ 2]); + SIMD_4x32 R03 = SIMD_4x32::splat(state[ 3]); + SIMD_4x32 R04 = SIMD_4x32::splat(state[ 4]); + SIMD_4x32 R05 = SIMD_4x32::splat(state[ 5]); + SIMD_4x32 R06 = SIMD_4x32::splat(state[ 6]); + SIMD_4x32 R07 = SIMD_4x32::splat(state[ 7]); + SIMD_4x32 R08 = SIMD_4x32::splat(state[ 8]); + SIMD_4x32 R09 = SIMD_4x32::splat(state[ 9]); + SIMD_4x32 R10 = SIMD_4x32::splat(state[10]); + SIMD_4x32 R11 = SIMD_4x32::splat(state[11]); + SIMD_4x32 R12 = SIMD_4x32::splat(state[12]) + CTR0; + SIMD_4x32 R13 = SIMD_4x32::splat(state[13]) + CTR1; + SIMD_4x32 R14 = SIMD_4x32::splat(state[14]); + SIMD_4x32 R15 = SIMD_4x32::splat(state[15]); + + for(size_t r = 0; r != rounds / 2; ++r) + { + R00 += R04; + R01 += R05; + R02 += R06; + R03 += R07; + + R12 ^= R00; + R13 ^= R01; + R14 ^= R02; + R15 ^= R03; + + R12 = R12.rotl<16>(); + R13 = R13.rotl<16>(); + R14 = R14.rotl<16>(); + R15 = R15.rotl<16>(); + + R08 += R12; + R09 += R13; + R10 += R14; + R11 += R15; + + R04 ^= R08; + R05 ^= R09; + R06 ^= R10; + R07 ^= R11; + + R04 = R04.rotl<12>(); + R05 = R05.rotl<12>(); + R06 = R06.rotl<12>(); + R07 = R07.rotl<12>(); + + R00 += R04; + R01 += R05; + R02 += R06; + R03 += R07; + + R12 ^= R00; + R13 ^= R01; + R14 ^= R02; + R15 ^= R03; + + R12 = R12.rotl<8>(); + R13 = R13.rotl<8>(); + R14 = R14.rotl<8>(); + R15 = R15.rotl<8>(); + + R08 += R12; + R09 += R13; + R10 += R14; + R11 += R15; + + R04 ^= R08; + R05 ^= R09; + R06 ^= R10; + R07 ^= R11; + + R04 = R04.rotl<7>(); + R05 = R05.rotl<7>(); + R06 = R06.rotl<7>(); + R07 = R07.rotl<7>(); + + R00 += R05; + R01 += R06; + R02 += R07; + R03 += R04; + + R15 ^= R00; + R12 ^= R01; + R13 ^= R02; + R14 ^= R03; + + R15 = R15.rotl<16>(); + R12 = R12.rotl<16>(); + R13 = R13.rotl<16>(); + R14 = R14.rotl<16>(); + + R10 += R15; + R11 += R12; + R08 += R13; + R09 += R14; + + R05 ^= R10; + R06 ^= R11; + R07 ^= R08; + R04 ^= R09; + + R05 = R05.rotl<12>(); + R06 = R06.rotl<12>(); + R07 = R07.rotl<12>(); + R04 = R04.rotl<12>(); + + R00 += R05; + R01 += R06; + R02 += R07; + R03 += R04; + + R15 ^= R00; + R12 ^= R01; + R13 ^= R02; + R14 ^= R03; + + R15 = R15.rotl<8>(); + R12 = R12.rotl<8>(); + R13 = R13.rotl<8>(); + R14 = R14.rotl<8>(); + + R10 += R15; + R11 += R12; + R08 += R13; + R09 += R14; + + R05 ^= R10; + R06 ^= R11; + R07 ^= R08; + R04 ^= R09; + + R05 = R05.rotl<7>(); + R06 = R06.rotl<7>(); + R07 = R07.rotl<7>(); + R04 = R04.rotl<7>(); + } + + R00 += SIMD_4x32::splat(state[0]); + R01 += SIMD_4x32::splat(state[1]); + R02 += SIMD_4x32::splat(state[2]); + R03 += SIMD_4x32::splat(state[3]); + R04 += SIMD_4x32::splat(state[4]); + R05 += SIMD_4x32::splat(state[5]); + R06 += SIMD_4x32::splat(state[6]); + R07 += SIMD_4x32::splat(state[7]); + R08 += SIMD_4x32::splat(state[8]); + R09 += SIMD_4x32::splat(state[9]); + R10 += SIMD_4x32::splat(state[10]); + R11 += SIMD_4x32::splat(state[11]); + R12 += SIMD_4x32::splat(state[12]) + CTR0; + R13 += SIMD_4x32::splat(state[13]) + CTR1; + R14 += SIMD_4x32::splat(state[14]); + R15 += SIMD_4x32::splat(state[15]); + + SIMD_4x32::transpose(R00, R01, R02, R03); + SIMD_4x32::transpose(R04, R05, R06, R07); + SIMD_4x32::transpose(R08, R09, R10, R11); + SIMD_4x32::transpose(R12, R13, R14, R15); + + R00.store_le(output + 0*16); + R04.store_le(output + 1*16); + R08.store_le(output + 2*16); + R12.store_le(output + 3*16); + R01.store_le(output + 4*16); + R05.store_le(output + 5*16); + R09.store_le(output + 6*16); + R13.store_le(output + 7*16); + R02.store_le(output + 8*16); + R06.store_le(output + 9*16); + R10.store_le(output + 10*16); + R14.store_le(output + 11*16); + R03.store_le(output + 12*16); + R07.store_le(output + 13*16); + R11.store_le(output + 14*16); + R15.store_le(output + 15*16); + + state[12] += 4; + if(state[12] < 4) + state[13]++; + } + +} diff --git a/src/lib/stream/chacha/chacha_simd32/info.txt b/src/lib/stream/chacha/chacha_simd32/info.txt new file mode 100644 index 000000000..1ec932f81 --- /dev/null +++ b/src/lib/stream/chacha/chacha_simd32/info.txt @@ -0,0 +1,7 @@ +<defines> +CHACHA_SIMD32 -> 20181104 +</defines> + +<requires> +simd +</requires> diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp deleted file mode 100644 index 121c92a3e..000000000 --- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp +++ /dev/null @@ -1,257 +0,0 @@ -/* -* SSE2 ChaCha -* (C) 2016 Jack Lloyd -* -* Botan is released under the Simplified BSD License (see license.txt) -*/ - -#include <botan/chacha.h> -#include <emmintrin.h> - -namespace Botan { - -//static -BOTAN_FUNC_ISA("sse2") -void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) - { - BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); - - const __m128i* state_mm = reinterpret_cast<const __m128i*>(state); - __m128i* output_mm = reinterpret_cast<__m128i*>(output); - - __m128i state0 = _mm_loadu_si128(state_mm); - __m128i state1 = _mm_loadu_si128(state_mm + 1); - __m128i state2 = _mm_loadu_si128(state_mm + 2); - __m128i state3 = _mm_loadu_si128(state_mm + 3); - - // TODO: try transposing, which would avoid the permutations each round - -#define mm_rotl(r, n) \ - _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n)) - - __m128i r0_0 = state0; - __m128i r0_1 = state1; - __m128i r0_2 = state2; - __m128i r0_3 = state3; - - __m128i r1_0 = state0; - __m128i r1_1 = state1; - __m128i r1_2 = state2; - __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); - - __m128i r2_0 = state0; - __m128i r2_1 = state1; - __m128i r2_2 = state2; - __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); - - __m128i r3_0 = state0; - __m128i r3_1 = state1; - __m128i r3_2 = state2; - __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); - - for(size_t r = 0; r != rounds / 2; ++r) - { - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 16); - r1_3 = mm_rotl(r1_3, 16); - r2_3 = mm_rotl(r2_3, 16); - r3_3 = mm_rotl(r3_3, 16); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 12); - r1_1 = mm_rotl(r1_1, 12); - r2_1 = mm_rotl(r2_1, 12); - r3_1 = mm_rotl(r3_1, 12); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 8); - r1_3 = mm_rotl(r1_3, 8); - r2_3 = mm_rotl(r2_3, 8); - r3_3 = mm_rotl(r3_3, 8); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 7); - r1_1 = mm_rotl(r1_1, 7); - r2_1 = mm_rotl(r2_1, 7); - r3_1 = mm_rotl(r3_1, 7); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 16); - r1_3 = mm_rotl(r1_3, 16); - r2_3 = mm_rotl(r2_3, 16); - r3_3 = mm_rotl(r3_3, 16); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 12); - r1_1 = mm_rotl(r1_1, 12); - r2_1 = mm_rotl(r2_1, 12); - r3_1 = mm_rotl(r3_1, 12); - - r0_0 = _mm_add_epi32(r0_0, r0_1); - r1_0 = _mm_add_epi32(r1_0, r1_1); - r2_0 = _mm_add_epi32(r2_0, r2_1); - r3_0 = _mm_add_epi32(r3_0, r3_1); - - r0_3 = _mm_xor_si128(r0_3, r0_0); - r1_3 = _mm_xor_si128(r1_3, r1_0); - r2_3 = _mm_xor_si128(r2_3, r2_0); - r3_3 = _mm_xor_si128(r3_3, r3_0); - - r0_3 = mm_rotl(r0_3, 8); - r1_3 = mm_rotl(r1_3, 8); - r2_3 = mm_rotl(r2_3, 8); - r3_3 = mm_rotl(r3_3, 8); - - r0_2 = _mm_add_epi32(r0_2, r0_3); - r1_2 = _mm_add_epi32(r1_2, r1_3); - r2_2 = _mm_add_epi32(r2_2, r2_3); - r3_2 = _mm_add_epi32(r3_2, r3_3); - - r0_1 = _mm_xor_si128(r0_1, r0_2); - r1_1 = _mm_xor_si128(r1_1, r1_2); - r2_1 = _mm_xor_si128(r2_1, r2_2); - r3_1 = _mm_xor_si128(r3_1, r3_2); - - r0_1 = mm_rotl(r0_1, 7); - r1_1 = mm_rotl(r1_1, 7); - r2_1 = mm_rotl(r2_1, 7); - r3_1 = mm_rotl(r3_1, 7); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); - } - - r0_0 = _mm_add_epi32(r0_0, state0); - r0_1 = _mm_add_epi32(r0_1, state1); - r0_2 = _mm_add_epi32(r0_2, state2); - r0_3 = _mm_add_epi32(r0_3, state3); - - r1_0 = _mm_add_epi32(r1_0, state0); - r1_1 = _mm_add_epi32(r1_1, state1); - r1_2 = _mm_add_epi32(r1_2, state2); - r1_3 = _mm_add_epi32(r1_3, state3); - r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); - - r2_0 = _mm_add_epi32(r2_0, state0); - r2_1 = _mm_add_epi32(r2_1, state1); - r2_2 = _mm_add_epi32(r2_2, state2); - r2_3 = _mm_add_epi32(r2_3, state3); - r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); - - r3_0 = _mm_add_epi32(r3_0, state0); - r3_1 = _mm_add_epi32(r3_1, state1); - r3_2 = _mm_add_epi32(r3_2, state2); - r3_3 = _mm_add_epi32(r3_3, state3); - r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); - - _mm_storeu_si128(output_mm + 0, r0_0); - _mm_storeu_si128(output_mm + 1, r0_1); - _mm_storeu_si128(output_mm + 2, r0_2); - _mm_storeu_si128(output_mm + 3, r0_3); - - _mm_storeu_si128(output_mm + 4, r1_0); - _mm_storeu_si128(output_mm + 5, r1_1); - _mm_storeu_si128(output_mm + 6, r1_2); - _mm_storeu_si128(output_mm + 7, r1_3); - - _mm_storeu_si128(output_mm + 8, r2_0); - _mm_storeu_si128(output_mm + 9, r2_1); - _mm_storeu_si128(output_mm + 10, r2_2); - _mm_storeu_si128(output_mm + 11, r2_3); - - _mm_storeu_si128(output_mm + 12, r3_0); - _mm_storeu_si128(output_mm + 13, r3_1); - _mm_storeu_si128(output_mm + 14, r3_2); - _mm_storeu_si128(output_mm + 15, r3_3); - -#undef mm_rotl - - state[12] += 4; - if(state[12] < 4) - state[13]++; - } - -} diff --git a/src/lib/stream/chacha/chacha_sse2/info.txt b/src/lib/stream/chacha/chacha_sse2/info.txt deleted file mode 100644 index 20e0eb03f..000000000 --- a/src/lib/stream/chacha/chacha_sse2/info.txt +++ /dev/null @@ -1,5 +0,0 @@ -<defines> -CHACHA_SSE2 -> 20160831 -</defines> - -need_isa sse2 |