diff options
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/stream/chacha/chacha.cpp | 156 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha.h | 6 | ||||
-rw-r--r-- | src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp | 120 |
3 files changed, 153 insertions, 129 deletions
diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp
index 97b6465f9..fa8f48142 100644
--- a/src/lib/stream/chacha/chacha.cpp
+++ b/src/lib/stream/chacha/chacha.cpp
@@ -18,61 +18,85 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds)
    }
 
 //static
-void ChaCha::chacha(byte output[64], const u32bit input[16], size_t rounds)
+void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds)
    {
    BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
 
-   #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
+#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
    if(CPUID::has_sse2())
       {
-      return ChaCha::chacha_sse2(output, input, rounds);
+      return ChaCha::chacha_sse2_x4(output, input, rounds);
       }
-   #endif
-
-   u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
-          x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
-          x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
-          x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
-
-#define CHACHA_QUARTER_ROUND(a, b, c, d)       \
-   do {                                        \
-   a += b; d ^= a; d = rotate_left(d, 16);     \
-   c += d; b ^= c; b = rotate_left(b, 12);     \
-   a += b; d ^= a; d = rotate_left(d, 8);      \
-   c += d; b ^= c; b = rotate_left(b, 7);      \
-   } while(0)
-
-   for(size_t i = 0; i != rounds / 2; ++i)
+#endif
+
+   // TODO interleave rounds
+   for(size_t i = 0; i != 4; ++i)
       {
-      CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
-      CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
-      CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
-      CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
-
-      CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
-      CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
-      CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
-      CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
-      }
+      u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
+             x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
+             x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
+             x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+#define CHACHA_QUARTER_ROUND(a, b, c, d)       \
+      do {                                     \
+      a += b; d ^= a; d = rotate_left(d, 16);  \
+      c += d; b ^= c; b = rotate_left(b, 12);  \
+      a += b; d ^= a; d = rotate_left(d, 8);   \
+      c += d; b ^= c; b = rotate_left(b, 7);   \
+      } while(0)
+
+      for(size_t r = 0; r != rounds / 2; ++r) // 'r' so the block index 'i' is not shadowed
+         {
+         CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
+         CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
+         CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
+         CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
+
+         CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
+         CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
+         CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
+         CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+         }
 
 #undef CHACHA_QUARTER_ROUND
 
-   store_le(x00 + input[ 0], output + 4 * 0);
-   store_le(x01 + input[ 1], output + 4 * 1);
-   store_le(x02 + input[ 2], output + 4 * 2);
-   store_le(x03 + input[ 3], output + 4 * 3);
-   store_le(x04 + input[ 4], output + 4 * 4);
-   store_le(x05 + input[ 5], output + 4 * 5);
-   store_le(x06 + input[ 6], output + 4 * 6);
-   store_le(x07 + input[ 7], output + 4 * 7);
-   store_le(x08 + input[ 8], output + 4 * 8);
-   store_le(x09 + input[ 9], output + 4 * 9);
-   store_le(x10 + input[10], output + 4 * 10);
-   store_le(x11 + input[11], output + 4 * 11);
-   store_le(x12 + input[12], output + 4 * 12);
-   store_le(x13 + input[13], output + 4 * 13);
-   store_le(x14 + input[14], output + 4 * 14);
-   store_le(x15 + input[15], output + 4 * 15);
+      x00 += input[0];
+      x01 += input[1];
+      x02 += input[2];
+      x03 += input[3];
+      x04 += input[4];
+      x05 += input[5];
+      x06 += input[6];
+      x07 += input[7];
+      x08 += input[8];
+      x09 += input[9];
+      x10 += input[10];
+      x11 += input[11];
+      x12 += input[12];
+      x13 += input[13];
+      x14 += input[14];
+      x15 += input[15];
+
+      store_le(x00, output + 64 * i + 4 * 0);
+      store_le(x01, output + 64 * i + 4 * 1);
+      store_le(x02, output + 64 * i + 4 * 2);
+      store_le(x03, output + 64 * i + 4 * 3);
+      store_le(x04, output + 64 * i + 4 * 4);
+      store_le(x05, output + 64 * i + 4 * 5);
+      store_le(x06, output + 64 * i + 4 * 6);
+      store_le(x07, output + 64 * i + 4 * 7);
+      store_le(x08, output + 64 * i + 4 * 8);
+      store_le(x09, output + 64 * i + 4 * 9);
+      store_le(x10, output + 64 * i + 4 * 10);
+      store_le(x11, output + 64 * i + 4 * 11);
+      store_le(x12, output + 64 * i + 4 * 12);
+      store_le(x13, output + 64 * i + 4 * 13);
+      store_le(x14, output + 64 * i + 4 * 14);
+      store_le(x15, output + 64 * i + 4 * 15);
+
+      input[12]++;
+      input[13] += (input[12] == 0); // carry into the high counter word only when the low word wrapped
+      }
    }
 
 /*
@@ -86,11 +110,7 @@ void ChaCha::cipher(const byte in[], byte out[], size_t length)
       length -= (m_buffer.size() - m_position);
       in += (m_buffer.size() - m_position);
       out += (m_buffer.size() - m_position);
-      chacha_sse2(m_buffer.data(), m_state.data(), m_rounds);
-
-      ++m_state[12];
-      m_state[13] += (m_state[12] == 0);
-
+      chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
       m_position = 0;
       }
 
@@ -112,8 +132,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
 
    const u32bit* CONSTANTS = (length == 16) ? TAU : SIGMA;
 
+   // Repeat the key if 128 bits
+   const byte* key2 = (length == 32) ? key + 16 : key;
+
+   m_position = 0;
    m_state.resize(16);
-   m_buffer.resize(64);
+   m_buffer.resize(4*64);
 
    m_state[0] = CONSTANTS[0];
    m_state[1] = CONSTANTS[1];
@@ -125,16 +149,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
    m_state[6] = load_le<u32bit>(key, 2);
    m_state[7] = load_le<u32bit>(key, 3);
 
-   if(length == 32)
-      key += 16;
-
-   m_state[8] = load_le<u32bit>(key, 0);
-   m_state[9] = load_le<u32bit>(key, 1);
-   m_state[10] = load_le<u32bit>(key, 2);
-   m_state[11] = load_le<u32bit>(key, 3);
-
-   m_position = 0;
+   m_state[8] = load_le<u32bit>(key2, 0);
+   m_state[9] = load_le<u32bit>(key2, 1);
+   m_state[10] = load_le<u32bit>(key2, 2);
+   m_state[11] = load_le<u32bit>(key2, 3);
 
+   // Default all-zero IV
    const byte ZERO[8] = { 0 };
    set_iv(ZERO, sizeof(ZERO));
    }
@@ -159,10 +179,7 @@ void ChaCha::set_iv(const byte iv[], size_t length)
       m_state[15] = load_le<u32bit>(iv, 2);
       }
 
-   chacha(m_buffer.data(), m_state.data(), m_rounds);
-   ++m_state[12];
-   m_state[13] += (m_state[12] == 0);
-
+   chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
    m_position = 0;
    }
 
@@ -185,9 +202,8 @@ void ChaCha::seek(u64bit offset)
       throw Invalid_State("You have to setup the stream cipher (key and iv)");
       }
 
-   m_position = offset % m_buffer.size();
-
-   u64bit counter = offset / m_buffer.size();
+   // Find the block offset
+   u64bit counter = offset / 64;
 
    byte out[8];
 
@@ -196,9 +212,7 @@ void ChaCha::seek(u64bit offset)
    m_state[12] = load_le<u32bit>(out, 0);
    m_state[13] += load_le<u32bit>(out, 1);
 
-   chacha(m_buffer.data(), m_state.data(), m_rounds);
-
-   ++m_state[12];
-   m_state[13] += (m_state[12] == 0);
+   chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
+   m_position = offset % 64;
    }
 }
diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h
index ab28f9563..34b8bbb87 100644
--- a/src/lib/stream/chacha/chacha.h
+++ b/src/lib/stream/chacha/chacha.h
@@ -47,10 +47,12 @@ class BOTAN_DLL ChaCha final : public StreamCipher
    private:
       void key_schedule(const byte key[], size_t key_len) override;
 
-      void chacha(byte output[64], const u32bit input[16], size_t rounds);
+      void incr_state_counter(size_t howmany);
+
+      void chacha_x4(byte output[64*4], u32bit state[16], size_t rounds);
 
 #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
-      void chacha_sse2(byte output[64], const u32bit input[16], size_t rounds);
+      void chacha_sse2_x4(byte output[64*4], u32bit state[16], size_t rounds);
 #endif
 
       size_t m_rounds;
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
index aa1ca45ff..34376d84c 100644
--- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -11,7 +11,7 @@
 namespace Botan {
 
 //static
-void ChaCha::chacha_sse2(byte output[64], const u32bit input[16], size_t rounds)
+void ChaCha::chacha_sse2_x4(byte output[64*4], u32bit input[16], size_t rounds)
    {
    BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
 
@@ -21,70 +21,78 @@ void ChaCha::chacha_sse2(byte output[64], const u32bit input[16], size_t rounds)
    const __m128i input1 = _mm_loadu_si128(input_mm + 1);
    const __m128i input2 = _mm_loadu_si128(input_mm + 2);
    const __m128i input3 = _mm_loadu_si128(input_mm + 3);
-
-   __m128i r0 = input0;
-   __m128i r1 = input1;
-   __m128i r2 = input2;
-   __m128i r3 = input3;
+   // TODO: interleave!
 
 #define mm_rotl(r, n) \
    _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
 
-   for(size_t i = 0; i != rounds / 2; ++i)
+   for(size_t i = 0; i != 4; ++i)
       {
-      r0 = _mm_add_epi32(r0, r1);
-      r3 = _mm_xor_si128(r3, r0);
-      r3 = mm_rotl(r3, 16);
-
-      r2 = _mm_add_epi32(r2, r3);
-      r1 = _mm_xor_si128(r1, r2);
-      r1 = mm_rotl(r1, 12);
-
-      r0 = _mm_add_epi32(r0, r1);
-      r3 = _mm_xor_si128(r3, r0);
-      r3 = mm_rotl(r3, 8);
-
-      r2 = _mm_add_epi32(r2, r3);
-      r1 = _mm_xor_si128(r1, r2);
-      r1 = mm_rotl(r1, 7);
-
-      r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
-      r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
-      r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
-
-      r0 = _mm_add_epi32(r0, r1);
-      r3 = _mm_xor_si128(r3, r0);
-      r3 = mm_rotl(r3, 16);
-
-      r2 = _mm_add_epi32(r2, r3);
-      r1 = _mm_xor_si128(r1, r2);
-      r1 = mm_rotl(r1, 12);
-
-      r0 = _mm_add_epi32(r0, r1);
-      r3 = _mm_xor_si128(r3, r0);
-      r3 = mm_rotl(r3, 8);
-
-      r2 = _mm_add_epi32(r2, r3);
-      r1 = _mm_xor_si128(r1, r2);
-      r1 = mm_rotl(r1, 7);
-
-      r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
-      r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
-      r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
+      const __m128i input3_i = _mm_add_epi64(input3, _mm_set_epi64x(0, i)); // row 3 with the 64-bit counter advanced by i
+
+      __m128i r0 = input0;
+      __m128i r1 = input1;
+      __m128i r2 = input2;
+      __m128i r3 = input3_i;
+
+      for(size_t r = 0; r != rounds / 2; ++r)
+         {
+         r0 = _mm_add_epi32(r0, r1);
+         r3 = _mm_xor_si128(r3, r0);
+         r3 = mm_rotl(r3, 16);
+
+         r2 = _mm_add_epi32(r2, r3);
+         r1 = _mm_xor_si128(r1, r2);
+         r1 = mm_rotl(r1, 12);
+
+         r0 = _mm_add_epi32(r0, r1);
+         r3 = _mm_xor_si128(r3, r0);
+         r3 = mm_rotl(r3, 8);
+
+         r2 = _mm_add_epi32(r2, r3);
+         r1 = _mm_xor_si128(r1, r2);
+         r1 = mm_rotl(r1, 7);
+
+         r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
+         r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+         r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
+
+         r0 = _mm_add_epi32(r0, r1);
+         r3 = _mm_xor_si128(r3, r0);
+         r3 = mm_rotl(r3, 16);
+
+         r2 = _mm_add_epi32(r2, r3);
+         r1 = _mm_xor_si128(r1, r2);
+         r1 = mm_rotl(r1, 12);
+
+         r0 = _mm_add_epi32(r0, r1);
+         r3 = _mm_xor_si128(r3, r0);
+         r3 = mm_rotl(r3, 8);
+
+         r2 = _mm_add_epi32(r2, r3);
+         r1 = _mm_xor_si128(r1, r2);
+         r1 = mm_rotl(r1, 7);
+
+         r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
+         r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+         r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
+         }
+
+      r0 = _mm_add_epi32(r0, input0);
+      r1 = _mm_add_epi32(r1, input1);
+      r2 = _mm_add_epi32(r2, input2);
+      r3 = _mm_add_epi32(r3, input3_i); // word-wise feed-forward of this block's own input row
+
+      __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+      _mm_storeu_si128(output_mm + 4*i    , r0);
+      _mm_storeu_si128(output_mm + 4*i + 1, r1);
+      _mm_storeu_si128(output_mm + 4*i + 2, r2);
+      _mm_storeu_si128(output_mm + 4*i + 3, r3);
+      input[13] += (++input[12] == 0); // advance the caller's 64-bit block counter by one block, with carry
       }
 
 #undef mm_rotl
 
-   r0 = _mm_add_epi32(r0, input0);
-   r1 = _mm_add_epi32(r1, input1);
-   r2 = _mm_add_epi32(r2, input2);
-   r3 = _mm_add_epi32(r3, input3);
-
-   __m128i* output_mm = reinterpret_cast<__m128i*>(output);
-   _mm_storeu_si128(output_mm    , r0);
-   _mm_storeu_si128(output_mm + 1, r1);
-   _mm_storeu_si128(output_mm + 2, r2);
-   _mm_storeu_si128(output_mm + 3, r3);
    }
 
 }