aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/stream
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2016-09-01 12:56:43 -0400
committer Jack Lloyd <[email protected]> 2016-09-01 13:20:05 -0400
commit e358acf9e3fd74e7dc307a203977652ca3a9a3c9 (patch)
tree ef184fb090053582fa3eb4abaee264ae4a17849f /src/lib/stream
parent 858e3be10396e082901b612ee8c5e18cd3e47286 (diff)
ChaCha 4 ways
Diffstat (limited to 'src/lib/stream')
-rw-r--r-- src/lib/stream/chacha/chacha.cpp 156
-rw-r--r-- src/lib/stream/chacha/chacha.h 6
-rw-r--r-- src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp 120
3 files changed, 153 insertions, 129 deletions
diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp
index 97b6465f9..fa8f48142 100644
--- a/src/lib/stream/chacha/chacha.cpp
+++ b/src/lib/stream/chacha/chacha.cpp
@@ -18,61 +18,85 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds)
}
//static
-void ChaCha::chacha(byte output[64], const u32bit input[16], size_t rounds)
+void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds)
{
BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
- #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
+#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
if(CPUID::has_sse2())
{
- return ChaCha::chacha_sse2(output, input, rounds);
+ return ChaCha::chacha_sse2_x4(output, input, rounds);
}
- #endif
-
- u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
- x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
- x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
- x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
-
-#define CHACHA_QUARTER_ROUND(a, b, c, d) \
- do { \
- a += b; d ^= a; d = rotate_left(d, 16); \
- c += d; b ^= c; b = rotate_left(b, 12); \
- a += b; d ^= a; d = rotate_left(d, 8); \
- c += d; b ^= c; b = rotate_left(b, 7); \
- } while(0)
-
- for(size_t i = 0; i != rounds / 2; ++i)
+#endif
+
+ // TODO interleave rounds
+ for(size_t i = 0; i != 4; ++i)
{
- CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
- CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
- CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
- CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
-
- CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
- CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
- CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
- CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
- }
+ u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
+ x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
+ x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
+ x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+#define CHACHA_QUARTER_ROUND(a, b, c, d) \
+ do { \
+ a += b; d ^= a; d = rotate_left(d, 16); \
+ c += d; b ^= c; b = rotate_left(b, 12); \
+ a += b; d ^= a; d = rotate_left(d, 8); \
+ c += d; b ^= c; b = rotate_left(b, 7); \
+ } while(0)
+
+ for(size_t i = 0; i != rounds / 2; ++i)
+ {
+ CHACHA_QUARTER_ROUND(x00, x04, x08, x12);
+ CHACHA_QUARTER_ROUND(x01, x05, x09, x13);
+ CHACHA_QUARTER_ROUND(x02, x06, x10, x14);
+ CHACHA_QUARTER_ROUND(x03, x07, x11, x15);
+
+ CHACHA_QUARTER_ROUND(x00, x05, x10, x15);
+ CHACHA_QUARTER_ROUND(x01, x06, x11, x12);
+ CHACHA_QUARTER_ROUND(x02, x07, x08, x13);
+ CHACHA_QUARTER_ROUND(x03, x04, x09, x14);
+ }
#undef CHACHA_QUARTER_ROUND
- store_le(x00 + input[ 0], output + 4 * 0);
- store_le(x01 + input[ 1], output + 4 * 1);
- store_le(x02 + input[ 2], output + 4 * 2);
- store_le(x03 + input[ 3], output + 4 * 3);
- store_le(x04 + input[ 4], output + 4 * 4);
- store_le(x05 + input[ 5], output + 4 * 5);
- store_le(x06 + input[ 6], output + 4 * 6);
- store_le(x07 + input[ 7], output + 4 * 7);
- store_le(x08 + input[ 8], output + 4 * 8);
- store_le(x09 + input[ 9], output + 4 * 9);
- store_le(x10 + input[10], output + 4 * 10);
- store_le(x11 + input[11], output + 4 * 11);
- store_le(x12 + input[12], output + 4 * 12);
- store_le(x13 + input[13], output + 4 * 13);
- store_le(x14 + input[14], output + 4 * 14);
- store_le(x15 + input[15], output + 4 * 15);
+ x00 += input[0];
+ x01 += input[1];
+ x02 += input[2];
+ x03 += input[3];
+ x04 += input[4];
+ x05 += input[5];
+ x06 += input[6];
+ x07 += input[7];
+ x08 += input[8];
+ x09 += input[9];
+ x10 += input[10];
+ x11 += input[11];
+ x12 += input[12];
+ x13 += input[13];
+ x14 += input[14];
+ x15 += input[15];
+
+ store_le(x00, output + 64 * i + 4 * 0);
+ store_le(x01, output + 64 * i + 4 * 1);
+ store_le(x02, output + 64 * i + 4 * 2);
+ store_le(x03, output + 64 * i + 4 * 3);
+ store_le(x04, output + 64 * i + 4 * 4);
+ store_le(x05, output + 64 * i + 4 * 5);
+ store_le(x06, output + 64 * i + 4 * 6);
+ store_le(x07, output + 64 * i + 4 * 7);
+ store_le(x08, output + 64 * i + 4 * 8);
+ store_le(x09, output + 64 * i + 4 * 9);
+ store_le(x10, output + 64 * i + 4 * 10);
+ store_le(x11, output + 64 * i + 4 * 11);
+ store_le(x12, output + 64 * i + 4 * 12);
+ store_le(x13, output + 64 * i + 4 * 13);
+ store_le(x14, output + 64 * i + 4 * 14);
+ store_le(x15, output + 64 * i + 4 * 15);
+
+ input[12]++;
+ input[13] += input[12] < i; // carry?
+ }
}
/*
@@ -86,11 +110,7 @@ void ChaCha::cipher(const byte in[], byte out[], size_t length)
length -= (m_buffer.size() - m_position);
in += (m_buffer.size() - m_position);
out += (m_buffer.size() - m_position);
- chacha_sse2(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -112,8 +132,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
const u32bit* CONSTANTS = (length == 16) ? TAU : SIGMA;
+ // Repeat the key if 128 bits
+ const byte* key2 = (length == 32) ? key + 16 : key;
+
+ m_position = 0;
m_state.resize(16);
- m_buffer.resize(64);
+ m_buffer.resize(4*64);
m_state[0] = CONSTANTS[0];
m_state[1] = CONSTANTS[1];
@@ -125,16 +149,12 @@ void ChaCha::key_schedule(const byte key[], size_t length)
m_state[6] = load_le<u32bit>(key, 2);
m_state[7] = load_le<u32bit>(key, 3);
- if(length == 32)
- key += 16;
-
- m_state[8] = load_le<u32bit>(key, 0);
- m_state[9] = load_le<u32bit>(key, 1);
- m_state[10] = load_le<u32bit>(key, 2);
- m_state[11] = load_le<u32bit>(key, 3);
-
- m_position = 0;
+ m_state[8] = load_le<u32bit>(key2, 0);
+ m_state[9] = load_le<u32bit>(key2, 1);
+ m_state[10] = load_le<u32bit>(key2, 2);
+ m_state[11] = load_le<u32bit>(key2, 3);
+ // Default all-zero IV
const byte ZERO[8] = { 0 };
set_iv(ZERO, sizeof(ZERO));
}
@@ -159,10 +179,7 @@ void ChaCha::set_iv(const byte iv[], size_t length)
m_state[15] = load_le<u32bit>(iv, 2);
}
- chacha(m_buffer.data(), m_state.data(), m_rounds);
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
-
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
m_position = 0;
}
@@ -185,9 +202,8 @@ void ChaCha::seek(u64bit offset)
throw Invalid_State("You have to setup the stream cipher (key and iv)");
}
- m_position = offset % m_buffer.size();
-
- u64bit counter = offset / m_buffer.size();
+ // Find the block offset
+ u64bit counter = offset / 64;
byte out[8];
@@ -196,9 +212,7 @@ void ChaCha::seek(u64bit offset)
m_state[12] = load_le<u32bit>(out, 0);
m_state[13] += load_le<u32bit>(out, 1);
- chacha(m_buffer.data(), m_state.data(), m_rounds);
-
- ++m_state[12];
- m_state[13] += (m_state[12] == 0);
+ chacha_x4(m_buffer.data(), m_state.data(), m_rounds);
+ m_position = offset % 64;
}
}
diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h
index ab28f9563..34b8bbb87 100644
--- a/src/lib/stream/chacha/chacha.h
+++ b/src/lib/stream/chacha/chacha.h
@@ -47,10 +47,12 @@ class BOTAN_DLL ChaCha final : public StreamCipher
private:
void key_schedule(const byte key[], size_t key_len) override;
- void chacha(byte output[64], const u32bit input[16], size_t rounds);
+ void incr_state_counter(size_t howmany);
+
+ void chacha_x4(byte output[64*4], u32bit state[16], size_t rounds);
#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
- void chacha_sse2(byte output[64], const u32bit input[16], size_t rounds);
+ void chacha_sse2_x4(byte output[64*4], u32bit state[16], size_t rounds);
#endif
size_t m_rounds;
diff --git a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
index aa1ca45ff..34376d84c 100644
--- a/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
+++ b/src/lib/stream/chacha/chacha_sse2/chacha_sse2.cpp
@@ -11,7 +11,7 @@
namespace Botan {
//static
-void ChaCha::chacha_sse2(byte output[64], const u32bit input[16], size_t rounds)
+void ChaCha::chacha_sse2_x4(byte output[64], u32bit input[16], size_t rounds)
{
BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
@@ -21,70 +21,78 @@ void ChaCha::chacha_sse2(byte output[64], const u32bit input[16], size_t rounds)
const __m128i input1 = _mm_loadu_si128(input_mm + 1);
const __m128i input2 = _mm_loadu_si128(input_mm + 2);
const __m128i input3 = _mm_loadu_si128(input_mm + 3);
-
- __m128i r0 = input0;
- __m128i r1 = input1;
- __m128i r2 = input2;
- __m128i r3 = input3;
+ // TODO: interleave!
#define mm_rotl(r, n) \
_mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
- for(size_t i = 0; i != rounds / 2; ++i)
+ for(size_t i = 0; i != 4; ++i)
{
- r0 = _mm_add_epi32(r0, r1);
- r3 = _mm_xor_si128(r3, r0);
- r3 = mm_rotl(r3, 16);
-
- r2 = _mm_add_epi32(r2, r3);
- r1 = _mm_xor_si128(r1, r2);
- r1 = mm_rotl(r1, 12);
-
- r0 = _mm_add_epi32(r0, r1);
- r3 = _mm_xor_si128(r3, r0);
- r3 = mm_rotl(r3, 8);
-
- r2 = _mm_add_epi32(r2, r3);
- r1 = _mm_xor_si128(r1, r2);
- r1 = mm_rotl(r1, 7);
-
- r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
- r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
- r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
-
- r0 = _mm_add_epi32(r0, r1);
- r3 = _mm_xor_si128(r3, r0);
- r3 = mm_rotl(r3, 16);
-
- r2 = _mm_add_epi32(r2, r3);
- r1 = _mm_xor_si128(r1, r2);
- r1 = mm_rotl(r1, 12);
-
- r0 = _mm_add_epi32(r0, r1);
- r3 = _mm_xor_si128(r3, r0);
- r3 = mm_rotl(r3, 8);
-
- r2 = _mm_add_epi32(r2, r3);
- r1 = _mm_xor_si128(r1, r2);
- r1 = mm_rotl(r1, 7);
-
- r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
- r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
- r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
+ __m128i r0 = input0;
+ __m128i r1 = input1;
+ __m128i r2 = input2;
+ __m128i r3 = input3;
+
+ r3 = _mm_add_epi64(r3, _mm_set_epi64x(0, i));
+
+ for(size_t r = 0; r != rounds / 2; ++r)
+ {
+ r0 = _mm_add_epi32(r0, r1);
+ r3 = _mm_xor_si128(r3, r0);
+ r3 = mm_rotl(r3, 16);
+
+ r2 = _mm_add_epi32(r2, r3);
+ r1 = _mm_xor_si128(r1, r2);
+ r1 = mm_rotl(r1, 12);
+
+ r0 = _mm_add_epi32(r0, r1);
+ r3 = _mm_xor_si128(r3, r0);
+ r3 = mm_rotl(r3, 8);
+
+ r2 = _mm_add_epi32(r2, r3);
+ r1 = _mm_xor_si128(r1, r2);
+ r1 = mm_rotl(r1, 7);
+
+ r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0, 3, 2, 1));
+ r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(2, 1, 0, 3));
+
+ r0 = _mm_add_epi32(r0, r1);
+ r3 = _mm_xor_si128(r3, r0);
+ r3 = mm_rotl(r3, 16);
+
+ r2 = _mm_add_epi32(r2, r3);
+ r1 = _mm_xor_si128(r1, r2);
+ r1 = mm_rotl(r1, 12);
+
+ r0 = _mm_add_epi32(r0, r1);
+ r3 = _mm_xor_si128(r3, r0);
+ r3 = mm_rotl(r3, 8);
+
+ r2 = _mm_add_epi32(r2, r3);
+ r1 = _mm_xor_si128(r1, r2);
+ r1 = mm_rotl(r1, 7);
+
+ r1 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 1, 0, 3));
+ r2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(1, 0, 3, 2));
+ r3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0, 3, 2, 1));
+ }
+
+ r0 = _mm_add_epi32(r0, input0);
+ r1 = _mm_add_epi32(r1, input1);
+ r2 = _mm_add_epi32(r2, input2);
+ r3 = _mm_add_epi32(r3, input3);
+ r3 = _mm_add_epi64(r3, _mm_set_epi64x(0, i));
+
+ __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+ _mm_storeu_si128(output_mm + 4*i , r0);
+ _mm_storeu_si128(output_mm + 4*i + 1, r1);
+ _mm_storeu_si128(output_mm + 4*i + 2, r2);
+ _mm_storeu_si128(output_mm + 4*i + 3, r3);
}
#undef mm_rotl
- r0 = _mm_add_epi32(r0, input0);
- r1 = _mm_add_epi32(r1, input1);
- r2 = _mm_add_epi32(r2, input2);
- r3 = _mm_add_epi32(r3, input3);
-
- __m128i* output_mm = reinterpret_cast<__m128i*>(output);
- _mm_storeu_si128(output_mm , r0);
- _mm_storeu_si128(output_mm + 1, r1);
- _mm_storeu_si128(output_mm + 2, r2);
- _mm_storeu_si128(output_mm + 3, r3);
}
}