diff options
author | Jack Lloyd <[email protected]> | 2017-10-26 16:46:31 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-10-26 16:46:31 -0400 |
commit | 9bbca51d2c760db724136695837bc9c1d1a4f4b4 (patch) | |
tree | f54cbf3b8bbf562d3cd869e82bdfba39c047f66a /src | |
parent | 92d1921a56ac121c0b29c06398909fdf37c316dd (diff) |
Blake2b optimizations
Nothing major, but this does improve performance for large buffers, from
910 MB/s to 970 MB/s on Skylake.
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/hash/blake2/blake2b.cpp | 207 | ||||
-rw-r--r-- | src/lib/hash/blake2/blake2b.h | 17 | ||||
-rw-r--r-- | src/lib/hash/mdx_hash/mdx_hash.cpp | 3 |
3 files changed, 94 insertions, 133 deletions
diff --git a/src/lib/hash/blake2/blake2b.cpp b/src/lib/hash/blake2/blake2b.cpp index 79a30de3d..2cc1c3888 100644 --- a/src/lib/hash/blake2/blake2b.cpp +++ b/src/lib/hash/blake2/blake2b.cpp @@ -1,6 +1,7 @@ /* * Blake2b * (C) 2016 cynecx +* (C) 2017 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ @@ -16,37 +17,27 @@ namespace Botan { namespace { -const uint64_t blake2b_IV[BLAKE2B_IVU64COUNT] = { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_IVU64COUNT = 8 }; -const uint64_t blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +const uint64_t blake2b_IV[BLAKE2B_IVU64COUNT] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 }; + } Blake2b::Blake2b(size_t output_bits) : m_output_bits(output_bits), m_buffer(BLAKE2B_BLOCKBYTES), - m_buflen(0), + m_bufpos(0), m_H(BLAKE2B_IVU64COUNT) { - if(output_bits == 0 || output_bits % 8 != 0 - || output_bits / 8 > BLAKE2B_OUTBYTES) + if(output_bits == 0 || output_bits > 512 || 
output_bits % 8 != 0) { throw Invalid_Argument("Bad output bits size for Blake2b"); } @@ -56,145 +47,121 @@ Blake2b::Blake2b(size_t output_bits) : void Blake2b::state_init() { - std::copy(std::begin(blake2b_IV), std::end(blake2b_IV), m_H.begin()); + copy_mem(m_H.data(), blake2b_IV, BLAKE2B_IVU64COUNT); m_H[0] ^= 0x01010000 ^ static_cast<uint8_t>(output_length()); m_T[0] = m_T[1] = 0; m_F[0] = m_F[1] = 0; } -void Blake2b::compress(bool lastblock) +void Blake2b::compress(const uint8_t* input, size_t blocks, size_t increment) { - uint64_t m[16]; - uint64_t v[16]; - uint64_t* const H = m_H.data(); - const uint8_t* const block = m_buffer.data(); - - if(lastblock) + for(size_t b = 0; b != blocks; ++b) { - m_F[0] = ~0ULL; - } + m_T[0] += increment; + if(m_T[0] < increment) + { + m_T[1]++; + } - for(int i = 0; i < 16; i++) - { - m[i] = load_le<uint64_t>(block, i); - } + uint64_t M[16]; + uint64_t v[16]; + load_le(M, input, 16); - for(int i = 0; i < 8; i++) - { - v[i] = H[i]; - v[i + 8] = blake2b_IV[i]; - } + input += BLAKE2B_BLOCKBYTES; + + for(size_t i = 0; i < 8; i++) + v[i] = m_H[i]; + for(size_t i = 0; i != 8; ++i) + v[i + 8] = blake2b_IV[i]; - v[12] ^= m_T[0]; - v[13] ^= m_T[1]; - v[14] ^= m_F[0]; - v[15] ^= m_F[1]; + v[12] ^= m_T[0]; + v[13] ^= m_T[1]; + v[14] ^= m_F[0]; + v[15] ^= m_F[1]; -#define G(r, i, a, b, c, d) \ +#define G(a, b, c, d, M0, M1) \ do { \ - a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ + a = a + b + M0; \ d = rotr<32>(d ^ a); \ c = c + d; \ b = rotr<24>(b ^ c); \ - a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ + a = a + b + M1; \ d = rotr<16>(d ^ a); \ c = c + d; \ b = rotr<63>(b ^ c); \ } while(0) -#define ROUND(r) \ - do { \ - G(r, 0, v[0], v[4], v[8], v[12]); \ - G(r, 1, v[1], v[5], v[9], v[13]); \ - G(r, 2, v[2], v[6], v[10], v[14]); \ - G(r, 3, v[3], v[7], v[11], v[15]); \ - G(r, 4, v[0], v[5], v[10], v[15]); \ - G(r, 5, v[1], v[6], v[11], v[12]); \ - G(r, 6, v[2], v[7], v[8], v[13]); \ - G(r, 7, v[3], v[4], v[9], v[14]); \ +#define 
ROUND(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, iA, iB, iC, iD, iE, iF) \ + do { \ + G(v[ 0], v[ 4], v[ 8], v[12], M[i0], M[i1]); \ + G(v[ 1], v[ 5], v[ 9], v[13], M[i2], M[i3]); \ + G(v[ 2], v[ 6], v[10], v[14], M[i4], M[i5]); \ + G(v[ 3], v[ 7], v[11], v[15], M[i6], M[i7]); \ + G(v[ 0], v[ 5], v[10], v[15], M[i8], M[i9]); \ + G(v[ 1], v[ 6], v[11], v[12], M[iA], M[iB]); \ + G(v[ 2], v[ 7], v[ 8], v[13], M[iC], M[iD]); \ + G(v[ 3], v[ 4], v[ 9], v[14], M[iE], M[iF]); \ } while(0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); - - for(int i = 0; i < 8; i++) - { - H[i] ^= v[i] ^ v[i + 8]; + ROUND( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + ROUND(14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + ROUND(11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + ROUND( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + ROUND( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + ROUND( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + ROUND(12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + ROUND(13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + ROUND( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + ROUND(10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0); + ROUND( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + ROUND(14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + for(size_t i = 0; i < 8; i++) + { + m_H[i] ^= v[i] ^ v[i + 8]; + } } #undef G #undef ROUND } -void Blake2b::increment_counter(const uint64_t inc) - { - m_T[0] += inc; - if(m_T[0] < inc) - { - m_T[1]++; - } - } - void Blake2b::add_data(const uint8_t input[], size_t length) { - if(!input || length == 0) + if(m_bufpos > 0) { - return; - } - - uint8_t* const buffer = m_buffer.data(); + const size_t take = std::min(BLAKE2B_BLOCKBYTES - m_bufpos, length); + copy_mem(&m_buffer[m_bufpos], input, take); + m_bufpos += take; + length -= take; + 
input += take; - while(length > 0) - { - size_t fill = BLAKE2B_BLOCKBYTES - m_buflen; - - if(length <= fill) + if(m_bufpos == m_buffer.size() && length > 0) { - std::memcpy(buffer + m_buflen, input, length); - m_buflen += length; - return; + compress(m_buffer.data(), 1, BLAKE2B_BLOCKBYTES); + m_bufpos = 0; } + } - std::memcpy(buffer + m_buflen, input, fill); - increment_counter(BLAKE2B_BLOCKBYTES); - compress(); + if(length > BLAKE2B_BLOCKBYTES) + { + const size_t full_blocks = ((length-1) / BLAKE2B_BLOCKBYTES); + compress(input, full_blocks, BLAKE2B_BLOCKBYTES); - m_buflen = 0; - input += fill; - length -= fill; + input += full_blocks * BLAKE2B_BLOCKBYTES; + length -= full_blocks * BLAKE2B_BLOCKBYTES; } + + copy_mem(&m_buffer[m_bufpos], input, length); + m_bufpos += length; } void Blake2b::final_result(uint8_t output[]) { - if(!output) - { - return; - } - - uint8_t* const buffer = m_buffer.data(); - const uint64_t* const H = static_cast<const uint64_t*>(m_H.data()); - uint16_t outlen = static_cast<uint16_t>(output_length()); - - std::memset(buffer + m_buflen, 0, BLAKE2B_BLOCKBYTES - m_buflen); - increment_counter(m_buflen); - compress(true); - - for (uint16_t i = 0; i < outlen; i++) - { - output[i] = (H[i >> 3] >> (8 * (i & 7))) & 0xFF; - } - + clear_mem(&m_buffer[m_bufpos], BLAKE2B_BLOCKBYTES - m_bufpos); + m_F[0] = 0xFFFFFFFFFFFFFFFF; + compress(m_buffer.data(), 1, m_bufpos); + copy_out_vec_le(output, output_length(), m_H); clear(); } @@ -217,7 +184,7 @@ void Blake2b::clear() { zeroise(m_H); zeroise(m_buffer); - m_buflen = 0; + m_bufpos = 0; state_init(); } diff --git a/src/lib/hash/blake2/blake2b.h b/src/lib/hash/blake2/blake2b.h index fa67ab2cf..f4a68ad55 100644 --- a/src/lib/hash/blake2/blake2b.h +++ b/src/lib/hash/blake2/blake2b.h @@ -14,12 +14,6 @@ namespace Botan { -enum blake2b_constant { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_IVU64COUNT = 8 -}; - /** * BLAKE2B */ @@ -31,7 +25,7 @@ class BOTAN_PUBLIC_API(2,0) Blake2b final : public 
HashFunction */ explicit Blake2b(size_t output_bits = 512); - size_t hash_block_size() const override { return BLAKE2B_BLOCKBYTES; } + size_t hash_block_size() const override { return 128; } size_t output_length() const override { return m_output_bits / 8; } HashFunction* clone() const override; @@ -44,14 +38,13 @@ class BOTAN_PUBLIC_API(2,0) Blake2b final : public HashFunction void add_data(const uint8_t input[], size_t length) override; void final_result(uint8_t out[]) override; - inline void state_init(); - inline void increment_counter(const uint64_t inc); - void compress(bool lastblock = false); + void state_init(); + void compress(const uint8_t* data, size_t blocks, uint64_t increment); - size_t m_output_bits; + const size_t m_output_bits; secure_vector<uint8_t> m_buffer; - size_t m_buflen; + size_t m_bufpos; secure_vector<uint64_t> m_H; uint64_t m_T[2]; diff --git a/src/lib/hash/mdx_hash/mdx_hash.cpp b/src/lib/hash/mdx_hash/mdx_hash.cpp index c2fb320ec..8c668874a 100644 --- a/src/lib/hash/mdx_hash/mdx_hash.cpp +++ b/src/lib/hash/mdx_hash/mdx_hash.cpp @@ -19,11 +19,12 @@ MDx_HashFunction::MDx_HashFunction(size_t block_len, bool bit_end, size_t cnt_size) : m_buffer(block_len), + m_count(0), + m_position(0), BIG_BYTE_ENDIAN(byte_end), BIG_BIT_ENDIAN(bit_end), COUNT_SIZE(cnt_size) { - m_count = m_position = 0; } /* |