author    | Jack Lloyd <[email protected]> | 2017-10-13 12:08:30 -0400
committer | Jack Lloyd <[email protected]> | 2017-10-13 12:16:39 -0400
commit    | 577828a93755549f0e9d8413488e3e4485c67263 (patch)
tree      | dbb1d6284914e0aa89212bfd33016e1a1a2c45c5 /src/lib/modes/aead/gcm/gcm.cpp
parent    | 742420b4b631d6d9139fe5f63ca5650f4fb56b9d (diff)
Optimize GCM
Allow passing multiple blocks to the clmul code: a slight speedup there, though
still far behind optimum.

Precompute a table of multiples of H; this is 3-4x faster on systems without
clmul (and still uses no secret-dependent indexes).

Refactor GMAC so it no longer derives from GHASH.
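
To make the table-driven multiply in this patch easier to follow, here is a minimal
standalone sketch of the same technique (written for this page, not taken from the
Botan sources; GF128, make_table and gf128_mul are invented names): precompute
H * x^i for i = 0..127 once per key, then form X * H by XORing each table entry in
under an all-zeros/all-ones mask derived from the corresponding bit of X, so no
memory index or branch ever depends on secret data.

// gf128_table_sketch.cpp - illustration only, brace style mirrors the patch
#include <cstddef>
#include <cstdint>
#include <cstdio>

// A GF(2^128) element in GCM's bit-reflected convention: hi holds input
// bytes 0..7 (big-endian), lo holds bytes 8..15; bit 0 of the field
// element is the most significant bit of byte 0.
struct GF128 { uint64_t hi, lo; };

// Build tab[i] = H * x^i for i = 0..127 (done once per key, analogous
// to what key_schedule() stores in m_HM).
static void make_table(GF128 H, GF128 tab[128])
   {
   for(std::size_t i = 0; i != 128; ++i)
      {
      tab[i] = H;
      // Multiply H by x: shift right one bit; if a bit falls off the
      // bottom, reduce by x^128 = x^7 + x^2 + x + 1 (0xE1 in byte 0).
      const uint64_t carry = 0xE100000000000000ULL * (H.lo & 1);
      H.lo = (H.lo >> 1) | (H.hi << 63);
      H.hi = (H.hi >> 1) ^ carry;
      }
   }

// Constant-time X * H: every table entry is read and either XORed in
// (mask = all ones) or discarded (mask = 0), never indexed by secrets.
static GF128 gf128_mul(const GF128& X, const GF128 tab[128])
   {
   GF128 Z = { 0, 0 };
   for(std::size_t i = 0; i != 64; ++i)
      {
      const uint64_t hi_mask = 0 - ((X.hi >> (63 - i)) & 1);
      const uint64_t lo_mask = 0 - ((X.lo >> (63 - i)) & 1);
      Z.hi ^= tab[i].hi & hi_mask;
      Z.lo ^= tab[i].lo & hi_mask;
      Z.hi ^= tab[64 + i].hi & lo_mask;
      Z.lo ^= tab[64 + i].lo & lo_mask;
      }
   return Z;
   }

int main()
   {
   const GF128 H = { 0x0123456789abcdefULL, 0x0f1e2d3c4b5a6978ULL };
   // X is the field element 1 (only bit 0 set), so X * H must print H back:
   // 0123456789abcdef0f1e2d3c4b5a6978
   const GF128 X = { 0x8000000000000000ULL, 0 };

   GF128 tab[128];
   make_table(H, tab);
   const GF128 Z = gf128_mul(X, tab);
   std::printf("%016llx%016llx\n",
               static_cast<unsigned long long>(Z.hi),
               static_cast<unsigned long long>(Z.lo));
   return 0;
   }

In the patch itself the 128 table entries live in m_HM as 256 interleaved uint64_t
words (H^1 next to H^65, H^2 next to H^66, and so on, as the comment in
key_schedule() puts it), which is what the 4*i indexing in gcm_multiply() relies on.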
Diffstat (limited to 'src/lib/modes/aead/gcm/gcm.cpp')
-rw-r--r-- | src/lib/modes/aead/gcm/gcm.cpp | 111
1 file changed, 73 insertions, 38 deletions
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index bb832245e..9079aa039 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -24,52 +24,52 @@ namespace Botan {
 
 static const size_t GCM_BS = 16;
 
-void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
+void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
+                         const uint8_t input[],
+                         size_t blocks)
    {
+   if(blocks == 0)
+      return;
+
 #if defined(BOTAN_HAS_GCM_CLMUL)
    if(CPUID::has_clmul())
-      return gcm_multiply_clmul(x.data(), m_H.data());
+      return gcm_multiply_clmul(x.data(), m_H.data(), input, blocks);
 #elif defined(BOTAN_HAS_GCM_PMULL)
    if(CPUID::has_arm_pmull())
-      return gcm_multiply_pmull(x.data(), m_H.data());
+      return gcm_multiply_pmull(x.data(), m_H.data(), input, blocks);
 #endif
 
-   static const uint64_t R = 0xE100000000000000;
-
-   uint64_t H[2] = {
-      load_be<uint64_t>(m_H.data(), 0),
-      load_be<uint64_t>(m_H.data(), 1)
-   };
-
-   uint64_t Z[2] = { 0, 0 };
-
-   CT::poison(H, 2);
-   CT::poison(Z, 2);
    CT::poison(x.data(), x.size());
 
    // SSE2 might be useful here
 
-   for(size_t i = 0; i != 2; ++i)
-      {
-      const uint64_t X = load_be<uint64_t>(x.data(), i);
+   const uint64_t ones = 0xFFFFFFFFFFFFFFFF;
 
-      uint64_t mask = 0x8000000000000000;
-      for(size_t j = 0; j != 64; ++j)
-         {
-         const uint64_t XMASK = CT::expand_mask<uint64_t>(X & mask);
-         mask >>= 1;
-         Z[0] ^= H[0] & XMASK;
-         Z[1] ^= H[1] & XMASK;
+   uint64_t X0 = load_be<uint64_t>(x.data(), 0);
+   uint64_t X1 = load_be<uint64_t>(x.data(), 1);
 
-         // GCM's bit ops are reversed so we carry out of the bottom
-         const uint64_t carry = R & CT::expand_mask<uint64_t>(H[1] & 1);
+   for(size_t b = 0; b != blocks; ++b)
+      {
+      X0 ^= load_be<uint64_t>(input, 2*b);
+      X1 ^= load_be<uint64_t>(input, 2*b+1);
+
+      uint64_t Z[2] = { 0, 0 };
 
-         H[1] = (H[1] >> 1) | (H[0] << 63);
-         H[0] = (H[0] >> 1) ^ carry;
+      for(size_t i = 0; i != 64; ++i)
+         {
+         const uint64_t X0MASK = ones * ((X0 >> (63-i)) & 1);
+         const uint64_t X1MASK = ones * ((X1 >> (63-i)) & 1);
+         Z[0] ^= m_HM[4*i  ] & X0MASK;
+         Z[1] ^= m_HM[4*i+1] & X0MASK;
+         Z[0] ^= m_HM[4*i+2] & X1MASK;
+         Z[1] ^= m_HM[4*i+3] & X1MASK;
         }
+
+      X0 = Z[0];
+      X1 = Z[1];
      }
 
-   store_be<uint64_t>(x.data(), Z[0], Z[1]);
+   store_be<uint64_t>(x.data(), X0, X1);
    CT::unpoison(x.data(), x.size());
    }
 
@@ -80,16 +80,20 @@ void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
    This assumes if less than block size input then we're just on the
    final block and should pad with zeros
    */
 
-   while(length)
-      {
-      const size_t to_proc = std::min(length, GCM_BS);
-      xor_buf(ghash.data(), input, to_proc);
+   const size_t full_blocks = length / GCM_BS;
+   const size_t final_bytes = length - (full_blocks * GCM_BS);
 
-      gcm_multiply(ghash);
+   if(full_blocks > 0)
+      {
+      gcm_multiply(ghash, input, full_blocks);
+      }
 
-      input += to_proc;
-      length -= to_proc;
+   if(final_bytes)
+      {
+      secure_vector<uint8_t> last_block(GCM_BS);
+      copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes);
+      gcm_multiply(ghash, last_block.data(), 1);
      }
    }
 
@@ -99,6 +103,32 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
    m_H_ad.resize(GCM_BS);
    m_ad_len = 0;
    m_text_len = 0;
+
+   uint64_t H0 = load_be<uint64_t>(m_H.data(), 0);
+   uint64_t H1 = load_be<uint64_t>(m_H.data(), 1);
+
+   const uint64_t R = 0xE100000000000000;
+
+   m_HM.resize(256);
+
+   // precompute the multiples of H
+   for(size_t i = 0; i != 2; ++i)
+      {
+      for(size_t j = 0; j != 64; ++j)
+         {
+         /*
+         we interleave H^1, H^65, H^2, H^66, ...
+         to make indexing nicer in the multiplication code
+         */
+         m_HM[4*j+2*i] = H0;
+         m_HM[4*j+2*i+1] = H1;
+
+         // GCM's bit ops are reversed so we carry out of the bottom
+         const uint64_t carry = R * (H1 & 1);
+         H1 = (H1 >> 1) | (H0 << 63);
+         H0 = (H0 >> 1) ^ carry;
+         }
+      }
    }
 
 void GHASH::start(const uint8_t nonce[], size_t len)
@@ -115,12 +145,17 @@ void GHASH::set_associated_data(const uint8_t input[], size_t length)
    m_ad_len = length;
    }
 
-void GHASH::update(const uint8_t input[], size_t length)
+void GHASH::update_associated_data(const uint8_t ad[], size_t length)
    {
    BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
+   m_ad_len += length;
+   ghash_update(m_ghash, ad, length);
+   }
 
+void GHASH::update(const uint8_t input[], size_t length)
+   {
+   BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
    m_text_len += length;
-
    ghash_update(m_ghash, input, length);
    }
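
As an aside for readers not steeped in GHASH internals: GCM multiplies in GF(2^128)
modulo x^128 + x^7 + x^2 + x + 1, with bit 0 of a block taken as the most significant
bit of its first byte. Writing bit i of X as x_i, the product the new code computes is

    X * H = sum_{i=0}^{127} x_i * (H * x^i)    (mod x^128 + x^7 + x^2 + x + 1)

so a per-key table of the 128 values H * x^i (stored as 256 uint64_t words in m_HM)
reduces the whole multiplication to masked XORs. The constant 0xE100000000000000 is
x^7 + x^2 + x + 1 in that bit-reflected encoding (byte 0xE1 followed by zeros), which
is why multiplying by x is a right shift that conditionally XORs the constant into the
top word.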