author     Jack Lloyd <[email protected]>    2017-10-13 12:08:30 -0400
committer  Jack Lloyd <[email protected]>    2017-10-13 12:16:39 -0400
commit     577828a93755549f0e9d8413488e3e4485c67263 (patch)
tree       dbb1d6284914e0aa89212bfd33016e1a1a2c45c5
parent     742420b4b631d6d9139fe5f63ca5650f4fb56b9d (diff)
Optimize GCM
Allow clmul to process multiple blocks per call: a slight speedup there,
though still far behind optimum.

Precompute a table of multiples of H: 3-4x faster on systems without clmul,
and still no secret-dependent table indexes.

Refactor GMAC to not derive from GHASH.
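For background (this is the standard GHASH definition per NIST SP 800-38D, not
something new in this commit): GHASH folds each 16-byte block B_i into a
128-bit accumulator X over GF(2^128), where H = E_K(0^128) is fixed per key.
Multiplication by H is GF(2)-linear, so the product can be assembled from
precomputed multiples H*x^k selected by bitmask, with no table index depending
on secret data; that identity is exactly what the new m_HM table exploits:

    X_i = (X_{i-1} \oplus B_i) \cdot H,
    \qquad
    X \cdot H = \bigoplus_{k=0}^{127} x_k \, (H \cdot x^k)
    \quad \text{in } \mathrm{GF}(2)[x] / (x^{128} + x^7 + x^2 + x + 1)

where x_0, ..., x_127 are the bits of X.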
 src/cli/speed.cpp                      |   1
 src/lib/mac/gmac/gmac.cpp              |  46
 src/lib/mac/gmac/gmac.h                |   4
 src/lib/modes/aead/gcm/clmul/clmul.cpp | 113
 src/lib/modes/aead/gcm/clmul/clmul.h   |   3
 src/lib/modes/aead/gcm/gcm.cpp         | 111
 src/lib/modes/aead/gcm/gcm.h           |  27
 src/lib/modes/aead/gcm/pmull/pmull.cpp | 109
 src/lib/modes/aead/gcm/pmull/pmull.h   |   3
 src/tests/test_mac.cpp                 |   2
10 files changed, 245 insertions, 174 deletions
diff --git a/src/cli/speed.cpp b/src/cli/speed.cpp
index 83bb29efa..01a84d7c9 100644
--- a/src/cli/speed.cpp
+++ b/src/cli/speed.cpp
@@ -1024,6 +1024,7 @@ class Speed final : public Command
 
       const Botan::SymmetricKey key(rng(), mac.maximum_keylength());
       mac.set_key(key);
+      mac.start(nullptr, 0);
 
       Timer timer(mac.name(), provider, "mac", buffer.size(), buf_size);
       timer.run_until_elapsed(runtime, [&]() { mac.update(buffer); });
diff --git a/src/lib/mac/gmac/gmac.cpp b/src/lib/mac/gmac/gmac.cpp
index 7ce546ad5..af5d245ba 100644
--- a/src/lib/mac/gmac/gmac.cpp
+++ b/src/lib/mac/gmac/gmac.cpp
@@ -1,6 +1,7 @@
 /*
 * GMAC
 * (C) 2016 Matthias Gierlings, René Korthaus
+* (C) 2017 Jack Lloyd
 *
 * Botan is released under the Simplified BSD License (see license.txt)
 */
@@ -9,20 +10,17 @@
 namespace Botan {
 
-GMAC::GMAC(BlockCipher* cipher)
-   : GHASH(),
-     m_aad_buf(),
-     m_cipher(cipher),
-     m_initialized(false)
+GMAC::GMAC(BlockCipher* cipher) :
+   m_aad_buf(),
+   m_cipher(cipher),
+   m_ghash(new GHASH),
+   m_initialized(false)
    {}
 
 void GMAC::clear()
    {
-   GHASH::clear();
-   m_H.resize(GCM_BS);
-   m_H_ad.resize(GCM_BS);
-   m_ghash.resize(GCM_BS);
    m_cipher->clear();
+   m_ghash->clear();
    m_aad_buf.clear();
    m_initialized = false;
    }
@@ -39,7 +37,10 @@ size_t GMAC::output_length() const
 
 void GMAC::add_data(const uint8_t input[], size_t size)
    {
-   m_ad_len += size;
+   /*
+   FIXME this could be much more efficient, and only buffer leftovers
+   as needed, instead of inserting everything into the buffer
+   */
 
    // buffer partial blocks till we received a full input block
    // or final is called.
@@ -47,9 +48,8 @@ void GMAC::add_data(const uint8_t input[], size_t size)
    if(m_aad_buf.size() >= GCM_BS)
      {
      // process all complete input blocks.
-      ghash_update(m_ghash,
-                   m_aad_buf.data(),
-                   m_aad_buf.size() - (m_aad_buf.size() % GCM_BS));
+      m_ghash->update_associated_data(m_aad_buf.data(),
+                                      m_aad_buf.size() - (m_aad_buf.size() % GCM_BS));
 
      // remove all processed blocks from buffer.
      m_aad_buf.erase(m_aad_buf.begin(),
@@ -61,7 +61,10 @@ void GMAC::key_schedule(const uint8_t key[], size_t size)
    {
    clear();
    m_cipher->set_key(key, size);
-   m_cipher->encrypt(m_H_ad.data(), m_H.data());
+
+   secure_vector<uint8_t> H(GCM_BS);
+   m_cipher->encrypt(H);
+   m_ghash->set_key(H);
    }
 
 void GMAC::start_msg(const uint8_t nonce[], size_t nonce_len)
@@ -75,13 +78,13 @@ void GMAC::start_msg(const uint8_t nonce[], size_t nonce_len)
      }
    else
      {
-      ghash_update(y0, nonce, nonce_len);
-      add_final_block(y0, 0, nonce_len);
+      m_ghash->ghash_update(y0, nonce, nonce_len);
+      m_ghash->add_final_block(y0, 0, nonce_len);
      }
 
    secure_vector<uint8_t> m_enc_y0(GCM_BS);
    m_cipher->encrypt(y0.data(), m_enc_y0.data());
-   GHASH::start(m_enc_y0.data(), m_enc_y0.size());
+   m_ghash->start(m_enc_y0.data(), m_enc_y0.size());
    m_initialized = true;
    }
 
@@ -90,17 +93,16 @@ void GMAC::final_result(uint8_t mac[])
    // This ensures the GMAC computation has been initialized with a fresh
    // nonce. The aim of this check is to prevent developers from re-using
    // nonces (and potential nonce-reuse attacks).
-   BOTAN_ASSERT(m_initialized, "GMAC was used with a fresh nonce");
+   if(m_initialized == false)
+      throw Invalid_State("GMAC was not used with a fresh nonce");
 
    // process the rest of the aad buffer. Even if it is a partial block only
    // ghash_update will process it properly.
    if(m_aad_buf.size() > 0)
      {
-      ghash_update(m_ghash,
-                   m_aad_buf.data(),
-                   m_aad_buf.size());
+      m_ghash->update_associated_data(m_aad_buf.data(), m_aad_buf.size());
      }
 
-   secure_vector<uint8_t> result = GHASH::final();
+   secure_vector<uint8_t> result = m_ghash->final();
    copy_mem(mac, result.data(), result.size());
    clear();
    }
diff --git a/src/lib/mac/gmac/gmac.h b/src/lib/mac/gmac/gmac.h
index 970f9c047..bab170252 100644
--- a/src/lib/mac/gmac/gmac.h
+++ b/src/lib/mac/gmac/gmac.h
@@ -1,6 +1,7 @@
 /*
 * GMAC
 * (C) 2016 Matthias Gierlings, René Korthaus
+* (C) 2017 Jack Lloyd
 *
 * Botan is released under the Simplified BSD License (see license.txt)
 */
@@ -20,7 +21,7 @@ namespace Botan {
 * GMAC requires a unique initialization vector be used for each message.
 * This must be provided via the MessageAuthenticationCode::start() API
 */
-class BOTAN_PUBLIC_API(2,0) GMAC final : public MessageAuthenticationCode, public GHASH
+class BOTAN_PUBLIC_API(2,0) GMAC final : public MessageAuthenticationCode
    {
    public:
      void clear() override;
@@ -52,6 +53,7 @@ class BOTAN_PUBLIC_API(2,0) GMAC final : public MessageAuthenticationCode, publi
      static const size_t GCM_BS = 16;
      secure_vector<uint8_t> m_aad_buf;
      std::unique_ptr<BlockCipher> m_cipher;
+      std::unique_ptr<GHASH> m_ghash;
      bool m_initialized;
   };
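One user-visible effect of the GMAC refactor, also reflected in the speed.cpp
change above and the test_mac.cpp change at the end of this commit: a nonce
must now be supplied via start() before update(), and final() throws
Invalid_State if it was not. A minimal usage sketch against the public Botan 2
MessageAuthenticationCode API (the key and nonce bytes here are placeholders
for illustration only; in real use the nonce must be unique per message):

    #include <botan/mac.h>
    #include <botan/hex.h>
    #include <iostream>
    #include <vector>

    int main()
       {
       auto gmac = Botan::MessageAuthenticationCode::create("GMAC(AES-128)");
       if(!gmac)
          {
          std::cerr << "GMAC not available in this build\n";
          return 1;
          }

       const std::vector<uint8_t> key(16, 0xAB);   // placeholder 128-bit key
       const std::vector<uint8_t> nonce(12, 0x42); // must be unique per message

       gmac->set_key(key);
       gmac->start(nonce); // required; final() now throws Invalid_State without it
       gmac->update("message to authenticate");

       std::cout << Botan::hex_encode(gmac->final()) << "\n";
       return 0;
       }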
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp
index ed3473b4e..33378d833 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.cpp
+++ b/src/lib/modes/aead/gcm/clmul/clmul.cpp
@@ -12,67 +12,76 @@ namespace Botan {
 
 BOTAN_FUNC_ISA("pclmul,ssse3")
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks)
    {
    /*
    * Algorithms 1 and 5 from Intel's CLMUL guide
    */
    const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
+   const __m128i b = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H)), BSWAP_MASK);
+
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
-   __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H));
+   a = _mm_shuffle_epi8(a, BSWAP_MASK);
+
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      __m128i m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input) + i);
+      m = _mm_shuffle_epi8(m, BSWAP_MASK);
+
+      a = _mm_xor_si128(a, m);
+
+      __m128i T0, T1, T2, T3, T4, T5;
+
+      T0 = _mm_clmulepi64_si128(a, b, 0x00);
+      T1 = _mm_clmulepi64_si128(a, b, 0x01);
+      T2 = _mm_clmulepi64_si128(a, b, 0x10);
+      T3 = _mm_clmulepi64_si128(a, b, 0x11);
+
+      T1 = _mm_xor_si128(T1, T2);
+      T2 = _mm_slli_si128(T1, 8);
+      T1 = _mm_srli_si128(T1, 8);
+      T0 = _mm_xor_si128(T0, T2);
+      T3 = _mm_xor_si128(T3, T1);
+
+      T4 = _mm_srli_epi32(T0, 31);
+      T0 = _mm_slli_epi32(T0, 1);
+
+      T5 = _mm_srli_epi32(T3, 31);
+      T3 = _mm_slli_epi32(T3, 1);
+
+      T2 = _mm_srli_si128(T4, 12);
+      T5 = _mm_slli_si128(T5, 4);
+      T4 = _mm_slli_si128(T4, 4);
+      T0 = _mm_or_si128(T0, T4);
+      T3 = _mm_or_si128(T3, T5);
+      T3 = _mm_or_si128(T3, T2);
+
+      T4 = _mm_slli_epi32(T0, 31);
+      T5 = _mm_slli_epi32(T0, 30);
+      T2 = _mm_slli_epi32(T0, 25);
+
+      T4 = _mm_xor_si128(T4, T5);
+      T4 = _mm_xor_si128(T4, T2);
+      T5 = _mm_srli_si128(T4, 4);
+      T3 = _mm_xor_si128(T3, T5);
+      T4 = _mm_slli_si128(T4, 12);
+      T0 = _mm_xor_si128(T0, T4);
+      T3 = _mm_xor_si128(T3, T0);
+
+      T4 = _mm_srli_epi32(T0, 1);
+      T1 = _mm_srli_epi32(T0, 2);
+      T2 = _mm_srli_epi32(T0, 7);
+      T3 = _mm_xor_si128(T3, T1);
+      T3 = _mm_xor_si128(T3, T2);
+      T3 = _mm_xor_si128(T3, T4);
+
+      a = T3;
+      }
 
    a = _mm_shuffle_epi8(a, BSWAP_MASK);
-   b = _mm_shuffle_epi8(b, BSWAP_MASK);
-
-   __m128i T0, T1, T2, T3, T4, T5;
-
-   T0 = _mm_clmulepi64_si128(a, b, 0x00);
-   T1 = _mm_clmulepi64_si128(a, b, 0x01);
-   T2 = _mm_clmulepi64_si128(a, b, 0x10);
-   T3 = _mm_clmulepi64_si128(a, b, 0x11);
-
-   T1 = _mm_xor_si128(T1, T2);
-   T2 = _mm_slli_si128(T1, 8);
-   T1 = _mm_srli_si128(T1, 8);
-   T0 = _mm_xor_si128(T0, T2);
-   T3 = _mm_xor_si128(T3, T1);
-
-   T4 = _mm_srli_epi32(T0, 31);
-   T0 = _mm_slli_epi32(T0, 1);
-
-   T5 = _mm_srli_epi32(T3, 31);
-   T3 = _mm_slli_epi32(T3, 1);
-
-   T2 = _mm_srli_si128(T4, 12);
-   T5 = _mm_slli_si128(T5, 4);
-   T4 = _mm_slli_si128(T4, 4);
-   T0 = _mm_or_si128(T0, T4);
-   T3 = _mm_or_si128(T3, T5);
-   T3 = _mm_or_si128(T3, T2);
-
-   T4 = _mm_slli_epi32(T0, 31);
-   T5 = _mm_slli_epi32(T0, 30);
-   T2 = _mm_slli_epi32(T0, 25);
-
-   T4 = _mm_xor_si128(T4, T5);
-   T4 = _mm_xor_si128(T4, T2);
-   T5 = _mm_srli_si128(T4, 4);
-   T3 = _mm_xor_si128(T3, T5);
-   T4 = _mm_slli_si128(T4, 12);
-   T0 = _mm_xor_si128(T0, T4);
-   T3 = _mm_xor_si128(T3, T0);
-
-   T4 = _mm_srli_epi32(T0, 1);
-   T1 = _mm_srli_epi32(T0, 2);
-   T2 = _mm_srli_epi32(T0, 7);
-   T3 = _mm_xor_si128(T3, T1);
-   T3 = _mm_xor_si128(T3, T2);
-   T3 = _mm_xor_si128(T3, T4);
-
-   T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);
-
-   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T3);
+   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
    }
 
 }
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul/clmul.h
index b47c73f27..d68e021d2 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.h
+++ b/src/lib/modes/aead/gcm/clmul/clmul.h
@@ -12,7 +12,8 @@ namespace Botan {
 
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks);
 
 }
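This interface change is the "multiple blocks" part of the commit message: the
caller now hands gcm_multiply_clmul a whole run of blocks, so the accumulator
stays in an XMM register across iterations instead of being stored to x and
reloaded for every 16 bytes of input. Each block still performs a full
carry-less multiply and reduction, which is presumably why the message calls
the result "still far behind optimum": the standard further optimization is to
aggregate several blocks against precomputed powers of H and amortize the
reduction, which this commit does not attempt.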
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index bb832245e..9079aa039 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -24,52 +24,52 @@ namespace Botan {
 
 static const size_t GCM_BS = 16;
 
-void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
+void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
+                         const uint8_t input[],
+                         size_t blocks)
    {
+   if(blocks == 0)
+      return;
+
 #if defined(BOTAN_HAS_GCM_CLMUL)
    if(CPUID::has_clmul())
-      return gcm_multiply_clmul(x.data(), m_H.data());
+      return gcm_multiply_clmul(x.data(), m_H.data(), input, blocks);
 #elif defined(BOTAN_HAS_GCM_PMULL)
    if(CPUID::has_arm_pmull())
-      return gcm_multiply_pmull(x.data(), m_H.data());
+      return gcm_multiply_pmull(x.data(), m_H.data(), input, blocks);
 #endif
 
-   static const uint64_t R = 0xE100000000000000;
-
-   uint64_t H[2] = {
-      load_be<uint64_t>(m_H.data(), 0),
-      load_be<uint64_t>(m_H.data(), 1)
-   };
-
-   uint64_t Z[2] = { 0, 0 };
-
-   CT::poison(H, 2);
-   CT::poison(Z, 2);
    CT::poison(x.data(), x.size());
 
    // SSE2 might be useful here
 
-   for(size_t i = 0; i != 2; ++i)
-      {
-      const uint64_t X = load_be<uint64_t>(x.data(), i);
+   const uint64_t ones = 0xFFFFFFFFFFFFFFFF;
 
-      uint64_t mask = 0x8000000000000000;
-      for(size_t j = 0; j != 64; ++j)
-         {
-         const uint64_t XMASK = CT::expand_mask<uint64_t>(X & mask);
-         mask >>= 1;
-         Z[0] ^= H[0] & XMASK;
-         Z[1] ^= H[1] & XMASK;
+   uint64_t X0 = load_be<uint64_t>(x.data(), 0);
+   uint64_t X1 = load_be<uint64_t>(x.data(), 1);
 
-         // GCM's bit ops are reversed so we carry out of the bottom
-         const uint64_t carry = R & CT::expand_mask<uint64_t>(H[1] & 1);
+   for(size_t b = 0; b != blocks; ++b)
+      {
+      X0 ^= load_be<uint64_t>(input, 2*b);
+      X1 ^= load_be<uint64_t>(input, 2*b+1);
+
+      uint64_t Z[2] = { 0, 0 };
 
-         H[1] = (H[1] >> 1) | (H[0] << 63);
-         H[0] = (H[0] >> 1) ^ carry;
+      for(size_t i = 0; i != 64; ++i)
+         {
+         const uint64_t X0MASK = ones * ((X0 >> (63-i)) & 1);
+         const uint64_t X1MASK = ones * ((X1 >> (63-i)) & 1);
+         Z[0] ^= m_HM[4*i  ] & X0MASK;
+         Z[1] ^= m_HM[4*i+1] & X0MASK;
+         Z[0] ^= m_HM[4*i+2] & X1MASK;
+         Z[1] ^= m_HM[4*i+3] & X1MASK;
         }
+
+      X0 = Z[0];
+      X1 = Z[1];
      }
 
-   store_be<uint64_t>(x.data(), Z[0], Z[1]);
+   store_be<uint64_t>(x.data(), X0, X1);
 
    CT::unpoison(x.data(), x.size());
    }
 
@@ -80,16 +80,20 @@ void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
    This assumes if less than block size input then we're just on the
    final block and should pad with zeros
    */
-   while(length)
-      {
-      const size_t to_proc = std::min(length, GCM_BS);
-      xor_buf(ghash.data(), input, to_proc);
+   const size_t full_blocks = length / GCM_BS;
+   const size_t final_bytes = length - (full_blocks * GCM_BS);
 
-      gcm_multiply(ghash);
+   if(full_blocks > 0)
+      {
+      gcm_multiply(ghash, input, full_blocks);
+      }
 
-      input += to_proc;
-      length -= to_proc;
+   if(final_bytes)
+      {
+      secure_vector<uint8_t> last_block(GCM_BS);
+      copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes);
+      gcm_multiply(ghash, last_block.data(), 1);
      }
    }
 
@@ -99,6 +103,32 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
    m_H_ad.resize(GCM_BS);
    m_ad_len = 0;
    m_text_len = 0;
+
+   uint64_t H0 = load_be<uint64_t>(m_H.data(), 0);
+   uint64_t H1 = load_be<uint64_t>(m_H.data(), 1);
+
+   const uint64_t R = 0xE100000000000000;
+
+   m_HM.resize(256);
+
+   // precompute the multiples of H
+   for(size_t i = 0; i != 2; ++i)
+      {
+      for(size_t j = 0; j != 64; ++j)
+         {
+         /*
+         we interleave H^1, H^65, H^2, H^66, ...
+         to make indexing nicer in the multiplication code
+         */
+         m_HM[4*j+2*i] = H0;
+         m_HM[4*j+2*i+1] = H1;
+
+         // GCM's bit ops are reversed so we carry out of the bottom
+         const uint64_t carry = R * (H1 & 1);
+         H1 = (H1 >> 1) | (H0 << 63);
+         H0 = (H0 >> 1) ^ carry;
+         }
+      }
    }
 
 void GHASH::start(const uint8_t nonce[], size_t len)
@@ -115,12 +145,17 @@ void GHASH::set_associated_data(const uint8_t input[], size_t length)
    m_ad_len = length;
    }
 
-void GHASH::update(const uint8_t input[], size_t length)
+void GHASH::update_associated_data(const uint8_t ad[], size_t length)
    {
    BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
+   m_ad_len += length;
+   ghash_update(m_ghash, ad, length);
+   }
 
+void GHASH::update(const uint8_t input[], size_t length)
+   {
+   BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
    m_text_len += length;
-
    ghash_update(m_ghash, input, length);
    }
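To make the new scalar path concrete, here is a self-contained sketch of the
same table construction and masked multiply that key_schedule and gcm_multiply
implement above. This is not Botan code: the function names and the sample H
value are invented for illustration.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // 256 words = 128 multiples H*x^k (two 64-bit words each), interleaved
    // H*x^0, H*x^64, H*x^1, H*x^65, ... to match the m_HM layout in the diff.
    static uint64_t HM[256];

    void precompute(uint64_t H0, uint64_t H1)
       {
       // R encodes the x^7 + x^2 + x + 1 terms of the reduction polynomial;
       // GCM's bit order is reversed, so multiplying by x is a right shift
       // with the carry folded back in at the top.
       const uint64_t R = 0xE100000000000000;

       for(size_t i = 0; i != 2; ++i)
          {
          for(size_t j = 0; j != 64; ++j)
             {
             HM[4*j + 2*i]     = H0;
             HM[4*j + 2*i + 1] = H1;

             const uint64_t carry = R * (H1 & 1); // 0 or R, branch-free
             H1 = (H1 >> 1) | (H0 << 63);
             H0 = (H0 >> 1) ^ carry;
             }
          }
       }

    // X <- X * H. Every table entry is read and selected by an all-zeros or
    // all-ones mask, so neither addresses nor branches depend on secrets.
    void gf128_mul_via_table(uint64_t& X0, uint64_t& X1)
       {
       const uint64_t ones = 0xFFFFFFFFFFFFFFFF;
       uint64_t Z0 = 0, Z1 = 0;

       for(size_t i = 0; i != 64; ++i)
          {
          const uint64_t X0MASK = ones * ((X0 >> (63 - i)) & 1);
          const uint64_t X1MASK = ones * ((X1 >> (63 - i)) & 1);
          Z0 ^= HM[4*i]     & X0MASK;
          Z1 ^= HM[4*i + 1] & X0MASK;
          Z0 ^= HM[4*i + 2] & X1MASK;
          Z1 ^= HM[4*i + 3] & X1MASK;
          }

       X0 = Z0;
       X1 = Z1;
       }

    int main()
       {
       precompute(0x0123456789ABCDEFULL, 0x0FEDCBA987654321ULL); // sample H

       uint64_t X0 = 0x8000000000000000ULL, X1 = 0; // X = 1 in GCM bit order
       gf128_mul_via_table(X0, X1);                 // result should equal H

       std::printf("%016llx%016llx\n",
                   (unsigned long long)X0, (unsigned long long)X1);
       return 0;
       }

Trading a 2 KiB per-key table for the old bit-at-a-time shifting of H is where
the commit message's 3-4x speedup on systems without clmul comes from.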
diff --git a/src/lib/modes/aead/gcm/gcm.h b/src/lib/modes/aead/gcm/gcm.h
index deedfdade..eac2add93 100644
--- a/src/lib/modes/aead/gcm/gcm.h
+++ b/src/lib/modes/aead/gcm/gcm.h
@@ -111,9 +111,10 @@ class BOTAN_PUBLIC_API(2,0) GCM_Decryption final : public GCM_Mode
 
 /**
 * GCM's GHASH
-* Maybe a Transform?
+* This is not intended for general use, but is exposed to allow
+* shared code between GCM and GMAC
 */
-class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
+class BOTAN_PUBLIC_API(2,0) GHASH final : public SymmetricAlgorithm
    {
    public:
      void set_associated_data(const uint8_t ad[], size_t ad_len);
@@ -127,6 +128,11 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
      */
      void update(const uint8_t in[], size_t len);
 
+      /*
+      * Incremental update of associated data
+      */
+      void update_associated_data(const uint8_t ad[], size_t len);
+
      secure_vector<uint8_t> final();
 
      Key_Length_Specification key_spec() const override
@@ -137,24 +143,25 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
      void reset();
 
      std::string name() const override { return "GHASH"; }
-   protected:
+
      void ghash_update(secure_vector<uint8_t>& x,
                        const uint8_t input[], size_t input_len);
 
      void add_final_block(secure_vector<uint8_t>& x,
                           size_t ad_len, size_t pt_len);
-
-      secure_vector<uint8_t> m_H;
-      secure_vector<uint8_t> m_H_ad;
-      secure_vector<uint8_t> m_ghash;
-      size_t m_ad_len = 0;
-
    private:
      void key_schedule(const uint8_t key[], size_t key_len) override;
 
-      void gcm_multiply(secure_vector<uint8_t>& x) const;
+      void gcm_multiply(secure_vector<uint8_t>& x,
+                        const uint8_t input[],
+                        size_t blocks);
 
+      secure_vector<uint8_t> m_H;
+      secure_vector<uint8_t> m_H_ad;
+      secure_vector<uint8_t> m_ghash;
      secure_vector<uint8_t> m_nonce;
+      secure_vector<uint64_t> m_HM;
 
+      size_t m_ad_len = 0;
      size_t m_text_len = 0;
   };
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
index 54e841650..12d6ff7d1 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ b/src/lib/modes/aead/gcm/pmull/pmull.cpp
@@ -10,62 +10,73 @@ namespace Botan {
 
 BOTAN_FUNC_ISA("+crypto")
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks)
    {
    /*
    * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
    */
 
-   const uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
+   uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
    const uint64x2_t b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(H+8)), vrev64_u8(vld1_u8(H))));
 
-   uint64x2_t T0, T1, T2, T3, T4, T5;
-
-   T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
-   T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
-   T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
-   T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
-
-   T1 = veorq_u64(T1, T2);
-   T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
-   T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
-   T0 = veorq_u64(T0, T2);
-   T3 = veorq_u64(T3, T1);
-
-   T4 = vshrq_n_u64(T0, 31);
-   T0 = vshlq_n_u64(T0, 1);
-
-   T5 = vshrq_n_u64(T3, 31);
-   T3 = vshlq_n_u64(T3, 1);
-
-   T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
-   T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
-   T0 = vorrq_u64(T0, T4);
-   T3 = vorrq_u64(T3, T5);
-   T3 = vorrq_u64(T3, T2);
-
-   T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
-   T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
-   T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
-
-   T4 = veorq_u64(T4, T5);
-   T4 = veorq_u64(T4, T2);
-   T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
-   T3 = veorq_u64(T3, T5);
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
-   T0 = veorq_u64(T0, T4);
-   T3 = veorq_u64(T3, T0);
-
-   T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
-   T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
-   T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
-   T3 = veorq_u64(T3, T1);
-   T3 = veorq_u64(T3, T2);
-   T3 = veorq_u64(T3, T4);
-
-   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(T3))));
-   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(T3))));
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      const uint64x2_t m64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(input+8)), vrev64_u8(vld1_u8(input))));
+      input += 16;
+
+      a64 = veorq_u64(a64, m64);
+
+      uint64x2_t T0, T1, T2, T3, T4, T5;
+
+      T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
+      T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
+      T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
+      T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
+
+      T1 = veorq_u64(T1, T2);
+      T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
+      T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
+      T0 = veorq_u64(T0, T2);
+      T3 = veorq_u64(T3, T1);
+
+      T4 = vshrq_n_u64(T0, 31);
+      T0 = vshlq_n_u64(T0, 1);
+
+      T5 = vshrq_n_u64(T3, 31);
+      T3 = vshlq_n_u64(T3, 1);
+
+      T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
+      T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
+      T0 = vorrq_u64(T0, T4);
+      T3 = vorrq_u64(T3, T5);
+      T3 = vorrq_u64(T3, T2);
+
+      T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
+      T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
+      T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
+
+      T4 = veorq_u64(T4, T5);
+      T4 = veorq_u64(T4, T2);
+      T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
+      T3 = veorq_u64(T3, T5);
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
+      T0 = veorq_u64(T0, T4);
+      T3 = veorq_u64(T3, T0);
+
+      T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
+      T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
+      T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
+      T3 = veorq_u64(T3, T1);
+      T3 = veorq_u64(T3, T2);
+      T3 = veorq_u64(T3, T4);
+
+      a64 = T3;
+      }
+
+   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(a64))));
+   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(a64))));
    }
 
 }
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
index 4ddcc8f27..638b845cd 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ b/src/lib/modes/aead/gcm/pmull/pmull.h
@@ -12,7 +12,8 @@ namespace Botan {
 
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks);
 
 }
diff --git a/src/tests/test_mac.cpp b/src/tests/test_mac.cpp
index 471a15fed..2792aeb3e 100644
--- a/src/tests/test_mac.cpp
+++ b/src/tests/test_mac.cpp
@@ -68,6 +68,7 @@ class Message_Auth_Tests final : public Text_Based_Test
 
          // Test to make sure clear() resets what we need it to
         mac->set_key(key);
+         mac->start(iv);
         mac->update("some discarded input");
         mac->clear();
 
@@ -81,6 +82,7 @@ class Message_Auth_Tests final : public Text_Based_Test
         result.confirm("Clone has different pointer", mac.get() != clone.get());
         result.test_eq("Clone has same name", mac->name(), clone->name());
         clone->set_key(key);
+         clone->start(iv);
         clone->update(Test::rng().random_vec(32));
 
         result.test_eq(provider + " correct mac",
                        mac->verify_mac(expected.data(), expected.size()), true);