diff options
author | Jack Lloyd <[email protected]> | 2017-10-19 10:21:47 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-10-19 10:21:47 -0400 |
commit | e990033be3f4ea3e226b1843e41dff6cd730543c (patch) | |
tree | 81bd55a5e8fe9747fda86c3f0c12b0a8403484ea /src/lib/modes/aead | |
parent | b8467e3f54182396107d36437c77bb40b70cd598 (diff) | |
parent | 8805f1535fa75523903995f05348ffcc7a7d2e86 (diff) |
Merge GH #1262 GCM and CTR optimizations
Diffstat (limited to 'src/lib/modes/aead')
-rw-r--r-- | src/lib/modes/aead/gcm/clmul/clmul.cpp | 203 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/clmul/clmul.h | 5 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/gcm.cpp | 216 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/gcm.h | 58 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/ghash.cpp | 250 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/ghash.h | 78 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/pmull/pmull.cpp | 238 | ||||
-rw-r--r-- | src/lib/modes/aead/gcm/pmull/pmull.h | 5 |
8 files changed, 670 insertions, 383 deletions
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp index 33378d833..632de6d33 100644 --- a/src/lib/modes/aead/gcm/clmul/clmul.cpp +++ b/src/lib/modes/aead/gcm/clmul/clmul.cpp @@ -1,6 +1,6 @@ /* * CLMUL hook -* (C) 2013 Jack Lloyd +* (C) 2013,2017 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ @@ -11,73 +11,168 @@ namespace Botan { -BOTAN_FUNC_ISA("pclmul,ssse3") -void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16], - const uint8_t input[], size_t blocks) +namespace { + +BOTAN_FUNC_ISA("sse2") +inline __m128i gcm_reduce(const __m128i& B0, const __m128i& B1) + { + __m128i T0, T1, T2, T3; + + T0 = _mm_srli_epi32(B1, 31); + T1 = _mm_slli_epi32(B1, 1); + T2 = _mm_srli_epi32(B0, 31); + T3 = _mm_slli_epi32(B0, 1); + + T3 = _mm_or_si128(T3, _mm_srli_si128(T0, 12)); + T3 = _mm_or_si128(T3, _mm_slli_si128(T2, 4)); + T1 = _mm_or_si128(T1, _mm_slli_si128(T0, 4)); + + T0 = _mm_xor_si128(_mm_slli_epi32(T1, 31), _mm_slli_epi32(T1, 30)); + T0 = _mm_xor_si128(T0, _mm_slli_epi32(T1, 25)); + + T1 = _mm_xor_si128(T1, _mm_slli_si128(T0, 12)); + + T0 = _mm_xor_si128(T3, _mm_srli_si128(T0, 4)); + T0 = _mm_xor_si128(T0, T1); + T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 7)); + T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 1)); + T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 2)); + return T0; + } + +BOTAN_FUNC_ISA("pclmul,sse2") +inline __m128i gcm_multiply(const __m128i& H, const __m128i& x) + { + __m128i T0, T1, T2, T3, T4; + + T0 = _mm_clmulepi64_si128(x, H, 0x11); + T1 = _mm_clmulepi64_si128(x, H, 0x10); + T2 = _mm_clmulepi64_si128(x, H, 0x01); + T3 = _mm_clmulepi64_si128(x, H, 0x00); + + T1 = _mm_xor_si128(T1, T2); + T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8)); + T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8)); + + return gcm_reduce(T0, T3); + } + +BOTAN_FUNC_ISA("pclmul,sse2") +inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2, const __m128i& H3, const __m128i& H4, + const __m128i& X1, 
const __m128i& X2, const __m128i& X3, const __m128i& X4) + { + /* + * Multiply with delayed reduction, algorithm by Krzysztof Jankowski + * and Pierre Laurent of Intel + */ + + const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00); + const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00); + const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00); + const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00); + + const __m128i lo = _mm_xor_si128( + _mm_xor_si128(H1_X1_lo, H2_X2_lo), + _mm_xor_si128(H3_X3_lo, H4_X4_lo)); + + const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11); + const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11); + const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11); + const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11); + + const __m128i hi = _mm_xor_si128( + _mm_xor_si128(H1_X1_hi, H2_X2_hi), + _mm_xor_si128(H3_X3_hi, H4_X4_hi)); + + __m128i T0 = _mm_xor_si128(lo, hi); + __m128i T1, T2, T3, T4; + + T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1); + T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1); + T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2); + T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2); + T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00)); + T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00)); + + T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3); + T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3); + T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4); + T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4); + T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00)); + T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00)); + + T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi); + T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo); + + return gcm_reduce(T1, T2); + } + +} + +BOTAN_FUNC_ISA("ssse3") +void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2]) + { + const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + const __m128i H = 
_mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H_bytes)), BSWAP_MASK); + const __m128i H2 = gcm_multiply(H, H); + const __m128i H3 = gcm_multiply(H, H2); + const __m128i H4 = gcm_multiply(H, H3); + + __m128i* H_pow_mm = reinterpret_cast<__m128i*>(H_pow); + + _mm_storeu_si128(H_pow_mm+0, H); + _mm_storeu_si128(H_pow_mm+1, H2); + _mm_storeu_si128(H_pow_mm+2, H3); + _mm_storeu_si128(H_pow_mm+3, H4); + } + +BOTAN_FUNC_ISA("ssse3") +void gcm_multiply_clmul(uint8_t x[16], + const uint64_t H_pow[8], + const uint8_t input_bytes[], size_t blocks) { /* * Algorithms 1 and 5 from Intel's CLMUL guide */ const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const __m128i b = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H)), BSWAP_MASK); + const __m128i* input = reinterpret_cast<const __m128i*>(input_bytes); + + const __m128i* H_pow_mm = reinterpret_cast<const __m128i*>(H_pow); + + const __m128i H = _mm_loadu_si128(H_pow_mm); __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x)); a = _mm_shuffle_epi8(a, BSWAP_MASK); + if(blocks >= 4) + { + const __m128i H2 = _mm_loadu_si128(H_pow_mm + 1); + const __m128i H3 = _mm_loadu_si128(H_pow_mm + 2); + const __m128i H4 = _mm_loadu_si128(H_pow_mm + 3); + + while(blocks >= 4) + { + const __m128i m0 = _mm_shuffle_epi8(_mm_loadu_si128(input + 0), BSWAP_MASK); + const __m128i m1 = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), BSWAP_MASK); + const __m128i m2 = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), BSWAP_MASK); + const __m128i m3 = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), BSWAP_MASK); + + a = _mm_xor_si128(a, m0); + a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a); + + input += 4; + blocks -= 4; + } + } + for(size_t i = 0; i != blocks; ++i) { - __m128i m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input) + i); - m = _mm_shuffle_epi8(m, BSWAP_MASK); + const __m128i m = _mm_shuffle_epi8(_mm_loadu_si128(input + i), BSWAP_MASK); a = 
_mm_xor_si128(a, m); - - __m128i T0, T1, T2, T3, T4, T5; - - T0 = _mm_clmulepi64_si128(a, b, 0x00); - T1 = _mm_clmulepi64_si128(a, b, 0x01); - T2 = _mm_clmulepi64_si128(a, b, 0x10); - T3 = _mm_clmulepi64_si128(a, b, 0x11); - - T1 = _mm_xor_si128(T1, T2); - T2 = _mm_slli_si128(T1, 8); - T1 = _mm_srli_si128(T1, 8); - T0 = _mm_xor_si128(T0, T2); - T3 = _mm_xor_si128(T3, T1); - - T4 = _mm_srli_epi32(T0, 31); - T0 = _mm_slli_epi32(T0, 1); - - T5 = _mm_srli_epi32(T3, 31); - T3 = _mm_slli_epi32(T3, 1); - - T2 = _mm_srli_si128(T4, 12); - T5 = _mm_slli_si128(T5, 4); - T4 = _mm_slli_si128(T4, 4); - T0 = _mm_or_si128(T0, T4); - T3 = _mm_or_si128(T3, T5); - T3 = _mm_or_si128(T3, T2); - - T4 = _mm_slli_epi32(T0, 31); - T5 = _mm_slli_epi32(T0, 30); - T2 = _mm_slli_epi32(T0, 25); - - T4 = _mm_xor_si128(T4, T5); - T4 = _mm_xor_si128(T4, T2); - T5 = _mm_srli_si128(T4, 4); - T3 = _mm_xor_si128(T3, T5); - T4 = _mm_slli_si128(T4, 12); - T0 = _mm_xor_si128(T0, T4); - T3 = _mm_xor_si128(T3, T0); - - T4 = _mm_srli_epi32(T0, 1); - T1 = _mm_srli_epi32(T0, 2); - T2 = _mm_srli_epi32(T0, 7); - T3 = _mm_xor_si128(T3, T1); - T3 = _mm_xor_si128(T3, T2); - T3 = _mm_xor_si128(T3, T4); - - a = T3; + a = gcm_multiply(H, a); } a = _mm_shuffle_epi8(a, BSWAP_MASK); diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul/clmul.h index d68e021d2..25cdfbd96 100644 --- a/src/lib/modes/aead/gcm/clmul/clmul.h +++ b/src/lib/modes/aead/gcm/clmul/clmul.h @@ -12,7 +12,10 @@ namespace Botan { -void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16], +void gcm_clmul_precompute(const uint8_t H[16], uint64_t H_pow[4*2]); + +void gcm_multiply_clmul(uint8_t x[16], + const uint64_t H_pow[4*2], const uint8_t input[], size_t blocks); } diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp index 57272f2ac..dfaffedb7 100644 --- a/src/lib/modes/aead/gcm/gcm.cpp +++ b/src/lib/modes/aead/gcm/gcm.cpp @@ -7,213 +7,12 @@ */ #include <botan/gcm.h> +#include <botan/ghash.h> 
#include <botan/block_cipher.h> -#include <botan/internal/ct_utils.h> -#include <botan/loadstor.h> #include <botan/ctr.h> -#include <botan/cpuid.h> - -#if defined(BOTAN_HAS_GCM_CLMUL) - #include <botan/internal/clmul.h> -#endif - -#if defined(BOTAN_HAS_GCM_PMULL) - #include <botan/internal/pmull.h> -#endif namespace Botan { -static const size_t GCM_BS = 16; - -void GHASH::gcm_multiply(secure_vector<uint8_t>& x, - const uint8_t input[], - size_t blocks) - { -#if defined(BOTAN_HAS_GCM_CLMUL) - if(CPUID::has_clmul()) - { - return gcm_multiply_clmul(x.data(), m_H.data(), input, blocks); - } -#endif - -#if defined(BOTAN_HAS_GCM_PMULL) - if(CPUID::has_arm_pmull()) - { - return gcm_multiply_pmull(x.data(), m_H.data(), input, blocks); - } -#endif - - CT::poison(x.data(), x.size()); - - // SSE2 might be useful here - - const uint64_t ALL_BITS = 0xFFFFFFFFFFFFFFFF; - - uint64_t X[2] = { - load_be<uint64_t>(x.data(), 0), - load_be<uint64_t>(x.data(), 1) - }; - - for(size_t b = 0; b != blocks; ++b) - { - X[0] ^= load_be<uint64_t>(input, 2*b); - X[1] ^= load_be<uint64_t>(input, 2*b+1); - - uint64_t Z[2] = { 0, 0 }; - - for(size_t i = 0; i != 64; ++i) - { - const uint64_t X0MASK = (ALL_BITS + (X[0] >> 63)) ^ ALL_BITS; - const uint64_t X1MASK = (ALL_BITS + (X[1] >> 63)) ^ ALL_BITS; - - X[0] <<= 1; - X[1] <<= 1; - - Z[0] ^= m_HM[4*i ] & X0MASK; - Z[1] ^= m_HM[4*i+1] & X0MASK; - Z[0] ^= m_HM[4*i+2] & X1MASK; - Z[1] ^= m_HM[4*i+3] & X1MASK; - } - - X[0] = Z[0]; - X[1] = Z[1]; - } - - store_be<uint64_t>(x.data(), X[0], X[1]); - CT::unpoison(x.data(), x.size()); - } - -void GHASH::ghash_update(secure_vector<uint8_t>& ghash, - const uint8_t input[], size_t length) - { - /* - This assumes if less than block size input then we're just on the - final block and should pad with zeros - */ - - const size_t full_blocks = length / GCM_BS; - const size_t final_bytes = length - (full_blocks * GCM_BS); - - if(full_blocks > 0) - { - gcm_multiply(ghash, input, full_blocks); - } - - if(final_bytes) 
- { - secure_vector<uint8_t> last_block(GCM_BS); - copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes); - gcm_multiply(ghash, last_block.data(), 1); - } - } - -void GHASH::key_schedule(const uint8_t key[], size_t length) - { - m_H.assign(key, key+length); - m_H_ad.resize(GCM_BS); - m_ad_len = 0; - m_text_len = 0; - - uint64_t H0 = load_be<uint64_t>(m_H.data(), 0); - uint64_t H1 = load_be<uint64_t>(m_H.data(), 1); - - const uint64_t R = 0xE100000000000000; - - m_HM.resize(256); - - // precompute the multiples of H - for(size_t i = 0; i != 2; ++i) - { - for(size_t j = 0; j != 64; ++j) - { - /* - we interleave H^1, H^65, H^2, H^66, ... - to make indexing nicer in the multiplication code - */ - m_HM[4*j+2*i] = H0; - m_HM[4*j+2*i+1] = H1; - - // GCM's bit ops are reversed so we carry out of the bottom - const uint64_t carry = R * (H1 & 1); - H1 = (H1 >> 1) | (H0 << 63); - H0 = (H0 >> 1) ^ carry; - } - } - } - -void GHASH::start(const uint8_t nonce[], size_t len) - { - m_nonce.assign(nonce, nonce + len); - m_ghash = m_H_ad; - } - -void GHASH::set_associated_data(const uint8_t input[], size_t length) - { - zeroise(m_H_ad); - - ghash_update(m_H_ad, input, length); - m_ad_len = length; - } - -void GHASH::update_associated_data(const uint8_t ad[], size_t length) - { - BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set"); - m_ad_len += length; - ghash_update(m_ghash, ad, length); - } - -void GHASH::update(const uint8_t input[], size_t length) - { - BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set"); - m_text_len += length; - ghash_update(m_ghash, input, length); - } - -void GHASH::add_final_block(secure_vector<uint8_t>& hash, - size_t ad_len, size_t text_len) - { - secure_vector<uint8_t> final_block(GCM_BS); - store_be<uint64_t>(final_block.data(), 8*ad_len, 8*text_len); - ghash_update(hash, final_block.data(), final_block.size()); - } - -secure_vector<uint8_t> GHASH::final() - { - add_final_block(m_ghash, m_ad_len, m_text_len); - - 
secure_vector<uint8_t> mac; - mac.swap(m_ghash); - - mac ^= m_nonce; - m_text_len = 0; - return mac; - } - -secure_vector<uint8_t> GHASH::nonce_hash(const uint8_t nonce[], size_t nonce_len) - { - BOTAN_ASSERT(m_ghash.size() == 0, "nonce_hash called during wrong time"); - secure_vector<uint8_t> y0(GCM_BS); - - ghash_update(y0, nonce, nonce_len); - add_final_block(y0, 0, nonce_len); - - return y0; - } - -void GHASH::clear() - { - zeroise(m_H); - reset(); - } - -void GHASH::reset() - { - zeroise(m_H_ad); - m_ghash.clear(); - m_nonce.clear(); - m_text_len = m_ad_len = 0; - } - /* * GCM_Mode Constructor */ @@ -255,12 +54,7 @@ std::string GCM_Mode::name() const std::string GCM_Mode::provider() const { -#if defined(BOTAN_HAS_GCM_CLMUL) - if(CPUID::has_clmul()) - return "clmul"; -#endif - - return "base"; + return m_ghash->provider(); } size_t GCM_Mode::update_granularity() const @@ -309,10 +103,10 @@ void GCM_Mode::start_msg(const uint8_t nonce[], size_t nonce_len) m_ctr->set_iv(y0.data(), y0.size()); - secure_vector<uint8_t> m_enc_y0(GCM_BS); - m_ctr->encipher(m_enc_y0); + zeroise(y0); + m_ctr->encipher(y0); - m_ghash->start(m_enc_y0.data(), m_enc_y0.size()); + m_ghash->start(y0.data(), y0.size()); } size_t GCM_Encryption::process(uint8_t buf[], size_t sz) diff --git a/src/lib/modes/aead/gcm/gcm.h b/src/lib/modes/aead/gcm/gcm.h index eac2add93..de7c1ea9a 100644 --- a/src/lib/modes/aead/gcm/gcm.h +++ b/src/lib/modes/aead/gcm/gcm.h @@ -47,7 +47,7 @@ class BOTAN_PUBLIC_API(2,0) GCM_Mode : public AEAD_Mode ~GCM_Mode(); - const size_t m_BS = 16; + static const size_t GCM_BS = 16; const size_t m_tag_size; const std::string m_cipher_name; @@ -109,62 +109,6 @@ class BOTAN_PUBLIC_API(2,0) GCM_Decryption final : public GCM_Mode void finish(secure_vector<uint8_t>& final_block, size_t offset = 0) override; }; -/** -* GCM's GHASH -* This is not intended for general use, but is exposed to allow -* shared code between GCM and GMAC -*/ -class BOTAN_PUBLIC_API(2,0) GHASH final : public 
SymmetricAlgorithm - { - public: - void set_associated_data(const uint8_t ad[], size_t ad_len); - - secure_vector<uint8_t> nonce_hash(const uint8_t nonce[], size_t len); - - void start(const uint8_t nonce[], size_t len); - - /* - * Assumes input len is multiple of 16 - */ - void update(const uint8_t in[], size_t len); - - /* - * Incremental update of associated data - */ - void update_associated_data(const uint8_t ad[], size_t len); - - secure_vector<uint8_t> final(); - - Key_Length_Specification key_spec() const override - { return Key_Length_Specification(16); } - - void clear() override; - - void reset(); - - std::string name() const override { return "GHASH"; } - - void ghash_update(secure_vector<uint8_t>& x, - const uint8_t input[], size_t input_len); - - void add_final_block(secure_vector<uint8_t>& x, - size_t ad_len, size_t pt_len); - private: - void key_schedule(const uint8_t key[], size_t key_len) override; - - void gcm_multiply(secure_vector<uint8_t>& x, - const uint8_t input[], - size_t blocks); - - secure_vector<uint8_t> m_H; - secure_vector<uint8_t> m_H_ad; - secure_vector<uint8_t> m_ghash; - secure_vector<uint8_t> m_nonce; - secure_vector<uint64_t> m_HM; - size_t m_ad_len = 0; - size_t m_text_len = 0; - }; - } #endif diff --git a/src/lib/modes/aead/gcm/ghash.cpp b/src/lib/modes/aead/gcm/ghash.cpp new file mode 100644 index 000000000..c3c2453e8 --- /dev/null +++ b/src/lib/modes/aead/gcm/ghash.cpp @@ -0,0 +1,250 @@ +/* +* GCM GHASH +* (C) 2013,2015,2017 Jack Lloyd +* (C) 2016 Daniel Neus, Rohde & Schwarz Cybersecurity +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/ghash.h> +#include <botan/internal/ct_utils.h> +#include <botan/loadstor.h> +#include <botan/cpuid.h> + +#if defined(BOTAN_HAS_GCM_CLMUL) + #include <botan/internal/clmul.h> +#endif + +#if defined(BOTAN_HAS_GCM_PMULL) + #include <botan/internal/pmull.h> +#endif + +namespace Botan { + +std::string GHASH::provider() const + { +#if 
defined(BOTAN_HAS_GCM_CLMUL) + if(CPUID::has_clmul()) + return "clmul"; +#endif + +#if defined(BOTAN_HAS_GCM_PMULL) + if(CPUID::has_arm_pmull()) + return "pmull"; +#endif + + return "base"; + } + +void GHASH::gcm_multiply(secure_vector<uint8_t>& x, + const uint8_t input[], + size_t blocks) + { +#if defined(BOTAN_HAS_GCM_CLMUL) + if(CPUID::has_clmul()) + { + return gcm_multiply_clmul(x.data(), m_H_pow.data(), input, blocks); + } +#endif + +#if defined(BOTAN_HAS_GCM_PMULL) + if(CPUID::has_arm_pmull()) + { + return gcm_multiply_pmull(x.data(), m_H_pow.data(), input, blocks); + } +#endif + + CT::poison(x.data(), x.size()); + + // SSE2 might be useful here + + const uint64_t ALL_BITS = 0xFFFFFFFFFFFFFFFF; + + uint64_t X[2] = { + load_be<uint64_t>(x.data(), 0), + load_be<uint64_t>(x.data(), 1) + }; + + for(size_t b = 0; b != blocks; ++b) + { + X[0] ^= load_be<uint64_t>(input, 2*b); + X[1] ^= load_be<uint64_t>(input, 2*b+1); + + uint64_t Z[2] = { 0, 0 }; + + for(size_t i = 0; i != 64; ++i) + { + const uint64_t X0MASK = (ALL_BITS + (X[0] >> 63)) ^ ALL_BITS; + const uint64_t X1MASK = (ALL_BITS + (X[1] >> 63)) ^ ALL_BITS; + + X[0] <<= 1; + X[1] <<= 1; + + Z[0] ^= m_HM[4*i ] & X0MASK; + Z[1] ^= m_HM[4*i+1] & X0MASK; + Z[0] ^= m_HM[4*i+2] & X1MASK; + Z[1] ^= m_HM[4*i+3] & X1MASK; + } + + X[0] = Z[0]; + X[1] = Z[1]; + } + + store_be<uint64_t>(x.data(), X[0], X[1]); + CT::unpoison(x.data(), x.size()); + } + +void GHASH::ghash_update(secure_vector<uint8_t>& ghash, + const uint8_t input[], size_t length) + { + /* + This assumes if less than block size input then we're just on the + final block and should pad with zeros + */ + + const size_t full_blocks = length / GCM_BS; + const size_t final_bytes = length - (full_blocks * GCM_BS); + + if(full_blocks > 0) + { + gcm_multiply(ghash, input, full_blocks); + } + + if(final_bytes) + { + secure_vector<uint8_t> last_block(GCM_BS); + copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes); + gcm_multiply(ghash, 
last_block.data(), 1); + } + } + +void GHASH::key_schedule(const uint8_t key[], size_t length) + { + m_H.assign(key, key+length); + m_H_ad.resize(GCM_BS); + m_ad_len = 0; + m_text_len = 0; + + uint64_t H0 = load_be<uint64_t>(m_H.data(), 0); + uint64_t H1 = load_be<uint64_t>(m_H.data(), 1); + + const uint64_t R = 0xE100000000000000; + + m_HM.resize(256); + + // precompute the multiples of H + for(size_t i = 0; i != 2; ++i) + { + for(size_t j = 0; j != 64; ++j) + { + /* + we interleave H^1, H^65, H^2, H^66, H^3, H^67, H^4, H^68 + to make indexing nicer in the multiplication code + */ + m_HM[4*j+2*i] = H0; + m_HM[4*j+2*i+1] = H1; + + // GCM's bit ops are reversed so we carry out of the bottom + const uint64_t carry = R * (H1 & 1); + H1 = (H1 >> 1) | (H0 << 63); + H0 = (H0 >> 1) ^ carry; + } + } + +#if defined(BOTAN_HAS_GCM_CLMUL) + if(CPUID::has_clmul()) + { + m_H_pow.resize(8); + gcm_clmul_precompute(m_H.data(), m_H_pow.data()); + } +#endif + +#if defined(BOTAN_HAS_GCM_PMULL) + if(CPUID::has_arm_pmull()) + { + m_H_pow.resize(8); + gcm_pmull_precompute(m_H.data(), m_H_pow.data()); + } +#endif + + } + +void GHASH::start(const uint8_t nonce[], size_t len) + { + m_nonce.assign(nonce, nonce + len); + m_ghash = m_H_ad; + } + +void GHASH::set_associated_data(const uint8_t input[], size_t length) + { + zeroise(m_H_ad); + + ghash_update(m_H_ad, input, length); + m_ad_len = length; + } + +void GHASH::update_associated_data(const uint8_t ad[], size_t length) + { + BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set"); + m_ad_len += length; + ghash_update(m_ghash, ad, length); + } + +void GHASH::update(const uint8_t input[], size_t length) + { + BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set"); + m_text_len += length; + ghash_update(m_ghash, input, length); + } + +void GHASH::add_final_block(secure_vector<uint8_t>& hash, + size_t ad_len, size_t text_len) + { + /* + * stack buffer is fine here since the text len is public + * and the length of the AD is probably not sensitive 
either. + */ + uint8_t final_block[GCM_BS]; + store_be<uint64_t>(final_block, 8*ad_len, 8*text_len); + ghash_update(hash, final_block, GCM_BS); + } + +secure_vector<uint8_t> GHASH::final() + { + add_final_block(m_ghash, m_ad_len, m_text_len); + + secure_vector<uint8_t> mac; + mac.swap(m_ghash); + + mac ^= m_nonce; + m_text_len = 0; + return mac; + } + +secure_vector<uint8_t> GHASH::nonce_hash(const uint8_t nonce[], size_t nonce_len) + { + BOTAN_ASSERT(m_ghash.size() == 0, "nonce_hash called during wrong time"); + secure_vector<uint8_t> y0(GCM_BS); + + ghash_update(y0, nonce, nonce_len); + add_final_block(y0, 0, nonce_len); + + return y0; + } + +void GHASH::clear() + { + zeroise(m_H); + zeroise(m_HM); + reset(); + } + +void GHASH::reset() + { + zeroise(m_H_ad); + m_ghash.clear(); + m_nonce.clear(); + m_text_len = m_ad_len = 0; + } + +} diff --git a/src/lib/modes/aead/gcm/ghash.h b/src/lib/modes/aead/gcm/ghash.h new file mode 100644 index 000000000..7fcf7cfaa --- /dev/null +++ b/src/lib/modes/aead/gcm/ghash.h @@ -0,0 +1,78 @@ +/* +* (C) 2013 Jack Lloyd +* (C) 2016 Daniel Neus, Rohde & Schwarz Cybersecurity +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#ifndef BOTAN_GCM_GHASH_H_ +#define BOTAN_GCM_GHASH_H_ + +#include <botan/sym_algo.h> + +namespace Botan { + +/** +* GCM's GHASH +* This is not intended for general use, but is exposed to allow +* shared code between GCM and GMAC +*/ +class BOTAN_PUBLIC_API(2,0) GHASH final : public SymmetricAlgorithm + { + public: + void set_associated_data(const uint8_t ad[], size_t ad_len); + + secure_vector<uint8_t> nonce_hash(const uint8_t nonce[], size_t len); + + void start(const uint8_t nonce[], size_t len); + + /* + * Assumes input len is multiple of 16 + */ + void update(const uint8_t in[], size_t len); + + /* + * Incremental update of associated data + */ + void update_associated_data(const uint8_t ad[], size_t len); + + secure_vector<uint8_t> final(); + + Key_Length_Specification 
key_spec() const override + { return Key_Length_Specification(16); } + + void clear() override; + + void reset(); + + std::string name() const override { return "GHASH"; } + + std::string provider() const; + + void ghash_update(secure_vector<uint8_t>& x, + const uint8_t input[], size_t input_len); + + void add_final_block(secure_vector<uint8_t>& x, + size_t ad_len, size_t pt_len); + private: + void key_schedule(const uint8_t key[], size_t key_len) override; + + void gcm_multiply(secure_vector<uint8_t>& x, + const uint8_t input[], + size_t blocks); + + static const size_t GCM_BS = 16; + + secure_vector<uint8_t> m_H; + secure_vector<uint8_t> m_H_ad; + secure_vector<uint8_t> m_ghash; + secure_vector<uint8_t> m_nonce; + secure_vector<uint64_t> m_HM; + secure_vector<uint64_t> m_H_pow; + size_t m_ad_len = 0; + size_t m_text_len = 0; + }; + +} + +#endif diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp index 12d6ff7d1..77eb1909f 100644 --- a/src/lib/modes/aead/gcm/pmull/pmull.cpp +++ b/src/lib/modes/aead/gcm/pmull/pmull.cpp @@ -1,6 +1,9 @@ /* * Contributed by Jeffrey Walton * +* Further changes +* (C) 2017 Jack Lloyd +* * Botan is released under the Simplified BSD License (see license.txt) */ @@ -9,74 +12,191 @@ namespace Botan { +/* +This follows the same pattern as the clmul implementation. 
+ +See also http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf +*/ + +namespace { + +BOTAN_FUNC_ISA("+simd") +inline uint64x2_t gcm_reduce(uint32x4_t B0, uint32x4_t B1) + { + const uint32x4_t zero = vdupq_n_u32(0); + + uint32x4_t T0, T1, T2, T3, T4, T5; + + T4 = vshrq_n_u32(B0, 31); + T0 = vshlq_n_u32(B0, 1); + T5 = vshrq_n_u32(B1, 31); + T3 = vshlq_n_u32(B1, 1); + + T2 = vextq_u32(T4, zero, 3); + T5 = vextq_u32(zero, T5, 3); + T4 = vextq_u32(zero, T4, 3); + T0 = vorrq_u32(T0, T4); + T3 = vorrq_u32(T3, T5); + T3 = vorrq_u32(T3, T2); + + T4 = vshlq_n_u32(T0, 31); + T5 = vshlq_n_u32(T0, 30); + T2 = vshlq_n_u32(T0, 25); + + T4 = veorq_u32(T4, T5); + T4 = veorq_u32(T4, T2); + T5 = vextq_u32(T4, zero, 1); + T3 = veorq_u32(T3, T5); + T4 = vextq_u32(zero, T4, 1); + T0 = veorq_u32(T0, T4); + T3 = veorq_u32(T3, T0); + + T4 = vshrq_n_u32(T0, 1); + T1 = vshrq_n_u32(T0, 2); + T2 = vshrq_n_u32(T0, 7); + T3 = veorq_u32(T3, T1); + T3 = veorq_u32(T3, T2); + T3 = veorq_u32(T3, T4); + + return vreinterpretq_u64_u32(T3); + } + +BOTAN_FUNC_ISA("+crypto") +inline uint64x2_t gcm_multiply(uint64x2_t H, uint64x2_t x) + { + const uint32x4_t zero = vdupq_n_u32(0); + + const uint64_t x_hi = vgetq_lane_u64(x, 0); + const uint64_t x_lo = vgetq_lane_u64(x, 1); + const uint64_t H_hi = vgetq_lane_u64(H, 0); + const uint64_t H_lo = vgetq_lane_u64(H, 1); + + uint32x4_t T0 = (uint32x4_t)vmull_p64(x_hi, H_hi); + uint32x4_t T1 = (uint32x4_t)vmull_p64(x_lo, H_hi); + uint32x4_t T2 = (uint32x4_t)vmull_p64(x_hi, H_lo); + uint32x4_t T3 = (uint32x4_t)vmull_p64(x_lo, H_lo); + + T1 = veorq_u32(T1, T2); + T0 = veorq_u32(T0, vextq_u32(zero, T1, 2)); + T3 = veorq_u32(T3, vextq_u32(T1, zero, 2)); + + return gcm_reduce(T0, T3); + } + BOTAN_FUNC_ISA("+crypto") -void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16], +inline uint64x2_t gcm_multiply_x4(uint64x2_t H1, uint64x2_t H2, uint64x2_t H3, uint64x2_t H4, + uint64x2_t X1, uint64x2_t X2, uint64x2_t X3, uint64x2_t X4) + { + const uint64_t H1_hi = 
vgetq_lane_u64(H1, 0); + const uint64_t H1_lo = vgetq_lane_u64(H1, 1); + const uint64_t H2_hi = vgetq_lane_u64(H2, 0); + const uint64_t H2_lo = vgetq_lane_u64(H2, 1); + const uint64_t H3_hi = vgetq_lane_u64(H3, 0); + const uint64_t H3_lo = vgetq_lane_u64(H3, 1); + const uint64_t H4_hi = vgetq_lane_u64(H4, 0); + const uint64_t H4_lo = vgetq_lane_u64(H4, 1); + + const uint64_t X1_hi = vgetq_lane_u64(X1, 0); + const uint64_t X1_lo = vgetq_lane_u64(X1, 1); + const uint64_t X2_hi = vgetq_lane_u64(X2, 0); + const uint64_t X2_lo = vgetq_lane_u64(X2, 1); + const uint64_t X3_hi = vgetq_lane_u64(X3, 0); + const uint64_t X3_lo = vgetq_lane_u64(X3, 1); + const uint64_t X4_hi = vgetq_lane_u64(X4, 0); + const uint64_t X4_lo = vgetq_lane_u64(X4, 1); + + const uint32x4_t H1_X1_lo = (uint32x4_t)vmull_p64(X1_lo, H1_lo); + const uint32x4_t H2_X2_lo = (uint32x4_t)vmull_p64(X2_lo, H2_lo); + const uint32x4_t H3_X3_lo = (uint32x4_t)vmull_p64(X3_lo, H3_lo); + const uint32x4_t H4_X4_lo = (uint32x4_t)vmull_p64(X4_lo, H4_lo); + + const uint32x4_t lo = veorq_u32( + veorq_u32(H1_X1_lo, H2_X2_lo), + veorq_u32(H3_X3_lo, H4_X4_lo)); + + const uint32x4_t H1_X1_hi = (uint32x4_t)vmull_p64(X1_hi, H1_hi); + const uint32x4_t H2_X2_hi = (uint32x4_t)vmull_p64(X2_hi, H2_hi); + const uint32x4_t H3_X3_hi = (uint32x4_t)vmull_p64(X3_hi, H3_hi); + const uint32x4_t H4_X4_hi = (uint32x4_t)vmull_p64(X4_hi, H4_hi); + + const uint32x4_t hi = veorq_u32( + veorq_u32(H1_X1_hi, H2_X2_hi), + veorq_u32(H3_X3_hi, H4_X4_hi)); + + uint32x4_t T0 = veorq_u32(lo, hi); + + T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X1_hi ^ X1_lo, H1_hi ^ H1_lo)); + T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X2_hi ^ X2_lo, H2_hi ^ H2_lo)); + T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X3_hi ^ X3_lo, H3_hi ^ H3_lo)); + T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X4_hi ^ X4_lo, H4_hi ^ H4_lo)); + + const uint32x4_t zero = vdupq_n_u32(0); + uint32x4_t B0 = veorq_u32(vextq_u32(zero, T0, 2), hi); + uint32x4_t B1 = veorq_u32(vextq_u32(T0, zero, 2), lo); + 
return gcm_reduce(B0, B1); + } + +BOTAN_FUNC_ISA("+simd") +inline uint8x16_t bswap_vec(uint8x16_t v) + { + const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + const uint8x16_t mask = vld1q_u8(maskb); + return vqtbl1q_u8(v, mask); + } + +} + +BOTAN_FUNC_ISA("+simd") +void gcm_pmull_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2]) + { + const uint64x2_t H = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(H_bytes))); + const uint64x2_t H2 = gcm_multiply(H, H); + const uint64x2_t H3 = gcm_multiply(H, H2); + const uint64x2_t H4 = gcm_multiply(H, H3); + + vst1q_u64(H_pow , H); + vst1q_u64(H_pow+2, H2); + vst1q_u64(H_pow+4, H3); + vst1q_u64(H_pow+6, H4); + } + +BOTAN_FUNC_ISA("+simd") +void gcm_multiply_pmull(uint8_t x[16], + const uint64_t H64[8], const uint8_t input[], size_t blocks) { - /* - * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf - */ + const uint64x2_t H = vld1q_u64(H64); + uint64x2_t a = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(x))); - uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x)))); - const uint64x2_t b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(H+8)), vrev64_u8(vld1_u8(H)))); + if(blocks >= 4) + { + const uint64x2_t H2 = vld1q_u64(H64 + 2); + const uint64x2_t H3 = vld1q_u64(H64 + 4); + const uint64x2_t H4 = vld1q_u64(H64 + 6); + + while(blocks >= 4) + { + const uint64x2_t m0 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input))); + const uint64x2_t m1 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16))); + const uint64x2_t m2 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 32))); + const uint64x2_t m3 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 48))); + + a = veorq_u64(a, m0); + a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a); + + input += 64; + blocks -= 4; + } + } for(size_t i = 0; i != blocks; ++i) { - const uint64x2_t m64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(input+8)), 
vrev64_u8(vld1_u8(input)))); - input += 16; - - a64 = veorq_u64(a64, m64); - - uint64x2_t T0, T1, T2, T3, T4, T5; - - T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0)); - T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0)); - T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1)); - T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1)); - - T1 = veorq_u64(T1, T2); - T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8)); - T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8)); - T0 = veorq_u64(T0, T2); - T3 = veorq_u64(T3, T1); - - T4 = vshrq_n_u64(T0, 31); - T0 = vshlq_n_u64(T0, 1); - - T5 = vshrq_n_u64(T3, 31); - T3 = vshlq_n_u64(T3, 1); - - T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12)); - T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12)); - T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12)); - T0 = vorrq_u64(T0, T4); - T3 = vorrq_u64(T3, T5); - T3 = vorrq_u64(T3, T2); - - T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31)); - T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30)); - T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25)); - - T4 = veorq_u64(T4, T5); - T4 = veorq_u64(T4, T2); - T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4)); - T3 = veorq_u64(T3, T5); - T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4)); - T0 = veorq_u64(T0, T4); - T3 = veorq_u64(T3, T0); - - T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1)); - T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2)); - T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7)); - T3 = veorq_u64(T3, T1); - T3 = veorq_u64(T3, T2); - T3 = veorq_u64(T3, T4); - - a64 = T3; + const uint64x2_t m = 
vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16*i))); + a = veorq_u64(a, m); + a = gcm_multiply(H, a); } - vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(a64)))); - vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(a64)))); + vst1q_u8(x, bswap_vec(vreinterpretq_u8_u64(a))); } } diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h index 638b845cd..17e61097f 100644 --- a/src/lib/modes/aead/gcm/pmull/pmull.h +++ b/src/lib/modes/aead/gcm/pmull/pmull.h @@ -12,7 +12,10 @@ namespace Botan { -void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16], +void gcm_pmull_precompute(const uint8_t H[16], uint64_t H_pow[4*2]); + +void gcm_multiply_pmull(uint8_t x[16], + const uint64_t H[8], const uint8_t input[], size_t blocks); } |