Diffstat (limited to 'src/lib/modes/aead/gcm')
 src/lib/modes/aead/gcm/clmul/clmul.cpp | 113
 src/lib/modes/aead/gcm/clmul/clmul.h   |   3
 src/lib/modes/aead/gcm/gcm.cpp         | 111
 src/lib/modes/aead/gcm/gcm.h           |  27
 src/lib/modes/aead/gcm/pmull/pmull.cpp | 109
 src/lib/modes/aead/gcm/pmull/pmull.h   |   3
6 files changed, 215 insertions, 151 deletions
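
As a reading aid before the per-file hunks: the core of this change is a GHASH multiply that now consumes whole runs of 16-byte blocks per call, and, on the portable (non-CLMUL/PMULL) path, replaces the bit-at-a-time shifting of H with a 256-entry table of precomputed shifts built once at key setup. The sketch below restates that portable path as standalone C++; the helper names (ghash_precompute, ghash_multiply, load_be64, store_be64) are illustrative stand-ins, not Botan's API.

#include <cstddef>
#include <cstdint>
#include <vector>

// Big-endian helpers, stand-ins for Botan's load_be/store_be
static uint64_t load_be64(const uint8_t in[8])
   {
   uint64_t r = 0;
   for(size_t i = 0; i != 8; ++i)
      r = (r << 8) | in[i];
   return r;
   }

static void store_be64(uint8_t out[8], uint64_t v)
   {
   for(size_t i = 0; i != 8; ++i)
      out[i] = static_cast<uint8_t>(v >> (56 - 8*i));
   }

// Build the 256-entry table from the 16-byte hash key H:
// entries 4*j .. 4*j+3 hold the halves of H*x^j and H*x^(64+j), interleaved
std::vector<uint64_t> ghash_precompute(const uint8_t H[16])
   {
   uint64_t H0 = load_be64(H);
   uint64_t H1 = load_be64(H + 8);
   const uint64_t R = 0xE100000000000000;

   std::vector<uint64_t> HM(256);
   for(size_t i = 0; i != 2; ++i)
      {
      for(size_t j = 0; j != 64; ++j)
         {
         HM[4*j + 2*i]     = H0;
         HM[4*j + 2*i + 1] = H1;
         // GCM's bit order is reversed, so the carry comes out of the bottom
         const uint64_t carry = R * (H1 & 1);
         H1 = (H1 >> 1) | (H0 << 63);
         H0 = (H0 >> 1) ^ carry;
         }
      }
   return HM;
   }

// x <- (x xor block) * H for each 16-byte block of input, using masks and
// data-independent table indices rather than data-dependent branches
void ghash_multiply(uint8_t x[16], const std::vector<uint64_t>& HM,
                    const uint8_t input[], size_t blocks)
   {
   uint64_t X0 = load_be64(x);
   uint64_t X1 = load_be64(x + 8);

   for(size_t b = 0; b != blocks; ++b)
      {
      X0 ^= load_be64(input + 16*b);
      X1 ^= load_be64(input + 16*b + 8);

      uint64_t Z0 = 0, Z1 = 0;
      for(size_t i = 0; i != 64; ++i)
         {
         // all-ones mask if bit i (from the top) of X0/X1 is set, else zero
         const uint64_t X0MASK = 0 - ((X0 >> (63 - i)) & 1);
         const uint64_t X1MASK = 0 - ((X1 >> (63 - i)) & 1);
         Z0 ^= HM[4*i]     & X0MASK;
         Z1 ^= HM[4*i + 1] & X0MASK;
         Z0 ^= HM[4*i + 2] & X1MASK;
         Z1 ^= HM[4*i + 3] & X1MASK;
         }
      X0 = Z0;
      X1 = Z1;
      }

   store_be64(x, X0);
   store_be64(x + 8, X1);
   }

Carrying the accumulator across blocks in local variables is also what lets the CLMUL and PMULL paths below keep the GHASH state in a register instead of storing and reloading it for every 16 bytes of input.
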
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp
index ed3473b4e..33378d833 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.cpp
+++ b/src/lib/modes/aead/gcm/clmul/clmul.cpp
@@ -12,67 +12,76 @@ namespace Botan {
 
 BOTAN_FUNC_ISA("pclmul,ssse3")
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks)
    {
    /*
    * Algorithms 1 and 5 from Intel's CLMUL guide
    */
    const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
+   const __m128i b = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H)), BSWAP_MASK);
+
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
-   __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H));
+   a = _mm_shuffle_epi8(a, BSWAP_MASK);
+
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      __m128i m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input) + i);
+      m = _mm_shuffle_epi8(m, BSWAP_MASK);
+
+      a = _mm_xor_si128(a, m);
+
+      __m128i T0, T1, T2, T3, T4, T5;
+
+      T0 = _mm_clmulepi64_si128(a, b, 0x00);
+      T1 = _mm_clmulepi64_si128(a, b, 0x01);
+      T2 = _mm_clmulepi64_si128(a, b, 0x10);
+      T3 = _mm_clmulepi64_si128(a, b, 0x11);
+
+      T1 = _mm_xor_si128(T1, T2);
+      T2 = _mm_slli_si128(T1, 8);
+      T1 = _mm_srli_si128(T1, 8);
+      T0 = _mm_xor_si128(T0, T2);
+      T3 = _mm_xor_si128(T3, T1);
+
+      T4 = _mm_srli_epi32(T0, 31);
+      T0 = _mm_slli_epi32(T0, 1);
+
+      T5 = _mm_srli_epi32(T3, 31);
+      T3 = _mm_slli_epi32(T3, 1);
+
+      T2 = _mm_srli_si128(T4, 12);
+      T5 = _mm_slli_si128(T5, 4);
+      T4 = _mm_slli_si128(T4, 4);
+      T0 = _mm_or_si128(T0, T4);
+      T3 = _mm_or_si128(T3, T5);
+      T3 = _mm_or_si128(T3, T2);
+
+      T4 = _mm_slli_epi32(T0, 31);
+      T5 = _mm_slli_epi32(T0, 30);
+      T2 = _mm_slli_epi32(T0, 25);
+
+      T4 = _mm_xor_si128(T4, T5);
+      T4 = _mm_xor_si128(T4, T2);
+      T5 = _mm_srli_si128(T4, 4);
+      T3 = _mm_xor_si128(T3, T5);
+      T4 = _mm_slli_si128(T4, 12);
+      T0 = _mm_xor_si128(T0, T4);
+      T3 = _mm_xor_si128(T3, T0);
+
+      T4 = _mm_srli_epi32(T0, 1);
+      T1 = _mm_srli_epi32(T0, 2);
+      T2 = _mm_srli_epi32(T0, 7);
+      T3 = _mm_xor_si128(T3, T1);
+      T3 = _mm_xor_si128(T3, T2);
+      T3 = _mm_xor_si128(T3, T4);
+
+      a = T3;
+      }
 
    a = _mm_shuffle_epi8(a, BSWAP_MASK);
-   b = _mm_shuffle_epi8(b, BSWAP_MASK);
-
-   __m128i T0, T1, T2, T3, T4, T5;
-
-   T0 = _mm_clmulepi64_si128(a, b, 0x00);
-   T1 = _mm_clmulepi64_si128(a, b, 0x01);
-   T2 = _mm_clmulepi64_si128(a, b, 0x10);
-   T3 = _mm_clmulepi64_si128(a, b, 0x11);
-
-   T1 = _mm_xor_si128(T1, T2);
-   T2 = _mm_slli_si128(T1, 8);
-   T1 = _mm_srli_si128(T1, 8);
-   T0 = _mm_xor_si128(T0, T2);
-   T3 = _mm_xor_si128(T3, T1);
-
-   T4 = _mm_srli_epi32(T0, 31);
-   T0 = _mm_slli_epi32(T0, 1);
-
-   T5 = _mm_srli_epi32(T3, 31);
-   T3 = _mm_slli_epi32(T3, 1);
-
-   T2 = _mm_srli_si128(T4, 12);
-   T5 = _mm_slli_si128(T5, 4);
-   T4 = _mm_slli_si128(T4, 4);
-   T0 = _mm_or_si128(T0, T4);
-   T3 = _mm_or_si128(T3, T5);
-   T3 = _mm_or_si128(T3, T2);
-
-   T4 = _mm_slli_epi32(T0, 31);
-   T5 = _mm_slli_epi32(T0, 30);
-   T2 = _mm_slli_epi32(T0, 25);
-
-   T4 = _mm_xor_si128(T4, T5);
-   T4 = _mm_xor_si128(T4, T2);
-   T5 = _mm_srli_si128(T4, 4);
-   T3 = _mm_xor_si128(T3, T5);
-   T4 = _mm_slli_si128(T4, 12);
-   T0 = _mm_xor_si128(T0, T4);
-   T3 = _mm_xor_si128(T3, T0);
-
-   T4 = _mm_srli_epi32(T0, 1);
-   T1 = _mm_srli_epi32(T0, 2);
-   T2 = _mm_srli_epi32(T0, 7);
-   T3 = _mm_xor_si128(T3, T1);
-   T3 = _mm_xor_si128(T3, T2);
-   T3 = _mm_xor_si128(T3, T4);
-
-   T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);
-
-   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T3);
+   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
    }
 
 }
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul/clmul.h
index b47c73f27..d68e021d2 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.h
+++ b/src/lib/modes/aead/gcm/clmul/clmul.h
@@ -12,7 +12,8 @@
 
 namespace Botan {
 
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks);
 
 }
 
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index bb832245e..9079aa039 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -24,52 +24,52 @@ namespace Botan {
 
 static const size_t GCM_BS = 16;
 
-void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
+void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
+                         const uint8_t input[],
+                         size_t blocks)
    {
+   if(blocks == 0)
+      return;
+
 #if defined(BOTAN_HAS_GCM_CLMUL)
    if(CPUID::has_clmul())
-      return gcm_multiply_clmul(x.data(), m_H.data());
+      return gcm_multiply_clmul(x.data(), m_H.data(), input, blocks);
 #elif defined(BOTAN_HAS_GCM_PMULL)
    if(CPUID::has_arm_pmull())
-      return gcm_multiply_pmull(x.data(), m_H.data());
+      return gcm_multiply_pmull(x.data(), m_H.data(), input, blocks);
 #endif
 
-   static const uint64_t R = 0xE100000000000000;
-
-   uint64_t H[2] = {
-      load_be<uint64_t>(m_H.data(), 0),
-      load_be<uint64_t>(m_H.data(), 1)
-   };
-
-   uint64_t Z[2] = { 0, 0 };
-
-   CT::poison(H, 2);
-   CT::poison(Z, 2);
    CT::poison(x.data(), x.size());
 
    // SSE2 might be useful here
 
-   for(size_t i = 0; i != 2; ++i)
-      {
-      const uint64_t X = load_be<uint64_t>(x.data(), i);
+   const uint64_t ones = 0xFFFFFFFFFFFFFFFF;
 
-      uint64_t mask = 0x8000000000000000;
-      for(size_t j = 0; j != 64; ++j)
-         {
-         const uint64_t XMASK = CT::expand_mask<uint64_t>(X & mask);
-         mask >>= 1;
-         Z[0] ^= H[0] & XMASK;
-         Z[1] ^= H[1] & XMASK;
+   uint64_t X0 = load_be<uint64_t>(x.data(), 0);
+   uint64_t X1 = load_be<uint64_t>(x.data(), 1);
 
-         // GCM's bit ops are reversed so we carry out of the bottom
-         const uint64_t carry = R & CT::expand_mask<uint64_t>(H[1] & 1);
+   for(size_t b = 0; b != blocks; ++b)
+      {
+      X0 ^= load_be<uint64_t>(input, 2*b);
+      X1 ^= load_be<uint64_t>(input, 2*b+1);
+
+      uint64_t Z[2] = { 0, 0 };
 
-         H[1] = (H[1] >> 1) | (H[0] << 63);
-         H[0] = (H[0] >> 1) ^ carry;
+      for(size_t i = 0; i != 64; ++i)
+         {
+         const uint64_t X0MASK = ones * ((X0 >> (63-i)) & 1);
+         const uint64_t X1MASK = ones * ((X1 >> (63-i)) & 1);
+         Z[0] ^= m_HM[4*i  ] & X0MASK;
+         Z[1] ^= m_HM[4*i+1] & X0MASK;
+         Z[0] ^= m_HM[4*i+2] & X1MASK;
+         Z[1] ^= m_HM[4*i+3] & X1MASK;
         }
+
+      X0 = Z[0];
+      X1 = Z[1];
       }
 
-   store_be<uint64_t>(x.data(), Z[0], Z[1]);
+   store_be<uint64_t>(x.data(), X0, X1);
 
    CT::unpoison(x.data(), x.size());
    }
@@ -80,16 +80,20 @@ void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
    This assumes if less than block size input then we're just on the
    final block and should pad with zeros
    */
-   while(length)
-      {
-      const size_t to_proc = std::min(length, GCM_BS);
-      xor_buf(ghash.data(), input, to_proc);
+   const size_t full_blocks = length / GCM_BS;
+   const size_t final_bytes = length - (full_blocks * GCM_BS);
 
-      gcm_multiply(ghash);
+   if(full_blocks > 0)
+      {
+      gcm_multiply(ghash, input, full_blocks);
+      }
 
-      input += to_proc;
-      length -= to_proc;
+   if(final_bytes)
+      {
+      secure_vector<uint8_t> last_block(GCM_BS);
+      copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes);
+      gcm_multiply(ghash, last_block.data(), 1);
       }
    }
 
@@ -99,6 +103,32 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
    m_H_ad.resize(GCM_BS);
    m_ad_len = 0;
    m_text_len = 0;
+
+   uint64_t H0 = load_be<uint64_t>(m_H.data(), 0);
+   uint64_t H1 = load_be<uint64_t>(m_H.data(), 1);
+
+   const uint64_t R = 0xE100000000000000;
+
+   m_HM.resize(256);
+
+   // precompute the multiples of H
+   for(size_t i = 0; i != 2; ++i)
+      {
+      for(size_t j = 0; j != 64; ++j)
+         {
+         /*
+         we interleave H^1, H^65, H^2, H^66, ...
+         to make indexing nicer in the multiplication code
+         */
+         m_HM[4*j+2*i] = H0;
+         m_HM[4*j+2*i+1] = H1;
+
+         // GCM's bit ops are reversed so we carry out of the bottom
+         const uint64_t carry = R * (H1 & 1);
+         H1 = (H1 >> 1) | (H0 << 63);
+         H0 = (H0 >> 1) ^ carry;
+         }
+      }
    }
 
 void GHASH::start(const uint8_t nonce[], size_t len)
@@ -115,12 +145,17 @@ void GHASH::set_associated_data(const uint8_t input[], size_t length)
    m_ad_len = length;
    }
 
-void GHASH::update(const uint8_t input[], size_t length)
+void GHASH::update_associated_data(const uint8_t ad[], size_t length)
    {
    BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
+   m_ad_len += length;
+   ghash_update(m_ghash, ad, length);
+   }
 
+void GHASH::update(const uint8_t input[], size_t length)
+   {
+   BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
    m_text_len += length;
-
    ghash_update(m_ghash, input, length);
    }
 
diff --git a/src/lib/modes/aead/gcm/gcm.h b/src/lib/modes/aead/gcm/gcm.h
index deedfdade..eac2add93 100644
--- a/src/lib/modes/aead/gcm/gcm.h
+++ b/src/lib/modes/aead/gcm/gcm.h
@@ -111,9 +111,10 @@ class BOTAN_PUBLIC_API(2,0) GCM_Decryption final : public GCM_Mode
 
 /**
 * GCM's GHASH
-* Maybe a Transform?
+* This is not intended for general use, but is exposed to allow
+* shared code between GCM and GMAC
 */
-class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
+class BOTAN_PUBLIC_API(2,0) GHASH final : public SymmetricAlgorithm
    {
    public:
       void set_associated_data(const uint8_t ad[], size_t ad_len);
@@ -127,6 +128,11 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
       */
      void update(const uint8_t in[], size_t len);
 
+      /*
+      * Incremental update of associated data
+      */
+      void update_associated_data(const uint8_t ad[], size_t len);
+
       secure_vector<uint8_t> final();
 
       Key_Length_Specification key_spec() const override
@@ -137,24 +143,25 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
       void reset();
 
       std::string name() const override { return "GHASH"; }
 
-   protected:
+
       void ghash_update(secure_vector<uint8_t>& x,
                         const uint8_t input[], size_t input_len);
 
       void add_final_block(secure_vector<uint8_t>& x,
                            size_t ad_len, size_t pt_len);
-
-      secure_vector<uint8_t> m_H;
-      secure_vector<uint8_t> m_H_ad;
-      secure_vector<uint8_t> m_ghash;
-      size_t m_ad_len = 0;
-
    private:
      void key_schedule(const uint8_t key[], size_t key_len) override;
 
-      void gcm_multiply(secure_vector<uint8_t>& x) const;
+      void gcm_multiply(secure_vector<uint8_t>& x,
+                        const uint8_t input[],
+                        size_t blocks);
 
+      secure_vector<uint8_t> m_H;
+      secure_vector<uint8_t> m_H_ad;
+      secure_vector<uint8_t> m_ghash;
       secure_vector<uint8_t> m_nonce;
+      secure_vector<uint64_t> m_HM;
+      size_t m_ad_len = 0;
       size_t m_text_len = 0;
    };
 
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
index 54e841650..12d6ff7d1 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ b/src/lib/modes/aead/gcm/pmull/pmull.cpp
@@ -10,62 +10,73 @@ namespace Botan {
 
 BOTAN_FUNC_ISA("+crypto")
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks)
    {
    /*
    * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
    */
-   const uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
+   uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
    const uint64x2_t b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(H+8)), vrev64_u8(vld1_u8(H))));
 
-   uint64x2_t T0, T1, T2, T3, T4, T5;
-
-   T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
-   T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
-   T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
-   T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
-
-   T1 = veorq_u64(T1, T2);
-   T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
-   T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
-   T0 = veorq_u64(T0, T2);
-   T3 = veorq_u64(T3, T1);
-
-   T4 = vshrq_n_u64(T0, 31);
-   T0 = vshlq_n_u64(T0, 1);
-
-   T5 = vshrq_n_u64(T3, 31);
-   T3 = vshlq_n_u64(T3, 1);
-
-   T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
-   T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
-   T0 = vorrq_u64(T0, T4);
-   T3 = vorrq_u64(T3, T5);
-   T3 = vorrq_u64(T3, T2);
-
-   T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
-   T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
-   T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
-
-   T4 = veorq_u64(T4, T5);
-   T4 = veorq_u64(T4, T2);
-   T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
-   T3 = veorq_u64(T3, T5);
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
-   T0 = veorq_u64(T0, T4);
-   T3 = veorq_u64(T3, T0);
-
-   T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
-   T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
-   T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
-   T3 = veorq_u64(T3, T1);
-   T3 = veorq_u64(T3, T2);
-   T3 = veorq_u64(T3, T4);
-
-   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(T3))));
-   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(T3))));
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      const uint64x2_t m64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(input+8)), vrev64_u8(vld1_u8(input))));
+      input += 16;
+
+      a64 = veorq_u64(a64, m64);
+
+      uint64x2_t T0, T1, T2, T3, T4, T5;
+
+      T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
+      T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
+      T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
+      T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
+
+      T1 = veorq_u64(T1, T2);
+      T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
+      T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
+      T0 = veorq_u64(T0, T2);
+      T3 = veorq_u64(T3, T1);
+
+      T4 = vshrq_n_u64(T0, 31);
+      T0 = vshlq_n_u64(T0, 1);
+
+      T5 = vshrq_n_u64(T3, 31);
+      T3 = vshlq_n_u64(T3, 1);
+
+      T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
+      T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
+      T0 = vorrq_u64(T0, T4);
+      T3 = vorrq_u64(T3, T5);
+      T3 = vorrq_u64(T3, T2);
+
+      T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
+      T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
+      T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
+
+      T4 = veorq_u64(T4, T5);
+      T4 = veorq_u64(T4, T2);
+      T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
+      T3 = veorq_u64(T3, T5);
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
+      T0 = veorq_u64(T0, T4);
+      T3 = veorq_u64(T3, T0);
+
+      T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
+      T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
+      T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
+      T3 = veorq_u64(T3, T1);
+      T3 = veorq_u64(T3, T2);
+      T3 = veorq_u64(T3, T4);
+
+      a64 = T3;
+      }
+
+   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(a64))));
+   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(a64))));
    }
 
 }
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
index 4ddcc8f27..638b845cd 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ b/src/lib/modes/aead/gcm/pmull/pmull.h
@@ -12,7 +12,8 @@
 
 namespace Botan {
 
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks);
 
 }