author     Jack Lloyd <[email protected]>    2019-09-25 07:07:04 -0400
committer  Jack Lloyd <[email protected]>    2019-09-25 07:54:09 -0400
commit     9a0230b73f11654b181a04b4084af458c504552f (patch)
tree       26e123e3d74ab41dab91bf8c6dfa43345292e89a /src
parent     50e1552e49aeef26614c8f5317aa7b0f33272219 (diff)
Merge CLMUL and PMULL code
The same algorithms were used in both cases; the only difference was SSSE3 versus NEON intrinsics.
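The merge is possible because both ISAs expose the same 64x64->128 carryless multiply; only the intrinsic spelling differs. Below is a minimal standalone sketch of that unification idea — a single `clmul<M>` template compiling to PCLMULQDQ on x86-64 or PMULL on AArch64, mirroring the `clmul<M>` helper this commit adds in clmul_cpu.cpp. The `vec128`/`load128` names and build flags are illustrative assumptions, not Botan's actual API.

```cpp
// Sketch only (not Botan's API). Build with -mpclmul on x86-64,
// or -march=armv8-a+crypto on AArch64.
#include <cstdint>
#include <cstdio>
#include <cstring>

#if defined(__x86_64__)
  #include <wmmintrin.h>
  using vec128 = __m128i;

  template<int M>
  vec128 clmul(vec128 H, vec128 x)
     {
     // The immediate selects which 64-bit halves of x and H are multiplied
     return _mm_clmulepi64_si128(x, H, M);
     }

  vec128 load128(const uint64_t v[2])
     { return _mm_loadu_si128(reinterpret_cast<const vec128*>(v)); }
#elif defined(__aarch64__)
  #include <arm_neon.h>
  using vec128 = uint64x2_t;

  template<int M>
  vec128 clmul(vec128 H, vec128 x)
     {
     // vmull_p64 multiplies two scalar 64-bit lanes; decode the x86-style
     // immediate (0x00/0x01/0x10/0x11) into lane selectors
     const uint64_t a = vgetq_lane_u64(x, M & 0x01);
     const uint64_t b = vgetq_lane_u64(H, (M & 0x10) >> 4);
     const poly128_t r = vmull_p64(a, b);
     vec128 out;
     std::memcpy(&out, &r, sizeof(out));
     return out;
     }

  vec128 load128(const uint64_t v[2]) { return vld1q_u64(v); }
#endif

int main()
   {
   const uint64_t hv[2] = { 0x87, 0 };
   const uint64_t xv[2] = { 0xFFFFFFFFFFFFFFFFULL, 0 };

   const vec128 r = clmul<0x00>(load128(hv), load128(xv));

   uint64_t out[2];
   std::memcpy(out, &r, sizeof(out));
   std::printf("%016llx%016llx\n",
               (unsigned long long)out[1], (unsigned long long)out[0]);
   return 0;
   }
```

With the ISA-specific detail confined to one template, the GHASH multiply and reduction above it can be written once, which is exactly the restructuring this commit performs.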
Diffstat (limited to 'src')
-rw-r--r--   src/build-data/policy/bsi.txt                                   |   3
-rw-r--r--   src/build-data/policy/modern.txt                                |   3
-rw-r--r--   src/build-data/policy/nist.txt                                  |   3
-rw-r--r--   src/lib/block/aes/aes_vperm/aes_vperm.cpp                       |  37
-rw-r--r--   src/lib/modes/aead/gcm/clmul/clmul.cpp                          | 182
-rw-r--r--   src/lib/modes/aead/gcm/clmul/info.txt                           |  13
-rw-r--r--   src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp                  | 169
-rw-r--r--   src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h (renamed from src/lib/modes/aead/gcm/clmul/clmul.h) | 0
-rw-r--r--   src/lib/modes/aead/gcm/clmul_cpu/info.txt                       |  33
-rw-r--r--   src/lib/modes/aead/gcm/ghash.cpp                                |  41
-rw-r--r--   src/lib/modes/aead/gcm/pmull/info.txt                           |  16
-rw-r--r--   src/lib/modes/aead/gcm/pmull/pmull.cpp                          | 208
-rw-r--r--   src/lib/modes/aead/gcm/pmull/pmull.h                            |  23
-rw-r--r--   src/lib/utils/cpuid/cpuid.h                                     |  15
-rw-r--r--   src/lib/utils/simd/simd_32.h                                    |  60
-rw-r--r--   src/tests/data/block/aes.vec                                    |   2
16 files changed, 291 insertions, 517 deletions
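For reference when reading the hunks below: each GHASH update computes X <- (X XOR block) * H in GF(2^128), and gcm_reduce folds the 256-bit carryless product back modulo x^128 + x^7 + x^2 + x + 1. A bit-serial reference multiply (per NIST SP 800-38D) makes explicit the math the SIMD paths vectorize; this is an illustrative sketch, not the "base" fallback Botan actually ships.

```cpp
// Reference GF(2^128) multiply in GCM's bit order (MSB of the first byte is
// bit 0). Sketch per NIST SP 800-38D; names are illustrative, not Botan's.
#include <cstdint>

struct gf128 { uint64_t hi, lo; }; // hi holds bits 0..63, big-endian bit order

inline gf128 gf128_mul(const gf128& x, const gf128& y)
   {
   gf128 z{0, 0};
   gf128 v = y;

   for(int i = 0; i != 128; ++i)
      {
      // Take bit i of x, MSB first
      const uint64_t xi = (i < 64) ? ((x.hi >> (63 - i)) & 1)
                                   : ((x.lo >> (127 - i)) & 1);
      const uint64_t mask = 0 - xi; // all-ones if the bit is set

      z.hi ^= v.hi & mask;
      z.lo ^= v.lo & mask;

      // v <- v * x, i.e. a right shift in GCM bit order, reducing by the
      // field polynomial whenever a bit falls off the end
      const uint64_t carry = v.lo & 1;
      v.lo = (v.lo >> 1) | (v.hi << 63);
      v.hi >>= 1;
      v.hi ^= (0 - carry) & 0xE100000000000000ULL; // x^128 = x^7 + x^2 + x + 1
      }

   return z;
   }
```

The CLMUL/PMULL code replaces this 128-iteration loop with four carryless multiplies (a Karatsuba-style split into high, low, and cross products) followed by one shift-based reduction.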
diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt
index d89f4433d..a3e324268 100644
--- a/src/build-data/policy/bsi.txt
+++ b/src/build-data/policy/bsi.txt
@@ -56,9 +56,8 @@ aes_armv8
 aes_power8
 
 # modes
-clmul
+clmul_cpu
 clmul_ssse3
-pmull
 
 # hash
 sha2_32_x86
diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt
index 2a9c12613..0c2e7f8c3 100644
--- a/src/build-data/policy/modern.txt
+++ b/src/build-data/policy/modern.txt
@@ -51,9 +51,8 @@ prf_tls
 newhope
 ed25519
 
-clmul
+clmul_cpu
 clmul_ssse3
-pmull
 
 locking_allocator
 http_util # needed by x509 for OCSP online checks
diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt
index 2fdf60ea6..7eb0be23b 100644
--- a/src/build-data/policy/nist.txt
+++ b/src/build-data/policy/nist.txt
@@ -63,9 +63,8 @@ sha2_64_bmi2
 sha3_bmi2
 
 # modes
-clmul
+clmul_cpu
 clmul_ssse3
-pmull
 
 # hash
 sha2_32_x86
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index 24470bda3..7e7b36116 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -18,14 +18,6 @@
 #include <tmmintrin.h>
 #endif
 
-#if defined(BOTAN_SIMD_USE_SSE2)
-  #define BOTAN_VPERM_ISA "ssse3"
-#elif defined(BOTAN_SIMD_USE_NEON)
-  #define BOTAN_VPERM_ISA "+simd"
-#elif defined(BOTAN_SIMD_USE_ALTIVEC)
-  #define BOTAN_VPERM_ISA "altivec"
-#endif
-
 namespace Botan {
 
 namespace {
@@ -63,34 +55,12 @@ inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32
 #endif
    }
 
-template<size_t I>
-inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shift_elems_left(SIMD_4x32 x)
-   {
-#if defined(BOTAN_SIMD_USE_SSE2)
-   return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I));
-#elif defined(BOTAN_SIMD_USE_NEON)
-   return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I)));
-#elif defined(BOTAN_SIMD_USE_ALTIVEC)
-   const __vector unsigned int zero = vec_splat_u32(0);
-
-   const __vector unsigned char shuf[3] = {
-      { 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
-      { 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 },
-      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3 },
-   };
-
-   return SIMD_4x32(vec_perm(x.raw(), zero, shuf[I-1]));
-#else
-   #error "No shift_elems_left implementation available"
-#endif
-   }
-
 inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
    {
 #if defined(BOTAN_SIMD_USE_SSE2)
    return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
 #elif defined(BOTAN_SIMD_USE_NEON)
-   return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), 8)));
+   return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
    const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
    return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
@@ -504,8 +474,8 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
 
 SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
    {
-   SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
-   smeared ^= shift_elems_left<2>(smeared);
+   SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
+   smeared ^= smeared.shift_elems_left<2>();
    smeared ^= SIMD_4x32::splat_u8(0x5B);
 
    const SIMD_4x32 Bh = high_nibs(input1);
@@ -588,7 +558,6 @@ void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
       // key2 with 8 high bytes masked off
       SIMD_4x32 t = key2;
       key2 = aes_schedule_round(rcon[2*i], key2, key1);
-
       const SIMD_4x32 key2t = alignr8(key2, t);
       aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
       aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp
deleted file mode 100644
index 6f968866d..000000000
--- a/src/lib/modes/aead/gcm/clmul/clmul.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
-* CLMUL hook
-* (C) 2013,2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#include <botan/internal/clmul.h>
-#include <immintrin.h>
-#include <wmmintrin.h>
-
-namespace Botan {
-
-namespace {
-
-BOTAN_FUNC_ISA("sse2")
-inline __m128i gcm_reduce(const __m128i& B0, const __m128i& B1)
-   {
-   __m128i T0, T1, T2, T3;
-
-   T0 = _mm_srli_epi32(B1, 31);
-   T1 = _mm_slli_epi32(B1, 1);
-   T2 = _mm_srli_epi32(B0, 31);
-   T3 = _mm_slli_epi32(B0, 1);
-
-   T3 = _mm_or_si128(T3, _mm_srli_si128(T0, 12));
-   T3 = _mm_or_si128(T3, _mm_slli_si128(T2, 4));
-   T1 = _mm_or_si128(T1, _mm_slli_si128(T0, 4));
-
-   T0 = _mm_xor_si128(_mm_slli_epi32(T1, 31), _mm_slli_epi32(T1, 30));
-   T0 = _mm_xor_si128(T0, _mm_slli_epi32(T1, 25));
-
-   T1 = _mm_xor_si128(T1, _mm_slli_si128(T0, 12));
-
-   T0 = _mm_xor_si128(T3, _mm_srli_si128(T0, 4));
-   T0 = _mm_xor_si128(T0, T1);
-   T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 7));
-   T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 1));
-   T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 2));
-   return T0;
-   }
-
-BOTAN_FUNC_ISA("pclmul,sse2")
-inline __m128i gcm_multiply(const __m128i& H, const __m128i& x)
-   {
-   __m128i T0, T1, T2, T3;
-
-   T0 = _mm_clmulepi64_si128(x, H, 0x11);
-   T1 = _mm_clmulepi64_si128(x, H, 0x10);
-   T2 = _mm_clmulepi64_si128(x, H, 0x01);
-   T3 = _mm_clmulepi64_si128(x, H, 0x00);
-
-   T1 = _mm_xor_si128(T1, T2);
-   T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8));
-   T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8));
-
-   return gcm_reduce(T0, T3);
-   }
-
-BOTAN_FUNC_ISA("pclmul,sse2")
-inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2, const __m128i& H3, const __m128i& H4,
-                               const __m128i& X1, const __m128i& X2, const __m128i& X3, const __m128i& X4)
-   {
-   /*
-   * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
-   * and Pierre Laurent of Intel
-   */
-
-   const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
-   const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
-   const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
-   const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
-
-   const __m128i lo = _mm_xor_si128(
-      _mm_xor_si128(H1_X1_lo, H2_X2_lo),
-      _mm_xor_si128(H3_X3_lo, H4_X4_lo));
-
-   const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
-   const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
-   const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
-   const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
-
-   const __m128i hi = _mm_xor_si128(
-      _mm_xor_si128(H1_X1_hi, H2_X2_hi),
-      _mm_xor_si128(H3_X3_hi, H4_X4_hi));
-
-   __m128i T0 = _mm_xor_si128(lo, hi);
-   __m128i T1, T2, T3, T4;
-
-   T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1);
-   T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1);
-   T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2);
-   T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2);
-   T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
-   T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
-
-   T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3);
-   T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3);
-   T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4);
-   T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4);
-   T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
-   T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
-
-   T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi);
-   T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo);
-
-   return gcm_reduce(T1, T2);
-   }
-
-}
-
-BOTAN_FUNC_ISA("ssse3")
-void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
-   {
-   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-   const __m128i H = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H_bytes)), BSWAP_MASK);
-   const __m128i H2 = gcm_multiply(H, H);
-   const __m128i H3 = gcm_multiply(H, H2);
-   const __m128i H4 = gcm_multiply(H, H3);
-
-   __m128i* H_pow_mm = reinterpret_cast<__m128i*>(H_pow);
-
-   _mm_storeu_si128(H_pow_mm+0, H);
-   _mm_storeu_si128(H_pow_mm+1, H2);
-   _mm_storeu_si128(H_pow_mm+2, H3);
-   _mm_storeu_si128(H_pow_mm+3, H4);
-   }
-
-BOTAN_FUNC_ISA("ssse3")
-void gcm_multiply_clmul(uint8_t x[16],
-                        const uint64_t H_pow[8],
-                        const uint8_t input_bytes[], size_t blocks)
-   {
-   /*
-   * Algorithms 1 and 5 from Intel's CLMUL guide
-   */
-   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-   const __m128i* input = reinterpret_cast<const __m128i*>(input_bytes);
-
-   const __m128i* H_pow_mm = reinterpret_cast<const __m128i*>(H_pow);
-
-   const __m128i H = _mm_loadu_si128(H_pow_mm);
-
-   __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
-   a = _mm_shuffle_epi8(a, BSWAP_MASK);
-
-   if(blocks >= 4)
-      {
-      const __m128i H2 = _mm_loadu_si128(H_pow_mm + 1);
-      const __m128i H3 = _mm_loadu_si128(H_pow_mm + 2);
-      const __m128i H4 = _mm_loadu_si128(H_pow_mm + 3);
-
-      while(blocks >= 4)
-         {
-         const __m128i m0 = _mm_shuffle_epi8(_mm_loadu_si128(input + 0), BSWAP_MASK);
-         const __m128i m1 = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), BSWAP_MASK);
-         const __m128i m2 = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), BSWAP_MASK);
-         const __m128i m3 = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), BSWAP_MASK);
-
-         a = _mm_xor_si128(a, m0);
-         a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a);
-
-         input += 4;
-         blocks -= 4;
-         }
-      }
-
-   for(size_t i = 0; i != blocks; ++i)
-      {
-      const __m128i m = _mm_shuffle_epi8(_mm_loadu_si128(input + i), BSWAP_MASK);
-
-      a = _mm_xor_si128(a, m);
-      a = gcm_multiply(H, a);
-      }
-
-   a = _mm_shuffle_epi8(a, BSWAP_MASK);
-   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
-   }
-
-}
diff --git a/src/lib/modes/aead/gcm/clmul/info.txt b/src/lib/modes/aead/gcm/clmul/info.txt
deleted file mode 100644
index d4b6a1c1f..000000000
--- a/src/lib/modes/aead/gcm/clmul/info.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-<defines>
-GCM_CLMUL -> 20131227
-</defines>
-
-<isa>
-sse2
-ssse3
-aesni
-</isa>
-
-<header:internal>
-clmul.h
-</header:internal>
diff --git a/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
new file mode 100644
index 000000000..2a41121d1
--- /dev/null
+++ b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
@@ -0,0 +1,169 @@
+/*
+* Hook for CLMUL/PMULL
+* (C) 2013,2017,2019 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/internal/clmul_cpu.h>
+#include <botan/internal/simd_32.h>
+
+#if defined(BOTAN_SIMD_USE_SSE2)
+  #include <immintrin.h>
+  #include <wmmintrin.h>
+#endif
+
+namespace Botan {
+
+namespace {
+
+BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in)
+   {
+#if defined(BOTAN_SIMD_USE_SSE2)
+   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+   return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
+#elif defined(BOTAN_SIMD_USE_NEON)
+   const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+   const uint8x16_t mask = vld1q_u8(maskb);
+   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
+#endif
+   }
+
+template<int M>
+BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
+   {
+   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
+
+#if defined(BOTAN_SIMD_USE_SSE2)
+   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
+#elif defined(BOTAN_SIMD_USE_NEON)
+   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
+   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
+   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
+#endif
+   }
+
+inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1)
+   {
+   SIMD_4x32 X0 = B1.shr<31>();
+   SIMD_4x32 X1 = B1.shl<1>();
+   SIMD_4x32 X2 = B0.shr<31>();
+   SIMD_4x32 X3 = B0.shl<1>();
+
+   X3 |= X0.shift_elems_right<3>();
+   X3 |= X2.shift_elems_left<1>();
+   X1 |= X0.shift_elems_left<1>();
+
+   X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();
+
+   X1 ^= X0.shift_elems_left<3>();
+
+   X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
+   X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();
+   return X0;
+   }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x)
+   {
+   SIMD_4x32 T0 = clmul<0x11>(H, x);
+   SIMD_4x32 T1 = clmul<0x10>(H, x);
+   SIMD_4x32 T2 = clmul<0x01>(H, x);
+   SIMD_4x32 T3 = clmul<0x00>(H, x);
+
+   T1 ^= T2;
+   T0 ^= T1.shift_elems_right<2>();
+   T3 ^= T1.shift_elems_left<2>();
+
+   return gcm_reduce(T0, T3);
+   }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
+   gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4,
+                   const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4)
+   {
+   /*
+   * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
+   * and Pierre Laurent of Intel
+   */
+
+   const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^
+                        (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));
+
+   const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^
+                        (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
+
+   SIMD_4x32 T;
+
+   T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
+   T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
+   T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
+   T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
+   T ^= lo;
+   T ^= hi;
+
+   return gcm_reduce(hi ^ T.shift_elems_right<2>(),
+                     lo ^ T.shift_elems_left<2>());
+   }
+
+}
+
+BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
+   {
+   const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
+   const SIMD_4x32 H2 = gcm_multiply(H1, H1);
+   const SIMD_4x32 H3 = gcm_multiply(H1, H2);
+   const SIMD_4x32 H4 = gcm_multiply(H2, H2);
+
+   H1.store_le(H_pow);
+   H2.store_le(H_pow + 2);
+   H3.store_le(H_pow + 4);
+   H4.store_le(H_pow + 6);
+   }
+
+BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+void gcm_multiply_clmul(uint8_t x[16],
+                        const uint64_t H_pow[8],
+                        const uint8_t input[], size_t blocks)
+   {
+   /*
+   * Algorithms 1 and 5 from Intel's CLMUL guide
+   */
+   const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);
+
+   SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
+
+   if(blocks >= 4)
+      {
+      const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
+      const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
+      const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);
+
+      while(blocks >= 4)
+         {
+         const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input       ));
+         const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1));
+         const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2));
+         const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3));
+
+         a ^= m0;
+         a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);
+
+         input += 4*16;
+         blocks -= 4;
+         }
+      }
+
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i));
+
+      a ^= m;
+      a = gcm_multiply(H1, a);
+      }
+
+   a = reverse_vector(a);
+   a.store_le(x);
+   }
+
+}
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h
index 25cdfbd96..25cdfbd96 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.h
+++ b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h
diff --git a/src/lib/modes/aead/gcm/clmul_cpu/info.txt b/src/lib/modes/aead/gcm/clmul_cpu/info.txt
new file mode 100644
index 000000000..bc018c2ed
--- /dev/null
+++ b/src/lib/modes/aead/gcm/clmul_cpu/info.txt
@@ -0,0 +1,33 @@
+<defines>
+GCM_CLMUL_CPU -> 20131227
+</defines>
+
+<requires>
+simd
+</requires>
+
+<header:internal>
+clmul_cpu.h
+</header:internal>
+
+<isa>
+x86_32:sse2
+x86_32:ssse3
+x86_32:aesni
+x86_64:sse2
+x86_64:ssse3
+x86_64:aesni
+arm64:neon
+arm64:armv8crypto
+</isa>
+
+<arch>
+x86_32
+x86_64
+arm64
+</arch>
+
+<cc>
+gcc:4.9
+clang:3.8
+</cc>
diff --git a/src/lib/modes/aead/gcm/ghash.cpp b/src/lib/modes/aead/gcm/ghash.cpp
index 8b8d3e337..3a4301113 100644
--- a/src/lib/modes/aead/gcm/ghash.cpp
+++ b/src/lib/modes/aead/gcm/ghash.cpp
@@ -12,24 +12,20 @@
 #include <botan/cpuid.h>
 #include <botan/exceptn.h>
 
-#if defined(BOTAN_HAS_GCM_CLMUL)
-  #include <botan/internal/clmul.h>
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+  #include <botan/internal/clmul_cpu.h>
 #endif
 
 #if defined(BOTAN_HAS_GCM_CLMUL_SSSE3)
  #include <botan/internal/clmul_ssse3.h>
 #endif
 
-#if defined(BOTAN_HAS_GCM_PMULL)
-  #include <botan/internal/pmull.h>
-#endif
-
 namespace Botan {
 
 std::string GHASH::provider() const
    {
-#if defined(BOTAN_HAS_GCM_CLMUL)
-   if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+   if(CPUID::has_carryless_multiply())
       return "clmul";
 #endif
 
@@ -38,11 +34,6 @@ std::string GHASH::provider() const
       return "ssse3";
 #endif
 
-#if defined(BOTAN_HAS_GCM_PMULL)
-   if(CPUID::has_arm_pmull())
-      return "pmull";
-#endif
-
    return "base";
    }
 
@@ -50,8 +41,8 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
                          const uint8_t input[],
                          size_t blocks)
    {
-#if defined(BOTAN_HAS_GCM_CLMUL)
-   if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+   if(CPUID::has_carryless_multiply())
      {
      return gcm_multiply_clmul(x.data(), m_H_pow.data(), input, blocks);
      }
@@ -64,13 +55,6 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
      }
 #endif
 
-#if defined(BOTAN_HAS_GCM_PMULL)
-   if(CPUID::has_arm_pmull())
-      {
-      return gcm_multiply_pmull(x.data(), m_H_pow.data(), input, blocks);
-      }
-#endif
-
    CT::poison(x.data(), x.size());
 
   const uint64_t ALL_BITS = 0xFFFFFFFFFFFFFFFF;
@@ -169,22 +153,13 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
         }
      }
 
-#if defined(BOTAN_HAS_GCM_CLMUL)
-   if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+   if(CPUID::has_carryless_multiply())
      {
      m_H_pow.resize(8);
      gcm_clmul_precompute(m_H.data(), m_H_pow.data());
      }
 #endif
-
-#if defined(BOTAN_HAS_GCM_PMULL)
-   if(CPUID::has_arm_pmull())
-      {
-      m_H_pow.resize(8);
-      gcm_pmull_precompute(m_H.data(), m_H_pow.data());
-      }
-#endif
-
    }
 
 void GHASH::start(const uint8_t nonce[], size_t len)
diff --git a/src/lib/modes/aead/gcm/pmull/info.txt b/src/lib/modes/aead/gcm/pmull/info.txt
deleted file mode 100644
index 231a1989e..000000000
--- a/src/lib/modes/aead/gcm/pmull/info.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-<defines>
-GCM_PMULL -> 20170903
-</defines>
-
-<isa>
-armv8crypto
-</isa>
-
-<cc>
-gcc:4.9
-clang:3.8
-</cc>
-
-<header:internal>
-pmull.h
-</header:internal>
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
deleted file mode 100644
index 9d6ceb105..000000000
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
-* Contributed by Jeffrey Walton
-*
-* Further changes
-* (C) 2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#include <botan/internal/pmull.h>
-#include <arm_neon.h>
-
-namespace Botan {
-
-/*
-This follows the same pattern as the clmul implementation.
-
-See also https://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
-*/
-
-namespace {
-
-BOTAN_FUNC_ISA("+simd")
-inline uint64x2_t gcm_reduce(uint32x4_t B0, uint32x4_t B1)
-   {
-   const uint32x4_t zero = vdupq_n_u32(0);
-
-   uint32x4_t T0, T1, T2, T3, T4, T5;
-
-   T4 = vshrq_n_u32(B0, 31);
-   T0 = vshlq_n_u32(B0, 1);
-   T5 = vshrq_n_u32(B1, 31);
-   T3 = vshlq_n_u32(B1, 1);
-
-   T2 = vextq_u32(T4, zero, 3);
-   T5 = vextq_u32(zero, T5, 3);
-   T4 = vextq_u32(zero, T4, 3);
-   T0 = vorrq_u32(T0, T4);
-   T3 = vorrq_u32(T3, T5);
-   T3 = vorrq_u32(T3, T2);
-
-   T4 = vshlq_n_u32(T0, 31);
-   T5 = vshlq_n_u32(T0, 30);
-   T2 = vshlq_n_u32(T0, 25);
-
-   T4 = veorq_u32(T4, T5);
-   T4 = veorq_u32(T4, T2);
-   T5 = vextq_u32(T4, zero, 1);
-   T3 = veorq_u32(T3, T5);
-   T4 = vextq_u32(zero, T4, 1);
-   T0 = veorq_u32(T0, T4);
-   T3 = veorq_u32(T3, T0);
-
-   T4 = vshrq_n_u32(T0, 1);
-   T1 = vshrq_n_u32(T0, 2);
-   T2 = vshrq_n_u32(T0, 7);
-   T3 = veorq_u32(T3, T1);
-   T3 = veorq_u32(T3, T2);
-   T3 = veorq_u32(T3, T4);
-
-   return vreinterpretq_u64_u32(T3);
-   }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint32x4_t vmull(uint64_t x, uint64_t y)
-   {
-   return reinterpret_cast<uint32x4_t>(vmull_p64(x, y));
-   }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint64x2_t gcm_multiply(uint64x2_t H, uint64x2_t x)
-   {
-   const uint32x4_t zero = vdupq_n_u32(0);
-
-   const uint64_t x_hi = vgetq_lane_u64(x, 0);
-   const uint64_t x_lo = vgetq_lane_u64(x, 1);
-   const uint64_t H_hi = vgetq_lane_u64(H, 0);
-   const uint64_t H_lo = vgetq_lane_u64(H, 1);
-
-   uint32x4_t T0 = vmull(x_hi, H_hi);
-   uint32x4_t T1 = vmull(x_lo, H_hi);
-   uint32x4_t T2 = vmull(x_hi, H_lo);
-   uint32x4_t T3 = vmull(x_lo, H_lo);
-
-   T1 = veorq_u32(T1, T2);
-   T0 = veorq_u32(T0, vextq_u32(zero, T1, 2));
-   T3 = veorq_u32(T3, vextq_u32(T1, zero, 2));
-
-   return gcm_reduce(T0, T3);
-   }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint64x2_t gcm_multiply_x4(uint64x2_t H1, uint64x2_t H2, uint64x2_t H3, uint64x2_t H4,
-                                  uint64x2_t X1, uint64x2_t X2, uint64x2_t X3, uint64x2_t X4)
-   {
-   const uint64_t H1_hi = vgetq_lane_u64(H1, 0);
-   const uint64_t H1_lo = vgetq_lane_u64(H1, 1);
-   const uint64_t H2_hi = vgetq_lane_u64(H2, 0);
-   const uint64_t H2_lo = vgetq_lane_u64(H2, 1);
-   const uint64_t H3_hi = vgetq_lane_u64(H3, 0);
-   const uint64_t H3_lo = vgetq_lane_u64(H3, 1);
-   const uint64_t H4_hi = vgetq_lane_u64(H4, 0);
-   const uint64_t H4_lo = vgetq_lane_u64(H4, 1);
-
-   const uint64_t X1_hi = vgetq_lane_u64(X1, 0);
-   const uint64_t X1_lo = vgetq_lane_u64(X1, 1);
-   const uint64_t X2_hi = vgetq_lane_u64(X2, 0);
-   const uint64_t X2_lo = vgetq_lane_u64(X2, 1);
-   const uint64_t X3_hi = vgetq_lane_u64(X3, 0);
-   const uint64_t X3_lo = vgetq_lane_u64(X3, 1);
-   const uint64_t X4_hi = vgetq_lane_u64(X4, 0);
-   const uint64_t X4_lo = vgetq_lane_u64(X4, 1);
-
-   const uint32x4_t H1_X1_lo = vmull(X1_lo, H1_lo);
-   const uint32x4_t H2_X2_lo = vmull(X2_lo, H2_lo);
-   const uint32x4_t H3_X3_lo = vmull(X3_lo, H3_lo);
-   const uint32x4_t H4_X4_lo = vmull(X4_lo, H4_lo);
-
-   const uint32x4_t lo = veorq_u32(
-      veorq_u32(H1_X1_lo, H2_X2_lo),
-      veorq_u32(H3_X3_lo, H4_X4_lo));
-
-   const uint32x4_t H1_X1_hi = vmull(X1_hi, H1_hi);
-   const uint32x4_t H2_X2_hi = vmull(X2_hi, H2_hi);
-   const uint32x4_t H3_X3_hi = vmull(X3_hi, H3_hi);
-   const uint32x4_t H4_X4_hi = vmull(X4_hi, H4_hi);
-
-   const uint32x4_t hi = veorq_u32(
-      veorq_u32(H1_X1_hi, H2_X2_hi),
-      veorq_u32(H3_X3_hi, H4_X4_hi));
-
-   uint32x4_t T0 = veorq_u32(lo, hi);
-
-   T0 = veorq_u32(T0, vmull(X1_hi ^ X1_lo, H1_hi ^ H1_lo));
-   T0 = veorq_u32(T0, vmull(X2_hi ^ X2_lo, H2_hi ^ H2_lo));
-   T0 = veorq_u32(T0, vmull(X3_hi ^ X3_lo, H3_hi ^ H3_lo));
-   T0 = veorq_u32(T0, vmull(X4_hi ^ X4_lo, H4_hi ^ H4_lo));
-
-   const uint32x4_t zero = vdupq_n_u32(0);
-   uint32x4_t B0 = veorq_u32(vextq_u32(zero, T0, 2), hi);
-   uint32x4_t B1 = veorq_u32(vextq_u32(T0, zero, 2), lo);
-   return gcm_reduce(B0, B1);
-   }
-
-BOTAN_FUNC_ISA("+simd")
-inline uint8x16_t bswap_vec(uint8x16_t v)
-   {
-   const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-   const uint8x16_t mask = vld1q_u8(maskb);
-   return vqtbl1q_u8(v, mask);
-   }
-
-}
-
-BOTAN_FUNC_ISA("+simd")
-void gcm_pmull_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
-   {
-   const uint64x2_t H = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(H_bytes)));
-   const uint64x2_t H2 = gcm_multiply(H, H);
-   const uint64x2_t H3 = gcm_multiply(H, H2);
-   const uint64x2_t H4 = gcm_multiply(H, H3);
-
-   vst1q_u64(H_pow  , H);
-   vst1q_u64(H_pow+2, H2);
-   vst1q_u64(H_pow+4, H3);
-   vst1q_u64(H_pow+6, H4);
-   }
-
-BOTAN_FUNC_ISA("+simd")
-void gcm_multiply_pmull(uint8_t x[16],
-                        const uint64_t H64[8],
-                        const uint8_t input[], size_t blocks)
-   {
-   const uint64x2_t H = vld1q_u64(H64);
-   uint64x2_t a = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(x)));
-
-   if(blocks >= 4)
-      {
-      const uint64x2_t H2 = vld1q_u64(H64 + 2);
-      const uint64x2_t H3 = vld1q_u64(H64 + 4);
-      const uint64x2_t H4 = vld1q_u64(H64 + 6);
-
-      while(blocks >= 4)
-         {
-         const uint64x2_t m0 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input)));
-         const uint64x2_t m1 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16)));
-         const uint64x2_t m2 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 32)));
-         const uint64x2_t m3 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 48)));
-
-         a = veorq_u64(a, m0);
-         a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a);
-
-         input += 64;
-         blocks -= 4;
-         }
-      }
-
-   for(size_t i = 0; i != blocks; ++i)
-      {
-      const uint64x2_t m = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16*i)));
-      a = veorq_u64(a, m);
-      a = gcm_multiply(H, a);
-      }
-
-   vst1q_u8(x, bswap_vec(vreinterpretq_u8_u64(a)));
-   }
-
-}
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
deleted file mode 100644
index 17e61097f..000000000
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
-* PMULL hook
-* (C) 2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#ifndef BOTAN_GCM_PMULL_H_
-#define BOTAN_GCM_PMULL_H_
-
-#include <botan/types.h>
-
-namespace Botan {
-
-void gcm_pmull_precompute(const uint8_t H[16], uint64_t H_pow[4*2]);
-
-void gcm_multiply_pmull(uint8_t x[16],
-                        const uint64_t H[8],
-                        const uint8_t input[], size_t blocks);
-
-}
-
-#endif
diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h
index 7b6b8ebc3..84201b910 100644
--- a/src/lib/utils/cpuid/cpuid.h
+++ b/src/lib/utils/cpuid/cpuid.h
@@ -334,6 +334,21 @@ class BOTAN_PUBLIC_API(2,1) CPUID final
 #endif
          }
 
+      /**
+      * Check if the processor supports carryless multiply
+      * (CLMUL, PMULL)
+      */
+      static bool has_carryless_multiply()
+         {
+#if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY)
+         return has_clmul();
+#elif defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY)
+         return has_arm_pmull();
+#else
+         return false;
+#endif
+         }
+
       /*
       * Clear a CPUID bit
       * Call CPUID::initialize to reset
diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h
index 4c1599842..2f662b98d 100644
--- a/src/lib/utils/simd/simd_32.h
+++ b/src/lib/utils/simd/simd_32.h
@@ -31,6 +31,19 @@
   #error "No SIMD instruction set enabled"
 #endif
 
+#if defined(BOTAN_SIMD_USE_SSE2)
+  #define BOTAN_SIMD_ISA "sse2"
+  #define BOTAN_VPERM_ISA "ssse3"
+  #define BOTAN_CLMUL_ISA "pclmul"
+#elif defined(BOTAN_SIMD_USE_NEON)
+  #define BOTAN_SIMD_ISA "+simd"
+  #define BOTAN_VPERM_ISA "+simd"
+  #define BOTAN_CLMUL_ISA "+crypto"
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+  #define BOTAN_SIMD_ISA "altivec"
+  #define BOTAN_VPERM_ISA "altivec"
+#endif
+
 namespace Botan {
 
 #if defined(BOTAN_SIMD_USE_SSE2)
@@ -172,7 +185,12 @@ class SIMD_4x32 final
 #endif
       }
 
-   void store_le(uint32_t out[]) const
+   void store_le(uint32_t out[4]) const
+      {
+      this->store_le(reinterpret_cast<uint8_t*>(out));
+      }
+
+   void store_le(uint64_t out[2]) const
       {
      this->store_le(reinterpret_cast<uint8_t*>(out));
      }
@@ -489,6 +507,46 @@ class SIMD_4x32 final
 #endif
       }
 
+   template<size_t I>
+   SIMD_4x32 shift_elems_left() const
+      {
+#if defined(BOTAN_SIMD_USE_SSE2)
+      return SIMD_4x32(_mm_slli_si128(raw(), 4*I));
+#elif defined(BOTAN_SIMD_USE_NEON)
+      return SIMD_4x32(vextq_u32(vdupq_n_u32(0), raw(), 4-I));
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+      const __vector unsigned int zero = vec_splat_u32(0);
+
+      const __vector unsigned char shuf[3] = {
+         { 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+         { 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 },
+         { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3 },
+      };
+
+      return SIMD_4x32(vec_perm(raw(), zero, shuf[I-1]));
+#endif
+      }
+
+   template<size_t I>
+   SIMD_4x32 shift_elems_right() const
+      {
+#if defined(BOTAN_SIMD_USE_SSE2)
+      return SIMD_4x32(_mm_srli_si128(raw(), 4*I));
+#elif defined(BOTAN_SIMD_USE_NEON)
+      return SIMD_4x32(vextq_u32(raw(), vdupq_n_u32(0), I));
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+      const __vector unsigned int zero = vec_splat_u32(0);
+
+      const __vector unsigned char shuf[3] = {
+         { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 },
+         { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
+         { 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
+      };
+
+      return SIMD_4x32(vec_perm(raw(), zero, shuf[I-1]));
+#endif
+      }
+
    /**
    * 4x4 Transposition on SIMD registers
   */
diff --git a/src/tests/data/block/aes.vec b/src/tests/data/block/aes.vec
index 6a0b4430b..f65c00b7c 100644
--- a/src/tests/data/block/aes.vec
+++ b/src/tests/data/block/aes.vec
@@ -1,7 +1,7 @@
 # Test vectors from NIST CAVP AESAVS
 # http://csrc.nist.gov/groups/STM/cavp/documents/aes/AESAVS.pdf
 
-#test cpuid aesni ssse3 ppc_crypto
+#test cpuid aesni armv8aes ppc_crypto ssse3 neon altivec
 
 [AES-128]
 Key = 000102030405060708090A0B0C0D0E0F
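A closing note on the four-block loops in gcm_multiply_clmul and the removed gcm_multiply_pmull: four Horner steps a <- (a XOR m_i) * H fold into a single pass against the precomputed powers H^1..H^4, which is what lets gcm_multiply_x4 delay the reduction to the end. A sketch of that identity, reusing `gf128` and `gf128_mul` from the reference block above the diff (`gf128_xor` and `ghash_4_blocks` are assumed helper names, not Botan's):

```cpp
// Four GHASH Horner steps collapse into one pass over H^4..H^1:
//   a' = (a ^ m0)*H^4 ^ m1*H^3 ^ m2*H^2 ^ m3*H
inline gf128 gf128_xor(const gf128& a, const gf128& b)
   { return { a.hi ^ b.hi, a.lo ^ b.lo }; }

gf128 ghash_4_blocks(gf128 a, const gf128 m[4], const gf128 H_pow[4])
   {
   // H_pow[i] = H^(i+1), matching the table gcm_clmul_precompute fills
   gf128 acc = gf128_mul(gf128_xor(a, m[0]), H_pow[3]); // (a ^ m0) * H^4
   acc = gf128_xor(acc, gf128_mul(m[1], H_pow[2]));     // ^ m1 * H^3
   acc = gf128_xor(acc, gf128_mul(m[2], H_pow[1]));     // ^ m2 * H^2
   acc = gf128_xor(acc, gf128_mul(m[3], H_pow[0]));     // ^ m3 * H
   return acc;
   }
```

This also explains the otherwise odd-looking argument order in the SIMD code, `gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a)`: the accumulator (already XORed with m0) pairs with the highest power of H.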