author    Jack Lloyd <[email protected]>  2019-09-25 07:07:04 -0400
committer Jack Lloyd <[email protected]>  2019-09-25 07:54:09 -0400
commit    9a0230b73f11654b181a04b4084af458c504552f
tree      26e123e3d74ab41dab91bf8c6dfa43345292e89a
parent    50e1552e49aeef26614c8f5317aa7b0f33272219
Merge CLMUL and PMULL code
The same algorithms were used in both; the implementations differed only in using SSSE3 vs NEON intrinsics.
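Background for the merge: PCLMULQDQ on x86 and PMULL on ARMv8 both compute a 64x64 -> 128 bit carryless (polynomial) product, which is why one SIMD_4x32-based implementation can now serve both ISAs. As a reader aid only (not part of this patch; the struct and function names are illustrative), here is a scalar model of the primitive both instructions implement:

```cpp
#include <cstddef>
#include <cstdint>

struct u128 { uint64_t hi = 0, lo = 0; };

// Carryless multiply: polynomial multiplication over GF(2), i.e. a
// multiply in which XOR replaces addition and no carries propagate.
// This models what _mm_clmulepi64_si128 (x86) and vmull_p64 (ARMv8)
// compute in hardware.
u128 clmul64(uint64_t a, uint64_t b)
   {
   u128 r;
   for(std::size_t i = 0; i != 64; ++i)
      {
      if((b >> i) & 1)
         {
         r.lo ^= a << i;
         if(i > 0)
            r.hi ^= a >> (64 - i);
         }
      }
   return r;
   }
```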
Diffstat (limited to 'src')
-rw-r--r--  src/build-data/policy/bsi.txt                    |   3
-rw-r--r--  src/build-data/policy/modern.txt                 |   3
-rw-r--r--  src/build-data/policy/nist.txt                   |   3
-rw-r--r--  src/lib/block/aes/aes_vperm/aes_vperm.cpp        |  37
-rw-r--r--  src/lib/modes/aead/gcm/clmul/clmul.cpp           | 182
-rw-r--r--  src/lib/modes/aead/gcm/clmul/info.txt            |  13
-rw-r--r--  src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp   | 169
-rw-r--r--  src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h (renamed from src/lib/modes/aead/gcm/clmul/clmul.h) | 0
-rw-r--r--  src/lib/modes/aead/gcm/clmul_cpu/info.txt        |  33
-rw-r--r--  src/lib/modes/aead/gcm/ghash.cpp                 |  41
-rw-r--r--  src/lib/modes/aead/gcm/pmull/info.txt            |  16
-rw-r--r--  src/lib/modes/aead/gcm/pmull/pmull.cpp           | 208
-rw-r--r--  src/lib/modes/aead/gcm/pmull/pmull.h             |  23
-rw-r--r--  src/lib/utils/cpuid/cpuid.h                      |  15
-rw-r--r--  src/lib/utils/simd/simd_32.h                     |  60
-rw-r--r--  src/tests/data/block/aes.vec                     |   2
16 files changed, 291 insertions, 517 deletions
diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt
index d89f4433d..a3e324268 100644
--- a/src/build-data/policy/bsi.txt
+++ b/src/build-data/policy/bsi.txt
@@ -56,9 +56,8 @@ aes_armv8
aes_power8
# modes
-clmul
+clmul_cpu
clmul_ssse3
-pmull
# hash
sha2_32_x86
diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt
index 2a9c12613..0c2e7f8c3 100644
--- a/src/build-data/policy/modern.txt
+++ b/src/build-data/policy/modern.txt
@@ -51,9 +51,8 @@ prf_tls
newhope
ed25519
-clmul
+clmul_cpu
clmul_ssse3
-pmull
locking_allocator
http_util # needed by x509 for OCSP online checks
diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt
index 2fdf60ea6..7eb0be23b 100644
--- a/src/build-data/policy/nist.txt
+++ b/src/build-data/policy/nist.txt
@@ -63,9 +63,8 @@ sha2_64_bmi2
sha3_bmi2
# modes
-clmul
+clmul_cpu
clmul_ssse3
-pmull
# hash
sha2_32_x86
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index 24470bda3..7e7b36116 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -18,14 +18,6 @@
#include <tmmintrin.h>
#endif
-#if defined(BOTAN_SIMD_USE_SSE2)
- #define BOTAN_VPERM_ISA "ssse3"
-#elif defined(BOTAN_SIMD_USE_NEON)
- #define BOTAN_VPERM_ISA "+simd"
-#elif defined(BOTAN_SIMD_USE_ALTIVEC)
- #define BOTAN_VPERM_ISA "altivec"
-#endif
-
namespace Botan {
namespace {
@@ -63,34 +55,12 @@ inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32
#endif
}
-template<size_t I>
-inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shift_elems_left(SIMD_4x32 x)
- {
-#if defined(BOTAN_SIMD_USE_SSE2)
- return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I));
-#elif defined(BOTAN_SIMD_USE_NEON)
- return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I)));
-#elif defined(BOTAN_SIMD_USE_ALTIVEC)
- const __vector unsigned int zero = vec_splat_u32(0);
-
- const __vector unsigned char shuf[3] = {
- { 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
- { 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 },
- { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3 },
- };
-
- return SIMD_4x32(vec_perm(x.raw(), zero, shuf[I-1]));
-#else
- #error "No shift_elems_left implementation available"
-#endif
- }
-
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
{
#if defined(BOTAN_SIMD_USE_SSE2)
return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
#elif defined(BOTAN_SIMD_USE_NEON)
- return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), 8)));
+ return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
@@ -504,8 +474,8 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
{
- SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
- smeared ^= shift_elems_left<2>(smeared);
+ SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
+ smeared ^= smeared.shift_elems_left<2>();
smeared ^= SIMD_4x32::splat_u8(0x5B);
const SIMD_4x32 Bh = high_nibs(input1);
@@ -588,7 +558,6 @@ void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
// key2 with 8 high bytes masked off
SIMD_4x32 t = key2;
key2 = aes_schedule_round(rcon[2*i], key2, key1);
-
const SIMD_4x32 key2t = alignr8(key2, t);
aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp
deleted file mode 100644
index 6f968866d..000000000
--- a/src/lib/modes/aead/gcm/clmul/clmul.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
-* CLMUL hook
-* (C) 2013,2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#include <botan/internal/clmul.h>
-#include <immintrin.h>
-#include <wmmintrin.h>
-
-namespace Botan {
-
-namespace {
-
-BOTAN_FUNC_ISA("sse2")
-inline __m128i gcm_reduce(const __m128i& B0, const __m128i& B1)
- {
- __m128i T0, T1, T2, T3;
-
- T0 = _mm_srli_epi32(B1, 31);
- T1 = _mm_slli_epi32(B1, 1);
- T2 = _mm_srli_epi32(B0, 31);
- T3 = _mm_slli_epi32(B0, 1);
-
- T3 = _mm_or_si128(T3, _mm_srli_si128(T0, 12));
- T3 = _mm_or_si128(T3, _mm_slli_si128(T2, 4));
- T1 = _mm_or_si128(T1, _mm_slli_si128(T0, 4));
-
- T0 = _mm_xor_si128(_mm_slli_epi32(T1, 31), _mm_slli_epi32(T1, 30));
- T0 = _mm_xor_si128(T0, _mm_slli_epi32(T1, 25));
-
- T1 = _mm_xor_si128(T1, _mm_slli_si128(T0, 12));
-
- T0 = _mm_xor_si128(T3, _mm_srli_si128(T0, 4));
- T0 = _mm_xor_si128(T0, T1);
- T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 7));
- T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 1));
- T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 2));
- return T0;
- }
-
-BOTAN_FUNC_ISA("pclmul,sse2")
-inline __m128i gcm_multiply(const __m128i& H, const __m128i& x)
- {
- __m128i T0, T1, T2, T3;
-
- T0 = _mm_clmulepi64_si128(x, H, 0x11);
- T1 = _mm_clmulepi64_si128(x, H, 0x10);
- T2 = _mm_clmulepi64_si128(x, H, 0x01);
- T3 = _mm_clmulepi64_si128(x, H, 0x00);
-
- T1 = _mm_xor_si128(T1, T2);
- T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8));
- T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8));
-
- return gcm_reduce(T0, T3);
- }
-
-BOTAN_FUNC_ISA("pclmul,sse2")
-inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2, const __m128i& H3, const __m128i& H4,
- const __m128i& X1, const __m128i& X2, const __m128i& X3, const __m128i& X4)
- {
- /*
- * Mutiply with delayed reduction, algorithm by Krzysztof Jankowski
- * and Pierre Laurent of Intel
- */
-
- const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
- const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
- const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
- const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
-
- const __m128i lo = _mm_xor_si128(
- _mm_xor_si128(H1_X1_lo, H2_X2_lo),
- _mm_xor_si128(H3_X3_lo, H4_X4_lo));
-
- const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
- const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
- const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
- const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
-
- const __m128i hi = _mm_xor_si128(
- _mm_xor_si128(H1_X1_hi, H2_X2_hi),
- _mm_xor_si128(H3_X3_hi, H4_X4_hi));
-
- __m128i T0 = _mm_xor_si128(lo, hi);
- __m128i T1, T2, T3, T4;
-
- T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1);
- T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1);
- T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2);
- T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2);
- T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
- T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
-
- T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3);
- T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3);
- T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4);
- T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4);
- T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
- T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
-
- T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi);
- T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo);
-
- return gcm_reduce(T1, T2);
- }
-
-}
-
-BOTAN_FUNC_ISA("ssse3")
-void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
- {
- const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
- const __m128i H = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H_bytes)), BSWAP_MASK);
- const __m128i H2 = gcm_multiply(H, H);
- const __m128i H3 = gcm_multiply(H, H2);
- const __m128i H4 = gcm_multiply(H, H3);
-
- __m128i* H_pow_mm = reinterpret_cast<__m128i*>(H_pow);
-
- _mm_storeu_si128(H_pow_mm+0, H);
- _mm_storeu_si128(H_pow_mm+1, H2);
- _mm_storeu_si128(H_pow_mm+2, H3);
- _mm_storeu_si128(H_pow_mm+3, H4);
- }
-
-BOTAN_FUNC_ISA("ssse3")
-void gcm_multiply_clmul(uint8_t x[16],
- const uint64_t H_pow[8],
- const uint8_t input_bytes[], size_t blocks)
- {
- /*
- * Algorithms 1 and 5 from Intel's CLMUL guide
- */
- const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
- const __m128i* input = reinterpret_cast<const __m128i*>(input_bytes);
-
- const __m128i* H_pow_mm = reinterpret_cast<const __m128i*>(H_pow);
-
- const __m128i H = _mm_loadu_si128(H_pow_mm);
-
- __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
- a = _mm_shuffle_epi8(a, BSWAP_MASK);
-
- if(blocks >= 4)
- {
- const __m128i H2 = _mm_loadu_si128(H_pow_mm + 1);
- const __m128i H3 = _mm_loadu_si128(H_pow_mm + 2);
- const __m128i H4 = _mm_loadu_si128(H_pow_mm + 3);
-
- while(blocks >= 4)
- {
- const __m128i m0 = _mm_shuffle_epi8(_mm_loadu_si128(input + 0), BSWAP_MASK);
- const __m128i m1 = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), BSWAP_MASK);
- const __m128i m2 = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), BSWAP_MASK);
- const __m128i m3 = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), BSWAP_MASK);
-
- a = _mm_xor_si128(a, m0);
- a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a);
-
- input += 4;
- blocks -= 4;
- }
- }
-
- for(size_t i = 0; i != blocks; ++i)
- {
- const __m128i m = _mm_shuffle_epi8(_mm_loadu_si128(input + i), BSWAP_MASK);
-
- a = _mm_xor_si128(a, m);
- a = gcm_multiply(H, a);
- }
-
- a = _mm_shuffle_epi8(a, BSWAP_MASK);
- _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
- }
-
-}
diff --git a/src/lib/modes/aead/gcm/clmul/info.txt b/src/lib/modes/aead/gcm/clmul/info.txt
deleted file mode 100644
index d4b6a1c1f..000000000
--- a/src/lib/modes/aead/gcm/clmul/info.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-<defines>
-GCM_CLMUL -> 20131227
-</defines>
-
-<isa>
-sse2
-ssse3
-aesni
-</isa>
-
-<header:internal>
-clmul.h
-</header:internal>
diff --git a/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
new file mode 100644
index 000000000..2a41121d1
--- /dev/null
+++ b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
@@ -0,0 +1,169 @@
+/*
+* Hook for CLMUL/PMULL
+* (C) 2013,2017,2019 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/internal/clmul_cpu.h>
+#include <botan/internal/simd_32.h>
+
+#if defined(BOTAN_SIMD_USE_SSE2)
+ #include <immintrin.h>
+ #include <wmmintrin.h>
+#endif
+
+namespace Botan {
+
+namespace {
+
+BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in)
+ {
+#if defined(BOTAN_SIMD_USE_SSE2)
+ const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ const uint8x16_t mask = vld1q_u8(maskb);
+ return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
+#endif
+ }
+
+template<int M>
+BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
+ {
+ static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
+
+#if defined(BOTAN_SIMD_USE_SSE2)
+ return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
+ const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
+ return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
+#endif
+ }
+
+inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1)
+ {
+ SIMD_4x32 X0 = B1.shr<31>();
+ SIMD_4x32 X1 = B1.shl<1>();
+ SIMD_4x32 X2 = B0.shr<31>();
+ SIMD_4x32 X3 = B0.shl<1>();
+
+ X3 |= X0.shift_elems_right<3>();
+ X3 |= X2.shift_elems_left<1>();
+ X1 |= X0.shift_elems_left<1>();
+
+ X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();
+
+ X1 ^= X0.shift_elems_left<3>();
+
+ X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
+ X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();
+ return X0;
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x)
+ {
+ SIMD_4x32 T0 = clmul<0x11>(H, x);
+ SIMD_4x32 T1 = clmul<0x10>(H, x);
+ SIMD_4x32 T2 = clmul<0x01>(H, x);
+ SIMD_4x32 T3 = clmul<0x00>(H, x);
+
+ T1 ^= T2;
+ T0 ^= T1.shift_elems_right<2>();
+ T3 ^= T1.shift_elems_left<2>();
+
+ return gcm_reduce(T0, T3);
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
+ gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4,
+ const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4)
+ {
+ /*
+ * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
+ * and Pierre Laurent of Intel
+ */
+
+ const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^
+ (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));
+
+ const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^
+ (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
+
+ SIMD_4x32 T;
+
+ T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
+ T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
+ T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
+ T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
+ T ^= lo;
+ T ^= hi;
+
+ return gcm_reduce(hi ^ T.shift_elems_right<2>(),
+ lo ^ T.shift_elems_left<2>());
+ }
+
+}
+
+BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
+ {
+ const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
+ const SIMD_4x32 H2 = gcm_multiply(H1, H1);
+ const SIMD_4x32 H3 = gcm_multiply(H1, H2);
+ const SIMD_4x32 H4 = gcm_multiply(H2, H2);
+
+ H1.store_le(H_pow);
+ H2.store_le(H_pow + 2);
+ H3.store_le(H_pow + 4);
+ H4.store_le(H_pow + 6);
+ }
+
+BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+void gcm_multiply_clmul(uint8_t x[16],
+ const uint64_t H_pow[8],
+ const uint8_t input[], size_t blocks)
+ {
+ /*
+ * Algorithms 1 and 5 from Intel's CLMUL guide
+ */
+ const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);
+
+ SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
+
+ if(blocks >= 4)
+ {
+ const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
+ const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
+ const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);
+
+ while(blocks >= 4)
+ {
+ const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input ));
+ const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1));
+ const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2));
+ const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3));
+
+ a ^= m0;
+ a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);
+
+ input += 4*16;
+ blocks -= 4;
+ }
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i));
+
+ a ^= m;
+ a = gcm_multiply(H1, a);
+ }
+
+ a = reverse_vector(a);
+ a.store_le(x);
+ }
+
+}
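Two properties of the new file worth noting. First, the four-block loop in gcm_multiply_clmul performs a single reduction per iteration using the precomputed powers of H, computing X <- ((X ^ m0)*H^4) ^ (m1*H^3) ^ (m2*H^2) ^ (m3*H), which is the gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a) call above. Second, gcm_multiply_x4 recovers both cross products of each 128x128 multiply from one Karatsuba-style multiply. A scalar sketch of that identity, reusing the illustrative u128/clmul64 model shown after the commit message:

```cpp
// Over GF(2), (H_hi ^ H_lo)*(X_hi ^ X_lo) expands to
// H_hi*X_hi ^ H_hi*X_lo ^ H_lo*X_hi ^ H_lo*X_lo, so XORing the hi and lo
// products back out leaves exactly the two cross terms -- three carryless
// multiplies per block pair instead of four.
u128 cross_terms(uint64_t H_hi, uint64_t H_lo, uint64_t X_hi, uint64_t X_lo)
   {
   const u128 hi = clmul64(H_hi, X_hi);
   const u128 lo = clmul64(H_lo, X_lo);
   u128 mid = clmul64(H_hi ^ H_lo, X_hi ^ X_lo);
   mid.hi ^= hi.hi ^ lo.hi;
   mid.lo ^= hi.lo ^ lo.lo;
   return mid; // == H_hi*X_lo ^ H_lo*X_hi
   }
```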
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h
index 25cdfbd96..25cdfbd96 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.h
+++ b/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.h
diff --git a/src/lib/modes/aead/gcm/clmul_cpu/info.txt b/src/lib/modes/aead/gcm/clmul_cpu/info.txt
new file mode 100644
index 000000000..bc018c2ed
--- /dev/null
+++ b/src/lib/modes/aead/gcm/clmul_cpu/info.txt
@@ -0,0 +1,33 @@
+<defines>
+GCM_CLMUL_CPU -> 20131227
+</defines>
+
+<requires>
+simd
+</requires>
+
+<header:internal>
+clmul_cpu.h
+</header:internal>
+
+<isa>
+x86_32:sse2
+x86_32:ssse3
+x86_32:aesni
+x86_64:sse2
+x86_64:ssse3
+x86_64:aesni
+arm64:neon
+arm64:armv8crypto
+</isa>
+
+<arch>
+x86_32
+x86_64
+arm64
+</arch>
+
+<cc>
+gcc:4.9
+clang:3.8
+</cc>
diff --git a/src/lib/modes/aead/gcm/ghash.cpp b/src/lib/modes/aead/gcm/ghash.cpp
index 8b8d3e337..3a4301113 100644
--- a/src/lib/modes/aead/gcm/ghash.cpp
+++ b/src/lib/modes/aead/gcm/ghash.cpp
@@ -12,24 +12,20 @@
#include <botan/cpuid.h>
#include <botan/exceptn.h>
-#if defined(BOTAN_HAS_GCM_CLMUL)
- #include <botan/internal/clmul.h>
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+ #include <botan/internal/clmul_cpu.h>
#endif
#if defined(BOTAN_HAS_GCM_CLMUL_SSSE3)
#include <botan/internal/clmul_ssse3.h>
#endif
-#if defined(BOTAN_HAS_GCM_PMULL)
- #include <botan/internal/pmull.h>
-#endif
-
namespace Botan {
std::string GHASH::provider() const
{
-#if defined(BOTAN_HAS_GCM_CLMUL)
- if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+ if(CPUID::has_carryless_multiply())
return "clmul";
#endif
@@ -38,11 +34,6 @@ std::string GHASH::provider() const
return "ssse3";
#endif
-#if defined(BOTAN_HAS_GCM_PMULL)
- if(CPUID::has_arm_pmull())
- return "pmull";
-#endif
-
return "base";
}
@@ -50,8 +41,8 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
const uint8_t input[],
size_t blocks)
{
-#if defined(BOTAN_HAS_GCM_CLMUL)
- if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+ if(CPUID::has_carryless_multiply())
{
return gcm_multiply_clmul(x.data(), m_H_pow.data(), input, blocks);
}
@@ -64,13 +55,6 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
}
#endif
-#if defined(BOTAN_HAS_GCM_PMULL)
- if(CPUID::has_arm_pmull())
- {
- return gcm_multiply_pmull(x.data(), m_H_pow.data(), input, blocks);
- }
-#endif
-
CT::poison(x.data(), x.size());
const uint64_t ALL_BITS = 0xFFFFFFFFFFFFFFFF;
@@ -169,22 +153,13 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
}
}
-#if defined(BOTAN_HAS_GCM_CLMUL)
- if(CPUID::has_clmul())
+#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
+ if(CPUID::has_carryless_multiply())
{
m_H_pow.resize(8);
gcm_clmul_precompute(m_H.data(), m_H_pow.data());
}
#endif
-
-#if defined(BOTAN_HAS_GCM_PMULL)
- if(CPUID::has_arm_pmull())
- {
- m_H_pow.resize(8);
- gcm_pmull_precompute(m_H.data(), m_H_pow.data());
- }
-#endif
-
}
void GHASH::start(const uint8_t nonce[], size_t len)
diff --git a/src/lib/modes/aead/gcm/pmull/info.txt b/src/lib/modes/aead/gcm/pmull/info.txt
deleted file mode 100644
index 231a1989e..000000000
--- a/src/lib/modes/aead/gcm/pmull/info.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-<defines>
-GCM_PMULL -> 20170903
-</defines>
-
-<isa>
-armv8crypto
-</isa>
-
-<cc>
-gcc:4.9
-clang:3.8
-</cc>
-
-<header:internal>
-pmull.h
-</header:internal>
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
deleted file mode 100644
index 9d6ceb105..000000000
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
-* Contributed by Jeffrey Walton
-*
-* Further changes
-* (C) 2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#include <botan/internal/pmull.h>
-#include <arm_neon.h>
-
-namespace Botan {
-
-/*
-This follows the same pattern as the clmul implementation.
-
-See also https://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
-*/
-
-namespace {
-
-BOTAN_FUNC_ISA("+simd")
-inline uint64x2_t gcm_reduce(uint32x4_t B0, uint32x4_t B1)
- {
- const uint32x4_t zero = vdupq_n_u32(0);
-
- uint32x4_t T0, T1, T2, T3, T4, T5;
-
- T4 = vshrq_n_u32(B0, 31);
- T0 = vshlq_n_u32(B0, 1);
- T5 = vshrq_n_u32(B1, 31);
- T3 = vshlq_n_u32(B1, 1);
-
- T2 = vextq_u32(T4, zero, 3);
- T5 = vextq_u32(zero, T5, 3);
- T4 = vextq_u32(zero, T4, 3);
- T0 = vorrq_u32(T0, T4);
- T3 = vorrq_u32(T3, T5);
- T3 = vorrq_u32(T3, T2);
-
- T4 = vshlq_n_u32(T0, 31);
- T5 = vshlq_n_u32(T0, 30);
- T2 = vshlq_n_u32(T0, 25);
-
- T4 = veorq_u32(T4, T5);
- T4 = veorq_u32(T4, T2);
- T5 = vextq_u32(T4, zero, 1);
- T3 = veorq_u32(T3, T5);
- T4 = vextq_u32(zero, T4, 1);
- T0 = veorq_u32(T0, T4);
- T3 = veorq_u32(T3, T0);
-
- T4 = vshrq_n_u32(T0, 1);
- T1 = vshrq_n_u32(T0, 2);
- T2 = vshrq_n_u32(T0, 7);
- T3 = veorq_u32(T3, T1);
- T3 = veorq_u32(T3, T2);
- T3 = veorq_u32(T3, T4);
-
- return vreinterpretq_u64_u32(T3);
- }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint32x4_t vmull(uint64_t x, uint64_t y)
- {
- return reinterpret_cast<uint32x4_t>(vmull_p64(x, y));
- }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint64x2_t gcm_multiply(uint64x2_t H, uint64x2_t x)
- {
- const uint32x4_t zero = vdupq_n_u32(0);
-
- const uint64_t x_hi = vgetq_lane_u64(x, 0);
- const uint64_t x_lo = vgetq_lane_u64(x, 1);
- const uint64_t H_hi = vgetq_lane_u64(H, 0);
- const uint64_t H_lo = vgetq_lane_u64(H, 1);
-
- uint32x4_t T0 = vmull(x_hi, H_hi);
- uint32x4_t T1 = vmull(x_lo, H_hi);
- uint32x4_t T2 = vmull(x_hi, H_lo);
- uint32x4_t T3 = vmull(x_lo, H_lo);
-
- T1 = veorq_u32(T1, T2);
- T0 = veorq_u32(T0, vextq_u32(zero, T1, 2));
- T3 = veorq_u32(T3, vextq_u32(T1, zero, 2));
-
- return gcm_reduce(T0, T3);
- }
-
-BOTAN_FUNC_ISA("+crypto")
-inline uint64x2_t gcm_multiply_x4(uint64x2_t H1, uint64x2_t H2, uint64x2_t H3, uint64x2_t H4,
- uint64x2_t X1, uint64x2_t X2, uint64x2_t X3, uint64x2_t X4)
- {
- const uint64_t H1_hi = vgetq_lane_u64(H1, 0);
- const uint64_t H1_lo = vgetq_lane_u64(H1, 1);
- const uint64_t H2_hi = vgetq_lane_u64(H2, 0);
- const uint64_t H2_lo = vgetq_lane_u64(H2, 1);
- const uint64_t H3_hi = vgetq_lane_u64(H3, 0);
- const uint64_t H3_lo = vgetq_lane_u64(H3, 1);
- const uint64_t H4_hi = vgetq_lane_u64(H4, 0);
- const uint64_t H4_lo = vgetq_lane_u64(H4, 1);
-
- const uint64_t X1_hi = vgetq_lane_u64(X1, 0);
- const uint64_t X1_lo = vgetq_lane_u64(X1, 1);
- const uint64_t X2_hi = vgetq_lane_u64(X2, 0);
- const uint64_t X2_lo = vgetq_lane_u64(X2, 1);
- const uint64_t X3_hi = vgetq_lane_u64(X3, 0);
- const uint64_t X3_lo = vgetq_lane_u64(X3, 1);
- const uint64_t X4_hi = vgetq_lane_u64(X4, 0);
- const uint64_t X4_lo = vgetq_lane_u64(X4, 1);
-
- const uint32x4_t H1_X1_lo = vmull(X1_lo, H1_lo);
- const uint32x4_t H2_X2_lo = vmull(X2_lo, H2_lo);
- const uint32x4_t H3_X3_lo = vmull(X3_lo, H3_lo);
- const uint32x4_t H4_X4_lo = vmull(X4_lo, H4_lo);
-
- const uint32x4_t lo = veorq_u32(
- veorq_u32(H1_X1_lo, H2_X2_lo),
- veorq_u32(H3_X3_lo, H4_X4_lo));
-
- const uint32x4_t H1_X1_hi = vmull(X1_hi, H1_hi);
- const uint32x4_t H2_X2_hi = vmull(X2_hi, H2_hi);
- const uint32x4_t H3_X3_hi = vmull(X3_hi, H3_hi);
- const uint32x4_t H4_X4_hi = vmull(X4_hi, H4_hi);
-
- const uint32x4_t hi = veorq_u32(
- veorq_u32(H1_X1_hi, H2_X2_hi),
- veorq_u32(H3_X3_hi, H4_X4_hi));
-
- uint32x4_t T0 = veorq_u32(lo, hi);
-
- T0 = veorq_u32(T0, vmull(X1_hi ^ X1_lo, H1_hi ^ H1_lo));
- T0 = veorq_u32(T0, vmull(X2_hi ^ X2_lo, H2_hi ^ H2_lo));
- T0 = veorq_u32(T0, vmull(X3_hi ^ X3_lo, H3_hi ^ H3_lo));
- T0 = veorq_u32(T0, vmull(X4_hi ^ X4_lo, H4_hi ^ H4_lo));
-
- const uint32x4_t zero = vdupq_n_u32(0);
- uint32x4_t B0 = veorq_u32(vextq_u32(zero, T0, 2), hi);
- uint32x4_t B1 = veorq_u32(vextq_u32(T0, zero, 2), lo);
- return gcm_reduce(B0, B1);
- }
-
-BOTAN_FUNC_ISA("+simd")
-inline uint8x16_t bswap_vec(uint8x16_t v)
- {
- const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
- const uint8x16_t mask = vld1q_u8(maskb);
- return vqtbl1q_u8(v, mask);
- }
-
-}
-
-BOTAN_FUNC_ISA("+simd")
-void gcm_pmull_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
- {
- const uint64x2_t H = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(H_bytes)));
- const uint64x2_t H2 = gcm_multiply(H, H);
- const uint64x2_t H3 = gcm_multiply(H, H2);
- const uint64x2_t H4 = gcm_multiply(H, H3);
-
- vst1q_u64(H_pow , H);
- vst1q_u64(H_pow+2, H2);
- vst1q_u64(H_pow+4, H3);
- vst1q_u64(H_pow+6, H4);
- }
-
-BOTAN_FUNC_ISA("+simd")
-void gcm_multiply_pmull(uint8_t x[16],
- const uint64_t H64[8],
- const uint8_t input[], size_t blocks)
- {
- const uint64x2_t H = vld1q_u64(H64);
- uint64x2_t a = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(x)));
-
- if(blocks >= 4)
- {
- const uint64x2_t H2 = vld1q_u64(H64 + 2);
- const uint64x2_t H3 = vld1q_u64(H64 + 4);
- const uint64x2_t H4 = vld1q_u64(H64 + 6);
-
- while(blocks >= 4)
- {
- const uint64x2_t m0 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input)));
- const uint64x2_t m1 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16)));
- const uint64x2_t m2 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 32)));
- const uint64x2_t m3 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 48)));
-
- a = veorq_u64(a, m0);
- a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a);
-
- input += 64;
- blocks -= 4;
- }
- }
-
- for(size_t i = 0; i != blocks; ++i)
- {
- const uint64x2_t m = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16*i)));
- a = veorq_u64(a, m);
- a = gcm_multiply(H, a);
- }
-
- vst1q_u8(x, bswap_vec(vreinterpretq_u8_u64(a)));
- }
-
-}
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
deleted file mode 100644
index 17e61097f..000000000
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
-* PMULL hook
-* (C) 2017 Jack Lloyd
-*
-* Botan is released under the Simplified BSD License (see license.txt)
-*/
-
-#ifndef BOTAN_GCM_PMULL_H_
-#define BOTAN_GCM_PMULL_H_
-
-#include <botan/types.h>
-
-namespace Botan {
-
-void gcm_pmull_precompute(const uint8_t H[16], uint64_t H_pow[4*2]);
-
-void gcm_multiply_pmull(uint8_t x[16],
- const uint64_t H[8],
- const uint8_t input[], size_t blocks);
-
-}
-
-#endif
diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h
index 7b6b8ebc3..84201b910 100644
--- a/src/lib/utils/cpuid/cpuid.h
+++ b/src/lib/utils/cpuid/cpuid.h
@@ -334,6 +334,21 @@ class BOTAN_PUBLIC_API(2,1) CPUID final
#endif
}
+ /**
+ * Check if the processor supports carryless multiply
+ * (CLMUL, PMULL)
+ */
+ static bool has_carryless_multiply()
+ {
+#if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY)
+ return has_clmul();
+#elif defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY)
+ return has_arm_pmull();
+#else
+ return false;
+#endif
+ }
+
/*
* Clear a CPUID bit
* Call CPUID::initialize to reset
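A usage note on the new predicate: call sites that previously needed separate has_clmul() and has_arm_pmull() branches now make one runtime check, as in the GHASH::gcm_multiply hunk above:

```cpp
#if defined(BOTAN_HAS_GCM_CLMUL_CPU)
   if(CPUID::has_carryless_multiply())
      {
      return gcm_multiply_clmul(x.data(), m_H_pow.data(), input, blocks);
      }
#endif
```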
diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h
index 4c1599842..2f662b98d 100644
--- a/src/lib/utils/simd/simd_32.h
+++ b/src/lib/utils/simd/simd_32.h
@@ -31,6 +31,19 @@
#error "No SIMD instruction set enabled"
#endif
+#if defined(BOTAN_SIMD_USE_SSE2)
+ #define BOTAN_SIMD_ISA "sse2"
+ #define BOTAN_VPERM_ISA "ssse3"
+ #define BOTAN_CLMUL_ISA "pclmul"
+#elif defined(BOTAN_SIMD_USE_NEON)
+ #define BOTAN_SIMD_ISA "+simd"
+ #define BOTAN_VPERM_ISA "+simd"
+ #define BOTAN_CLMUL_ISA "+crypto"
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+ #define BOTAN_SIMD_ISA "altivec"
+ #define BOTAN_VPERM_ISA "altivec"
+#endif
+
namespace Botan {
#if defined(BOTAN_SIMD_USE_SSE2)
@@ -172,7 +185,12 @@ class SIMD_4x32 final
#endif
}
- void store_le(uint32_t out[]) const
+ void store_le(uint32_t out[4]) const
+ {
+ this->store_le(reinterpret_cast<uint8_t*>(out));
+ }
+
+ void store_le(uint64_t out[2]) const
{
this->store_le(reinterpret_cast<uint8_t*>(out));
}
@@ -489,6 +507,46 @@ class SIMD_4x32 final
#endif
}
+ template<size_t I>
+ SIMD_4x32 shift_elems_left() const
+ {
+#if defined(BOTAN_SIMD_USE_SSE2)
+ return SIMD_4x32(_mm_slli_si128(raw(), 4*I));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ return SIMD_4x32(vextq_u32(vdupq_n_u32(0), raw(), 4-I));
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+ const __vector unsigned int zero = vec_splat_u32(0);
+
+ const __vector unsigned char shuf[3] = {
+ { 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3 },
+ };
+
+ return SIMD_4x32(vec_perm(raw(), zero, shuf[I-1]));
+#endif
+ }
+
+ template<size_t I>
+ SIMD_4x32 shift_elems_right() const
+ {
+#if defined(BOTAN_SIMD_USE_SSE2)
+ return SIMD_4x32(_mm_srli_si128(raw(), 4*I));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ return SIMD_4x32(vextq_u32(raw(), vdupq_n_u32(0), I));
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+ const __vector unsigned int zero = vec_splat_u32(0);
+
+ const __vector unsigned char shuf[3] = {
+ { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 },
+ { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
+ { 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
+ };
+
+ return SIMD_4x32(vec_perm(raw(), zero, shuf[I-1]));
+#endif
+ }
+
/**
* 4x4 Transposition on SIMD registers
*/
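For reference, the semantics of the new element shifts: lanes move by whole 32-bit elements and vacated lanes are zero-filled. A scalar model (not part of the patch; the std::array signature is illustrative):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Models SIMD_4x32::shift_elems_left<I> with lane 0 written first: each
// 32-bit lane moves I positions toward the high lanes, zero-filling from
// the low end. On SSE2 this matches _mm_slli_si128(x, 4*I), and
// shift_elems_right<I> is the mirror image (_mm_srli_si128).
template<std::size_t I>
std::array<uint32_t, 4> shift_elems_left(const std::array<uint32_t, 4>& x)
   {
   std::array<uint32_t, 4> r{};
   for(std::size_t i = I; i != 4; ++i)
      r[i] = x[i - I];
   return r;
   }

// e.g. shift_elems_left<1>() maps {a, b, c, d} to {0, a, b, c}
```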
diff --git a/src/tests/data/block/aes.vec b/src/tests/data/block/aes.vec
index 6a0b4430b..f65c00b7c 100644
--- a/src/tests/data/block/aes.vec
+++ b/src/tests/data/block/aes.vec
@@ -1,7 +1,7 @@
# Test vectors from NIST CAVP AESAVS
# http://csrc.nist.gov/groups/STM/cavp/documents/aes/AESAVS.pdf
-#test cpuid aesni ssse3 ppc_crypto
+#test cpuid aesni armv8aes ppc_crypto ssse3 neon altivec
[AES-128]
Key = 000102030405060708090A0B0C0D0E0F