aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2018-05-27 13:18:22 -0400
committerJack Lloyd <[email protected]>2018-05-27 13:18:22 -0400
commit01d250f78644fccc1db0bdc8da6577fae90afe01 (patch)
tree806a072f0d5c93b9d9e668641cf131ae11eacb54
parent4f5e266ad895bbcf6adf970c06f3999324f1b2ec (diff)
parent8df48e74987fb2ab3c97adb2b48c2cafc0ea381b (diff)
Merge GH #1584 Add BMI2 optimization for SHA-256
-rw-r--r--src/build-data/cc/clang.txt2
-rw-r--r--src/build-data/cc/gcc.txt2
-rw-r--r--src/lib/hash/sha2_32/sha2_32.cpp14
-rw-r--r--src/lib/hash/sha2_32/sha2_32.h6
-rw-r--r--src/lib/hash/sha2_32/sha2_32_bmi2/info.txt10
-rw-r--r--src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp139
-rw-r--r--src/lib/utils/cpuid/cpuid.cpp5
-rw-r--r--src/lib/utils/cpuid/cpuid.h7
-rw-r--r--src/lib/utils/cpuid/cpuid_x86.cpp15
-rw-r--r--src/tests/data/hash/sha2_32.vec2
10 files changed, 195 insertions, 7 deletions
diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt
index d8c028191..65586088b 100644
--- a/src/build-data/cc/clang.txt
+++ b/src/build-data/cc/clang.txt
@@ -48,7 +48,7 @@ ssse3 -> "-mssse3"
sse41 -> "-msse4.1"
sse42 -> "-msse4.2"
avx2 -> "-mavx2"
-bmi2 -> "-mbmi2"
+bmi2 -> "-mbmi -mbmi2"
aesni -> "-maes -mpclmul -mssse3"
rdrand -> "-mrdrnd"
rdseed -> "-mrdseed"
diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt
index 0b12e00bc..a1e45b428 100644
--- a/src/build-data/cc/gcc.txt
+++ b/src/build-data/cc/gcc.txt
@@ -53,7 +53,7 @@ ssse3 -> "-mssse3"
sse41 -> "-msse4.1"
sse42 -> "-msse4.2"
avx2 -> "-mavx2"
-bmi2 -> "-mbmi2"
+bmi2 -> "-mbmi -mbmi2"
aesni -> "-maes -mpclmul -mssse3"
rdrand -> "-mrdrnd"
rdseed -> "-mrdseed"
diff --git a/src/lib/hash/sha2_32/sha2_32.cpp b/src/lib/hash/sha2_32/sha2_32.cpp
index 0710747d0..99cc2a6ff 100644
--- a/src/lib/hash/sha2_32/sha2_32.cpp
+++ b/src/lib/hash/sha2_32/sha2_32.cpp
@@ -51,6 +51,13 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
}
#endif
+#if defined(BOTAN_HAS_SHA2_32_X86_BMI2)
+ if(CPUID::has_bmi2())
+ {
+ return SHA_256::compress_digest_x86_bmi2(digest, input, blocks);
+ }
+#endif
+
#if defined(BOTAN_HAS_SHA2_32_ARMV8)
if(CPUID::has_arm_sha2())
{
@@ -59,8 +66,8 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
#endif
uint32_t A = digest[0], B = digest[1], C = digest[2],
- D = digest[3], E = digest[4], F = digest[5],
- G = digest[6], H = digest[7];
+ D = digest[3], E = digest[4], F = digest[5],
+ G = digest[6], H = digest[7];
for(size_t i = 0; i != blocks; ++i)
{
@@ -97,6 +104,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE);
SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7);
SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174);
+
SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1);
SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0xEFBE4786);
SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6);
@@ -113,6 +121,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147);
SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351);
SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967);
+
SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85);
SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138);
SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC);
@@ -129,6 +138,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624);
SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585);
SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070);
+
SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116);
SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08);
SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C);
diff --git a/src/lib/hash/sha2_32/sha2_32.h b/src/lib/hash/sha2_32/sha2_32.h
index 6a0d87ac7..bc883f77a 100644
--- a/src/lib/hash/sha2_32/sha2_32.h
+++ b/src/lib/hash/sha2_32/sha2_32.h
@@ -66,6 +66,12 @@ class BOTAN_PUBLIC_API(2,0) SHA_256 final : public MDx_HashFunction
size_t blocks);
#endif
+#if defined(BOTAN_HAS_SHA2_32_X86_BMI2)
+ static void compress_digest_x86_bmi2(secure_vector<uint32_t>& digest,
+ const uint8_t input[],
+ size_t blocks);
+#endif
+
#if defined(BOTAN_HAS_SHA2_32_X86)
static void compress_digest_x86(secure_vector<uint32_t>& digest,
const uint8_t input[],
diff --git a/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt b/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt
new file mode 100644
index 000000000..dc7349716
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt
@@ -0,0 +1,10 @@
+<defines>
+SHA2_32_X86_BMI2 -> 20180526
+</defines>
+
+need_isa bmi2
+
+<cc>
+gcc
+clang
+</cc>
diff --git a/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp b/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp
new file mode 100644
index 000000000..12ceb11c4
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp
@@ -0,0 +1,139 @@
+/*
+* (C) 2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/sha2_32.h>
+#include <botan/rotate.h>
+
+namespace Botan {
+
+/*
+Your eyes do not decieve you; this is currently just a copy of the
+baseline SHA-256 implementation. Because we compile it with BMI2
+flags, GCC and Clang use the BMI2 instructions without further help.
+
+Likely instruction scheduling could be improved by using inline asm.
+*/
+
+#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) do { \
+ uint32_t A_rho = rotr<2>(A) ^ rotr<13>(A) ^ rotr<22>(A); \
+ uint32_t E_rho = rotr<6>(E) ^ rotr<11>(E) ^ rotr<25>(E); \
+ uint32_t M2_sigma = rotr<17>(M2) ^ rotr<19>(M2) ^ (M2 >> 10); \
+ uint32_t M4_sigma = rotr<7>(M4) ^ rotr<18>(M4) ^ (M4 >> 3); \
+ H += magic + E_rho + ((E & F) ^ (~E & G)) + M1; \
+ D += H; \
+ H += A_rho + ((A & B) | ((A | B) & C)); \
+ M1 += M2_sigma + M3 + M4_sigma; \
+ } while(0);
+
+void SHA_256::compress_digest_x86_bmi2(secure_vector<uint32_t>& digest,
+ const uint8_t input[],
+ size_t blocks)
+ {
+ uint32_t A = digest[0], B = digest[1], C = digest[2],
+ D = digest[3], E = digest[4], F = digest[5],
+ G = digest[6], H = digest[7];
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint32_t W00 = load_be<uint32_t>(input, 0);
+ uint32_t W01 = load_be<uint32_t>(input, 1);
+ uint32_t W02 = load_be<uint32_t>(input, 2);
+ uint32_t W03 = load_be<uint32_t>(input, 3);
+ uint32_t W04 = load_be<uint32_t>(input, 4);
+ uint32_t W05 = load_be<uint32_t>(input, 5);
+ uint32_t W06 = load_be<uint32_t>(input, 6);
+ uint32_t W07 = load_be<uint32_t>(input, 7);
+ uint32_t W08 = load_be<uint32_t>(input, 8);
+ uint32_t W09 = load_be<uint32_t>(input, 9);
+ uint32_t W10 = load_be<uint32_t>(input, 10);
+ uint32_t W11 = load_be<uint32_t>(input, 11);
+ uint32_t W12 = load_be<uint32_t>(input, 12);
+ uint32_t W13 = load_be<uint32_t>(input, 13);
+ uint32_t W14 = load_be<uint32_t>(input, 14);
+ uint32_t W15 = load_be<uint32_t>(input, 15);
+
+ SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x428A2F98);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x71374491);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0xB5C0FBCF);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0xE9B5DBA5);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x3956C25B);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x59F111F1);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x923F82A4);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0xAB1C5ED5);
+ SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xD807AA98);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x12835B01);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x243185BE);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x550C7DC3);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x72BE5D74);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174);
+
+ SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0xEFBE4786);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x240CA1CC);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x2DE92C6F);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4A7484AA);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5CB0A9DC);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x76F988DA);
+ SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x983E5152);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA831C66D);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xB00327C8);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xBF597FC7);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xC6E00BF3);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967);
+
+ SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x53380D13);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x650A7354);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x766A0ABB);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x81C2C92E);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x92722C85);
+ SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xA2BFE8A1);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA81A664B);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xC24B8B70);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xC76C51A3);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xD192E819);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070);
+
+ SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x34B0BCB5);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x391C0CB3);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4ED8AA4A);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5B9CCA4F);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x682E6FF3);
+ SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x748F82EE);
+ SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x78A5636F);
+ SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x84C87814);
+ SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x8CC70208);
+ SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x90BEFFFA);
+ SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xA4506CEB);
+ SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xBEF9A3F7);
+ SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC67178F2);
+
+ A = (digest[0] += A);
+ B = (digest[1] += B);
+ C = (digest[2] += C);
+ D = (digest[3] += D);
+ E = (digest[4] += E);
+ F = (digest[5] += F);
+ G = (digest[6] += G);
+ H = (digest[7] += H);
+
+ input += 64;
+ }
+ }
+
+}
diff --git a/src/lib/utils/cpuid/cpuid.cpp b/src/lib/utils/cpuid/cpuid.cpp
index 9dc56d59c..d0489cb67 100644
--- a/src/lib/utils/cpuid/cpuid.cpp
+++ b/src/lib/utils/cpuid/cpuid.cpp
@@ -46,6 +46,7 @@ std::string CPUID::to_string()
CPUID_PRINT(avx512f);
CPUID_PRINT(rdtsc);
+ CPUID_PRINT(bmi1);
CPUID_PRINT(bmi2);
CPUID_PRINT(adx);
@@ -145,6 +146,10 @@ CPUID::bit_from_string(const std::string& tok)
return {Botan::CPUID::CPUID_AVX2_BIT};
if(tok == "sha")
return {Botan::CPUID::CPUID_SHA_BIT};
+ if(tok == "bmi2")
+ return {Botan::CPUID::CPUID_BMI2_BIT};
+ if(tok == "adx")
+ return {Botan::CPUID::CPUID_ADX_BIT};
#elif defined(BOTAN_TARGET_CPU_IS_PPC_FAMILY)
if(tok == "altivec" || tok == "simd")
diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h
index 4c0f1668b..633824a6c 100644
--- a/src/lib/utils/cpuid/cpuid.h
+++ b/src/lib/utils/cpuid/cpuid.h
@@ -98,6 +98,7 @@ class BOTAN_PUBLIC_API(2,1) CPUID final
CPUID_RDTSC_BIT = (1ULL << 10),
CPUID_BMI2_BIT = (1ULL << 11),
CPUID_ADX_BIT = (1ULL << 12),
+ CPUID_BMI1_BIT = (1ULL << 13),
// Crypto-specific ISAs
CPUID_AESNI_BIT = (1ULL << 16),
@@ -215,6 +216,12 @@ class BOTAN_PUBLIC_API(2,1) CPUID final
{ return has_cpuid_bit(CPUID_AVX512F_BIT); }
/**
+ * Check if the processor supports BMI1
+ */
+ static bool has_bmi1()
+ { return has_cpuid_bit(CPUID_BMI1_BIT); }
+
+ /**
* Check if the processor supports BMI2
*/
static bool has_bmi2()
diff --git a/src/lib/utils/cpuid/cpuid_x86.cpp b/src/lib/utils/cpuid/cpuid_x86.cpp
index be6c75a55..5387a801e 100644
--- a/src/lib/utils/cpuid/cpuid_x86.cpp
+++ b/src/lib/utils/cpuid/cpuid_x86.cpp
@@ -121,6 +121,7 @@ uint64_t CPUID::detect_cpu_features(size_t* cache_line_size)
X86_CPUID_SUBLEVEL(7, 0, cpuid);
enum x86_CPUID_7_bits : uint64_t {
+ BMI1 = (1ULL << 3),
AVX2 = (1ULL << 5),
BMI2 = (1ULL << 8),
AVX512F = (1ULL << 16),
@@ -132,8 +133,18 @@ uint64_t CPUID::detect_cpu_features(size_t* cache_line_size)
if(flags7 & x86_CPUID_7_bits::AVX2)
features_detected |= CPUID::CPUID_AVX2_BIT;
- if(flags7 & x86_CPUID_7_bits::BMI2)
- features_detected |= CPUID::CPUID_BMI2_BIT;
+ if(flags7 & x86_CPUID_7_bits::BMI1)
+ {
+ features_detected |= CPUID::CPUID_BMI1_BIT;
+ /*
+ We only set the BMI2 bit if BMI1 is also supported, so BMI2
+ code can safely use both extensions. No known processor
+ implements BMI2 but not BMI1.
+ */
+ if(flags7 & x86_CPUID_7_bits::BMI2)
+ features_detected |= CPUID::CPUID_BMI2_BIT;
+ }
+
if(flags7 & x86_CPUID_7_bits::AVX512F)
features_detected |= CPUID::CPUID_AVX512F_BIT;
if(flags7 & x86_CPUID_7_bits::RDSEED)
diff --git a/src/tests/data/hash/sha2_32.vec b/src/tests/data/hash/sha2_32.vec
index 3ff472e89..af2491fd6 100644
--- a/src/tests/data/hash/sha2_32.vec
+++ b/src/tests/data/hash/sha2_32.vec
@@ -1,4 +1,4 @@
-#test cpuid sha armv8sha2
+#test cpuid sha armv8sha2 bmi2
[SHA-224]
In =