diff options
author | Jack Lloyd <[email protected]> | 2018-05-27 13:18:22 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2018-05-27 13:18:22 -0400 |
commit | 01d250f78644fccc1db0bdc8da6577fae90afe01 (patch) | |
tree | 806a072f0d5c93b9d9e668641cf131ae11eacb54 | |
parent | 4f5e266ad895bbcf6adf970c06f3999324f1b2ec (diff) | |
parent | 8df48e74987fb2ab3c97adb2b48c2cafc0ea381b (diff) |
Merge GH #1584 Add BMI2 optimization for SHA-256
-rw-r--r-- | src/build-data/cc/clang.txt | 2 | ||||
-rw-r--r-- | src/build-data/cc/gcc.txt | 2 | ||||
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32.cpp | 14 | ||||
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32.h | 6 | ||||
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_bmi2/info.txt | 10 | ||||
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp | 139 | ||||
-rw-r--r-- | src/lib/utils/cpuid/cpuid.cpp | 5 | ||||
-rw-r--r-- | src/lib/utils/cpuid/cpuid.h | 7 | ||||
-rw-r--r-- | src/lib/utils/cpuid/cpuid_x86.cpp | 15 | ||||
-rw-r--r-- | src/tests/data/hash/sha2_32.vec | 2 |
10 files changed, 195 insertions, 7 deletions
diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt index d8c028191..65586088b 100644 --- a/src/build-data/cc/clang.txt +++ b/src/build-data/cc/clang.txt @@ -48,7 +48,7 @@ ssse3 -> "-mssse3" sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" -bmi2 -> "-mbmi2" +bmi2 -> "-mbmi -mbmi2" aesni -> "-maes -mpclmul -mssse3" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index 0b12e00bc..a1e45b428 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -53,7 +53,7 @@ ssse3 -> "-mssse3" sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" -bmi2 -> "-mbmi2" +bmi2 -> "-mbmi -mbmi2" aesni -> "-maes -mpclmul -mssse3" rdrand -> "-mrdrnd" rdseed -> "-mrdseed" diff --git a/src/lib/hash/sha2_32/sha2_32.cpp b/src/lib/hash/sha2_32/sha2_32.cpp index 0710747d0..99cc2a6ff 100644 --- a/src/lib/hash/sha2_32/sha2_32.cpp +++ b/src/lib/hash/sha2_32/sha2_32.cpp @@ -51,6 +51,13 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest, } #endif +#if defined(BOTAN_HAS_SHA2_32_X86_BMI2) + if(CPUID::has_bmi2()) + { + return SHA_256::compress_digest_x86_bmi2(digest, input, blocks); + } +#endif + #if defined(BOTAN_HAS_SHA2_32_ARMV8) if(CPUID::has_arm_sha2()) { @@ -59,8 +66,8 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest, #endif uint32_t A = digest[0], B = digest[1], C = digest[2], - D = digest[3], E = digest[4], F = digest[5], - G = digest[6], H = digest[7]; + D = digest[3], E = digest[4], F = digest[5], + G = digest[6], H = digest[7]; for(size_t i = 0; i != blocks; ++i) { @@ -97,6 +104,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest, SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE); SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7); SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174); + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1); SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, 
W02, 0xEFBE4786); SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6); @@ -113,6 +121,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest, SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147); SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351); SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967); + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85); SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138); SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC); @@ -129,6 +138,7 @@ void SHA_256::compress_digest(secure_vector<uint32_t>& digest, SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624); SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585); SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070); + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116); SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08); SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C); diff --git a/src/lib/hash/sha2_32/sha2_32.h b/src/lib/hash/sha2_32/sha2_32.h index 6a0d87ac7..bc883f77a 100644 --- a/src/lib/hash/sha2_32/sha2_32.h +++ b/src/lib/hash/sha2_32/sha2_32.h @@ -66,6 +66,12 @@ class BOTAN_PUBLIC_API(2,0) SHA_256 final : public MDx_HashFunction size_t blocks); #endif +#if defined(BOTAN_HAS_SHA2_32_X86_BMI2) + static void compress_digest_x86_bmi2(secure_vector<uint32_t>& digest, + const uint8_t input[], + size_t blocks); +#endif + #if defined(BOTAN_HAS_SHA2_32_X86) static void compress_digest_x86(secure_vector<uint32_t>& digest, const uint8_t input[], diff --git a/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt b/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt new file mode 100644 index 000000000..dc7349716 --- /dev/null +++ b/src/lib/hash/sha2_32/sha2_32_bmi2/info.txt @@ -0,0 +1,10 @@ +<defines> +SHA2_32_X86_BMI2 -> 20180526 +</defines> + +need_isa bmi2 + +<cc> +gcc +clang +</cc> diff --git 
a/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp b/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp new file mode 100644 index 000000000..12ceb11c4 --- /dev/null +++ b/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp @@ -0,0 +1,139 @@ +/* +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/sha2_32.h> +#include <botan/rotate.h> + +namespace Botan { + +/* +Your eyes do not deceive you; this is currently just a copy of the +baseline SHA-256 implementation. Because we compile it with BMI2 +flags, GCC and Clang use the BMI2 instructions without further help. + +Likely instruction scheduling could be improved by using inline asm. +*/ + +#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) do { \ + uint32_t A_rho = rotr<2>(A) ^ rotr<13>(A) ^ rotr<22>(A); \ + uint32_t E_rho = rotr<6>(E) ^ rotr<11>(E) ^ rotr<25>(E); \ + uint32_t M2_sigma = rotr<17>(M2) ^ rotr<19>(M2) ^ (M2 >> 10); \ + uint32_t M4_sigma = rotr<7>(M4) ^ rotr<18>(M4) ^ (M4 >> 3); \ + H += magic + E_rho + ((E & F) ^ (~E & G)) + M1; \ + D += H; \ + H += A_rho + ((A & B) | ((A | B) & C)); \ + M1 += M2_sigma + M3 + M4_sigma; \ + } while(0); + +void SHA_256::compress_digest_x86_bmi2(secure_vector<uint32_t>& digest, + const uint8_t input[], + size_t blocks) + { + uint32_t A = digest[0], B = digest[1], C = digest[2], + D = digest[3], E = digest[4], F = digest[5], + G = digest[6], H = digest[7]; + + for(size_t i = 0; i != blocks; ++i) + { + uint32_t W00 = load_be<uint32_t>(input, 0); + uint32_t W01 = load_be<uint32_t>(input, 1); + uint32_t W02 = load_be<uint32_t>(input, 2); + uint32_t W03 = load_be<uint32_t>(input, 3); + uint32_t W04 = load_be<uint32_t>(input, 4); + uint32_t W05 = load_be<uint32_t>(input, 5); + uint32_t W06 = load_be<uint32_t>(input, 6); + uint32_t W07 = load_be<uint32_t>(input, 7); + uint32_t W08 = load_be<uint32_t>(input, 8); + uint32_t W09 = load_be<uint32_t>(input, 9); + uint32_t W10 = load_be<uint32_t>(input, 
10); + uint32_t W11 = load_be<uint32_t>(input, 11); + uint32_t W12 = load_be<uint32_t>(input, 12); + uint32_t W13 = load_be<uint32_t>(input, 13); + uint32_t W14 = load_be<uint32_t>(input, 14); + uint32_t W15 = load_be<uint32_t>(input, 15); + + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x428A2F98); + SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x71374491); + SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0xB5C0FBCF); + SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0xE9B5DBA5); + SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x3956C25B); + SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x59F111F1); + SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x923F82A4); + SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0xAB1C5ED5); + SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xD807AA98); + SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x12835B01); + SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x243185BE); + SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x550C7DC3); + SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x72BE5D74); + SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE); + SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7); + SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174); + + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1); + SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0xEFBE4786); + SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6); + SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x240CA1CC); + SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x2DE92C6F); + SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4A7484AA); + SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5CB0A9DC); + SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x76F988DA); + SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x983E5152); + SHA2_32_F(H, A, B, C, D, E, 
F, G, W09, W07, W02, W10, 0xA831C66D); + SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xB00327C8); + SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xBF597FC7); + SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xC6E00BF3); + SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147); + SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351); + SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967); + + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85); + SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138); + SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC); + SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x53380D13); + SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x650A7354); + SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x766A0ABB); + SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x81C2C92E); + SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x92722C85); + SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xA2BFE8A1); + SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA81A664B); + SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xC24B8B70); + SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xC76C51A3); + SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xD192E819); + SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624); + SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585); + SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070); + + SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116); + SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08); + SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C); + SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x34B0BCB5); + SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x391C0CB3); + SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4ED8AA4A); + SHA2_32_F(C, D, E, F, 
G, H, A, B, W06, W04, W15, W07, 0x5B9CCA4F); + SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x682E6FF3); + SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x748F82EE); + SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x78A5636F); + SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x84C87814); + SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x8CC70208); + SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x90BEFFFA); + SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xA4506CEB); + SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xBEF9A3F7); + SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC67178F2); + + A = (digest[0] += A); + B = (digest[1] += B); + C = (digest[2] += C); + D = (digest[3] += D); + E = (digest[4] += E); + F = (digest[5] += F); + G = (digest[6] += G); + H = (digest[7] += H); + + input += 64; + } + } + +} diff --git a/src/lib/utils/cpuid/cpuid.cpp b/src/lib/utils/cpuid/cpuid.cpp index 9dc56d59c..d0489cb67 100644 --- a/src/lib/utils/cpuid/cpuid.cpp +++ b/src/lib/utils/cpuid/cpuid.cpp @@ -46,6 +46,7 @@ std::string CPUID::to_string() CPUID_PRINT(avx512f); CPUID_PRINT(rdtsc); + CPUID_PRINT(bmi1); CPUID_PRINT(bmi2); CPUID_PRINT(adx); @@ -145,6 +146,10 @@ CPUID::bit_from_string(const std::string& tok) return {Botan::CPUID::CPUID_AVX2_BIT}; if(tok == "sha") return {Botan::CPUID::CPUID_SHA_BIT}; + if(tok == "bmi2") + return {Botan::CPUID::CPUID_BMI2_BIT}; + if(tok == "adx") + return {Botan::CPUID::CPUID_ADX_BIT}; #elif defined(BOTAN_TARGET_CPU_IS_PPC_FAMILY) if(tok == "altivec" || tok == "simd") diff --git a/src/lib/utils/cpuid/cpuid.h b/src/lib/utils/cpuid/cpuid.h index 4c0f1668b..633824a6c 100644 --- a/src/lib/utils/cpuid/cpuid.h +++ b/src/lib/utils/cpuid/cpuid.h @@ -98,6 +98,7 @@ class BOTAN_PUBLIC_API(2,1) CPUID final CPUID_RDTSC_BIT = (1ULL << 10), CPUID_BMI2_BIT = (1ULL << 11), CPUID_ADX_BIT = (1ULL << 12), + CPUID_BMI1_BIT = (1ULL << 13), // Crypto-specific ISAs CPUID_AESNI_BIT = (1ULL << 16), 
@@ -215,6 +216,12 @@ class BOTAN_PUBLIC_API(2,1) CPUID final { return has_cpuid_bit(CPUID_AVX512F_BIT); } /** + * Check if the processor supports BMI1 + */ + static bool has_bmi1() + { return has_cpuid_bit(CPUID_BMI1_BIT); } + + /** * Check if the processor supports BMI2 */ static bool has_bmi2() diff --git a/src/lib/utils/cpuid/cpuid_x86.cpp b/src/lib/utils/cpuid/cpuid_x86.cpp index be6c75a55..5387a801e 100644 --- a/src/lib/utils/cpuid/cpuid_x86.cpp +++ b/src/lib/utils/cpuid/cpuid_x86.cpp @@ -121,6 +121,7 @@ uint64_t CPUID::detect_cpu_features(size_t* cache_line_size) X86_CPUID_SUBLEVEL(7, 0, cpuid); enum x86_CPUID_7_bits : uint64_t { + BMI1 = (1ULL << 3), AVX2 = (1ULL << 5), BMI2 = (1ULL << 8), AVX512F = (1ULL << 16), @@ -132,8 +133,18 @@ uint64_t CPUID::detect_cpu_features(size_t* cache_line_size) if(flags7 & x86_CPUID_7_bits::AVX2) features_detected |= CPUID::CPUID_AVX2_BIT; - if(flags7 & x86_CPUID_7_bits::BMI2) - features_detected |= CPUID::CPUID_BMI2_BIT; + if(flags7 & x86_CPUID_7_bits::BMI1) + { + features_detected |= CPUID::CPUID_BMI1_BIT; + /* + We only set the BMI2 bit if BMI1 is also supported, so BMI2 + code can safely use both extensions. No known processor + implements BMI2 but not BMI1. + */ + if(flags7 & x86_CPUID_7_bits::BMI2) + features_detected |= CPUID::CPUID_BMI2_BIT; + } + if(flags7 & x86_CPUID_7_bits::AVX512F) features_detected |= CPUID::CPUID_AVX512F_BIT; if(flags7 & x86_CPUID_7_bits::RDSEED) diff --git a/src/tests/data/hash/sha2_32.vec b/src/tests/data/hash/sha2_32.vec index 3ff472e89..af2491fd6 100644 --- a/src/tests/data/hash/sha2_32.vec +++ b/src/tests/data/hash/sha2_32.vec @@ -1,4 +1,4 @@ -#test cpuid sha armv8sha2 +#test cpuid sha armv8sha2 bmi2 [SHA-224] In = |