author | Jack Lloyd <[email protected]> | 2017-01-17 16:57:23 -0500
committer | Jack Lloyd <[email protected]> | 2017-05-20 11:14:18 -0400
commit | 832200d0caaa0c0ebff7348bb72be14146476872 (patch)
tree | f6415acb4f5253b1d0383d9e901a368f4c3cc29a /src/lib/hash/sha2_32
parent | 455a39f70e6de8376f78f318f04d07af7d245be3 (diff)
Add SHA-256 using ARMv8 instructions
Based on patch from Jeffrey Walton in GH #841
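For readers skimming the large diff below: the new sha2_32_armv8.cpp repeats one fixed pattern of ARMv8 Crypto intrinsics for each group of four SHA-256 rounds, fully unrolled sixteen times. The sketch that follows illustrates that per-group pattern only and is not part of the patch; the helper name sha256_4rounds and its standalone framing are invented for exposition, and the committed code interleaves the same operations slightly differently for scheduling reasons.

```cpp
// Illustrative sketch (not from the patch): one four-round group of SHA-256
// using the ARMv8 Crypto extensions. Build for an ARMv8 target with +crypto.
#include <arm_neon.h>

// STATE0/STATE1: the two 128-bit halves of the 256-bit hash state.
// MSG0..MSG3: the current sixteen message-schedule words; K4: K[i..i+3].
static inline void sha256_4rounds(uint32x4_t& STATE0, uint32x4_t& STATE1,
                                  uint32x4_t& MSG0, uint32x4_t MSG1,
                                  uint32x4_t MSG2, uint32x4_t MSG3,
                                  uint32x4_t K4)
   {
   const uint32x4_t TMP0 = vaddq_u32(MSG0, K4);   // W[i..i+3] + K[i..i+3]
   const uint32x4_t TMP2 = STATE0;                // save first half before it is updated
   MSG0 = vsha256su0q_u32(MSG0, MSG1);            // begin schedule update for W[i+16..i+19]
   STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);  // four rounds, first half of the state
   STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);   // four rounds, second half of the state
   MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);      // finish schedule update for W[i+16..i+19]
   }
```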
Diffstat (limited to 'src/lib/hash/sha2_32')
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32.cpp | 55
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32.h | 14
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_armv8/info.txt | 6
-rw-r--r-- | src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp | 200
4 files changed, 240 insertions, 35 deletions
diff --git a/src/lib/hash/sha2_32/sha2_32.cpp b/src/lib/hash/sha2_32/sha2_32.cpp
index c4d76b0f6..77e930699 100644
--- a/src/lib/hash/sha2_32/sha2_32.cpp
+++ b/src/lib/hash/sha2_32/sha2_32.cpp
@@ -11,46 +11,28 @@ namespace Botan {
-namespace {
-
-namespace SHA2_32 {
-
-/*
-* SHA-256 Rho Function
-*/
-inline uint32_t rho(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t rot3)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^
-           rotate_right(X, rot3));
-   }
-
-/*
-* SHA-256 Sigma Function
-*/
-inline uint32_t sigma(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t shift)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^ (X >> shift));
-   }
-
 /*
 * SHA-256 F1 Function
 *
 * Use a macro as many compilers won't inline a function this big,
 * even though it is much faster if inlined.
 */
-#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) \
-   do { \
-   H += magic + rho(E, 6, 11, 25) + ((E & F) ^ (~E & G)) + M1; \
-   D += H; \
-   H += rho(A, 2, 13, 22) + ((A & B) | ((A | B) & C)); \
-   M1 += sigma(M2, 17, 19, 10) + M3 + sigma(M4, 7, 18, 3); \
+#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) do { \
+   uint32_t A_rho = rotate_right(A, 2) ^ rotate_right(A, 13) ^ rotate_right(A, 22); \
+   uint32_t E_rho = rotate_right(E, 6) ^ rotate_right(E, 11) ^ rotate_right(E, 25); \
+   uint32_t M2_sigma = rotate_right(M2, 17) ^ rotate_right(M2, 19) ^ (M2 >> 10); \
+   uint32_t M4_sigma = rotate_right(M4, 7) ^ rotate_right(M4, 18) ^ (M4 >> 3); \
+   H += magic + E_rho + ((E & F) ^ (~E & G)) + M1; \
+   D += H; \
+   H += A_rho + ((A & B) | ((A | B) & C)); \
+   M1 += M2_sigma + M3 + M4_sigma; \
    } while(0);
 
 /*
 * SHA-224 / SHA-256 compression function
 */
-void compress(secure_vector<uint32_t>& digest,
-              const uint8_t input[], size_t blocks)
+void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
+                              const uint8_t input[], size_t blocks)
    {
 #if defined(BOTAN_HAS_SHA2_32_X86)
    if(CPUID::has_intel_sha())
@@ -59,6 +41,13 @@ void compress(secure_vector<uint32_t>& digest,
       }
 #endif
 
+#if defined(BOTAN_HAS_SHA2_32_ARMV8)
+   if(CPUID::has_arm_sha2())
+      {
+      return SHA_256::compress_digest_armv8(digest, input, blocks);
+      }
+#endif
+
    uint32_t A = digest[0], B = digest[1], C = digest[2],
             D = digest[3], E = digest[4], F = digest[5],
             G = digest[6], H = digest[7];
 
@@ -160,16 +149,12 @@ void compress(secure_vector<uint32_t>& digest,
       }
    }
 
-}
-
-}
-
 /*
 * SHA-224 compression function
 */
 void SHA_224::compress_n(const uint8_t input[], size_t blocks)
    {
-   SHA2_32::compress(m_digest, input, blocks);
+   SHA_256::compress_digest(m_digest, input, blocks);
    }
 
 /*
@@ -201,7 +186,7 @@ void SHA_224::clear()
 */
 void SHA_256::compress_n(const uint8_t input[], size_t blocks)
    {
-   SHA2_32::compress(m_digest, input, blocks);
+   SHA_256::compress_digest(m_digest, input, blocks);
    }
 
 /*
diff --git a/src/lib/hash/sha2_32/sha2_32.h b/src/lib/hash/sha2_32/sha2_32.h
index 5a687efbe..c65ae449d 100644
--- a/src/lib/hash/sha2_32/sha2_32.h
+++ b/src/lib/hash/sha2_32/sha2_32.h
@@ -49,7 +49,21 @@ class BOTAN_DLL SHA_256 final : public MDx_HashFunction
       SHA_256() : MDx_HashFunction(64, true, true), m_digest(8)
          { clear(); }
 
+      /*
+      * Perform a SHA-256 compression. For internal use
+      */
+      static void compress_digest(secure_vector<uint32_t>& digest,
+                                  const uint8_t input[],
+                                  size_t blocks);
+
    private:
+
+#if defined(BOTAN_HAS_SHA2_32_ARMV8)
+      static void compress_digest_armv8(secure_vector<uint32_t>& digest,
+                                        const uint8_t input[],
+                                        size_t blocks);
+#endif
+
       void compress_n(const uint8_t[], size_t blocks) override;
       void copy_out(uint8_t[]) override;
diff --git a/src/lib/hash/sha2_32/sha2_32_armv8/info.txt b/src/lib/hash/sha2_32/sha2_32_armv8/info.txt
new file mode 100644
index 000000000..0453416d6
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_armv8/info.txt
@@ -0,0 +1,6 @@
+define SHA2_32_ARMV8 20170117
+
+<arch>
+arm32
+arm64
+</arch>
diff --git a/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp b/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp
new file mode 100644
index 000000000..715b683c9
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp
@@ -0,0 +1,200 @@
+/*
+* SHA-256 using CPU instructions in ARMv8
+*
+* Contributed by Jeffrey Walton. Based on public domain code by
+* Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/sha2_32.h>
+#include <arm_neon.h>
+
+namespace Botan {
+
+/*
+* SHA-256 using CPU instructions in ARMv8
+*/
+//static
+BOTAN_FUNC_ISA("+crypto")
+void SHA_256::compress_digest_armv8(secure_vector<uint32_t>& digest, const uint8_t input[], size_t blocks)
+   {
+   static const uint32_t K[] = {
+      0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+      0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+      0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+      0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+      0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+      0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+      0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+      0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+      0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+      0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+      0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+      0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+      0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+      0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+      0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+      0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+   };
+
+   uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
+   uint32x4_t MSG0, MSG1, MSG2, MSG3;
+   uint32x4_t TMP0, TMP1, TMP2;
+
+   // Load initial values
+   STATE0 = vld1q_u32(&digest[0]);
+   STATE1 = vld1q_u32(&digest[4]);
+
+   while (blocks)
+      {
+      // Save current state
+      ABEF_SAVE = STATE0;
+      CDGH_SAVE = STATE1;
+
+      // Intermediate void* cast due to http://llvm.org/bugs/show_bug.cgi?id=20670
+      MSG0 = vld1q_u32((const uint32_t*)(const void*)(input + 0));
+      MSG1 = vld1q_u32((const uint32_t*)(const void*)(input + 16));
+      MSG2 = vld1q_u32((const uint32_t*)(const void*)(input + 32));
+      MSG3 = vld1q_u32((const uint32_t*)(const void*)(input + 48));
+
+      MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+      MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+      MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+      MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00]));
+
+      // Rounds 0-3
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 4-7
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 8-11
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 12-15
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 16-19
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x14]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 20-23
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 24-27
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 28-31
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 32-35
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 36-39
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 40-43
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 44-47
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 48-51
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 52-55
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Rounds 56-59
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 60-63
+      TMP2 = STATE0;
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Add back to state
+      STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
+      STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
+
+      input += 64;
+      blocks--;
+      }
+
+   // Save state
+   vst1q_u32(&digest[0], STATE0);
+   vst1q_u32(&digest[4], STATE1);
+   }
+
+}
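A quick way to sanity-check the new backend once it is built (not part of this commit): hash a known test vector through Botan's regular HashFunction interface. On a CPU where CPUID::has_arm_sha2() reports the SHA-2 extension, compress_digest() dispatches to the ARMv8 routine automatically, so the same program exercises whichever implementation the runtime selects.

```cpp
// Usage sketch, assuming Botan 2.x public headers; not part of the patch.
#include <botan/hash.h>
#include <botan/hex.h>
#include <iostream>

int main()
   {
   auto sha256 = Botan::HashFunction::create("SHA-256");
   if(!sha256)
      {
      std::cerr << "SHA-256 not available in this build\n";
      return 1;
      }

   sha256->update("abc");
   const std::string digest = Botan::hex_encode(sha256->final());

   // FIPS 180-4 test vector for SHA-256("abc")
   const std::string expected =
      "BA7816BF8F01CFEA414140DE5DAE2223B00361A396177A9CB410FF61F20015AD";

   std::cout << digest << "\n";
   return (digest == expected) ? 0 : 1;
   }
```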