author     Jack Lloyd <[email protected]>    2017-05-22 17:52:16 -0400
committer  Jack Lloyd <[email protected]>    2017-05-22 17:52:16 -0400
commit     1da4a72b281ec25b8fc6c7d55d32c3f7558c63fb (patch)
tree       57f2ae84fd406e647a918a6d8773f68403c00bef /src/lib
parent     89e6af6b94de14a4325d6f761b1bf597b6eb450f (diff)
parent     e184202bdf92bf8c8b8a145e0d064251b6c06630 (diff)
Merge GH #844 Add support for ARMv8 SHA instructions
Diffstat (limited to 'src/lib')
-rw-r--r--   src/lib/hash/sha1/sha160.cpp                            7
-rw-r--r--   src/lib/hash/sha1/sha160.h                              6
-rw-r--r--   src/lib/hash/sha1/sha1_armv8/info.txt                  13
-rw-r--r--   src/lib/hash/sha1/sha1_armv8/sha1_armv8.cpp           205
-rw-r--r--   src/lib/hash/sha2_32/sha2_32.cpp                       57
-rw-r--r--   src/lib/hash/sha2_32/sha2_32.h                         27
-rw-r--r--   src/lib/hash/sha2_32/sha2_32_armv8/info.txt            13
-rw-r--r--   src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp  202
-rw-r--r--   src/lib/hash/sha2_32/sha2_32_x86/sha2_32_x86.cpp        2

9 files changed, 488 insertions(+), 44 deletions(-)
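This merge follows the pattern already used for the SSE2 and x86 SHA-NI backends: each hardware path is guarded by a build-time module define plus a run-time CPUID check, falling through to the portable implementation. Schematically (a sketch of the dispatch pattern the hunks below implement, not literal Botan code):

// Hardware paths are tried in order; each needs both the module to be
// compiled in (#if) and the CPU to report the feature at run time (if).
void SHA_160::compress_n(const uint8_t input[], size_t blocks)
   {
#if defined(BOTAN_HAS_SHA1_ARMV8)
   if(CPUID::has_arm_sha1())
      return sha1_armv8_compress_n(m_digest, input, blocks);
#endif

#if defined(BOTAN_HAS_SHA1_SSE2)
   if(CPUID::has_sse2())
      return sse2_compress_n(m_digest, input, blocks);
#endif

   // ... portable scalar compression ...
   }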
diff --git a/src/lib/hash/sha1/sha160.cpp b/src/lib/hash/sha1/sha160.cpp
index 13f9c24d7..6ebdba73f 100644
--- a/src/lib/hash/sha1/sha160.cpp
+++ b/src/lib/hash/sha1/sha160.cpp
@@ -68,6 +68,13 @@ void SHA_160::compress_n(const uint8_t input[], size_t blocks)
       }
 #endif
 
+#if defined(BOTAN_HAS_SHA1_ARMV8)
+   if(CPUID::has_arm_sha1())
+      {
+      return sha1_armv8_compress_n(m_digest, input, blocks);
+      }
+#endif
+
 #if defined(BOTAN_HAS_SHA1_SSE2)
    if(CPUID::has_sse2())
       {
diff --git a/src/lib/hash/sha1/sha160.h b/src/lib/hash/sha1/sha160.h
index 7333ca827..f2ed61b64 100644
--- a/src/lib/hash/sha1/sha160.h
+++ b/src/lib/hash/sha1/sha160.h
@@ -32,6 +32,12 @@ class BOTAN_DLL SHA_160 final : public MDx_HashFunction
    private:
       void compress_n(const uint8_t[], size_t blocks) override;
 
+#if defined(BOTAN_HAS_SHA1_ARMV8)
+      static void sha1_armv8_compress_n(secure_vector<uint32_t>& digest,
+                                        const uint8_t blocks[],
+                                        size_t block_count);
+#endif
+
 #if defined(BOTAN_HAS_SHA1_SSE2)
       static void sse2_compress_n(secure_vector<uint32_t>& digest,
                                   const uint8_t blocks[],
diff --git a/src/lib/hash/sha1/sha1_armv8/info.txt b/src/lib/hash/sha1/sha1_armv8/info.txt
new file mode 100644
index 000000000..7e9f7ba22
--- /dev/null
+++ b/src/lib/hash/sha1/sha1_armv8/info.txt
@@ -0,0 +1,13 @@
+<defines>
+SHA1_ARMV8 -> 20170117
+</defines>
+
+<arch>
+#arm32
+arm64
+</arch>
+
+<cc>
+gcc:4.9
+clang:3.8
+</cc>
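Note that the info.txt above restricts the module to arm64 (arm32 is commented out) and to gcc 4.9 / clang 3.8 as minimum compiler versions. CPUID::has_arm_sha1() is Botan's own feature test; on Linux/AArch64 the underlying query is typically the hwcap auxiliary vector, roughly as below (a sketch of the OS mechanism, not Botan's actual CPUID implementation):

#include <sys/auxv.h>   // getauxval
#include <asm/hwcap.h>  // HWCAP_SHA1, HWCAP_SHA2 (Linux/AArch64 only)

// True if the kernel reports the ARMv8 SHA-1 and SHA-2 crypto extensions.
bool cpu_has_arm_sha()
   {
   const unsigned long hwcap = getauxval(AT_HWCAP);
   return (hwcap & HWCAP_SHA1) != 0 && (hwcap & HWCAP_SHA2) != 0;
   }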
diff --git a/src/lib/hash/sha1/sha1_armv8/sha1_armv8.cpp b/src/lib/hash/sha1/sha1_armv8/sha1_armv8.cpp
new file mode 100644
index 000000000..97e56bfd0
--- /dev/null
+++ b/src/lib/hash/sha1/sha1_armv8/sha1_armv8.cpp
@@ -0,0 +1,205 @@
+/*
+* SHA-1 using CPU instructions in ARMv8
+*
+* Contributed by Jeffrey Walton. Based on public domain code by
+* Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/sha160.h>
+#include <arm_neon.h>
+
+namespace Botan {
+
+/*
+* SHA-1 using CPU instructions in ARMv8
+*/
+//static
+BOTAN_FUNC_ISA("+crypto")
+void SHA_160::sha1_armv8_compress_n(secure_vector<uint32_t>& digest, const uint8_t input8[], size_t blocks)
+   {
+   uint32x4_t C0, C1, C2, C3;
+   uint32x4_t ABCD, ABCD_SAVED;
+   uint32_t E0, E0_SAVED, E1;
+
+   // Load initial values
+   C0 = vdupq_n_u32(0x5A827999);
+   C1 = vdupq_n_u32(0x6ED9EBA1);
+   C2 = vdupq_n_u32(0x8F1BBCDC);
+   C3 = vdupq_n_u32(0xCA62C1D6);
+
+   ABCD = vld1q_u32(&digest[0]);
+   E0 = digest[4];
+
+   // Intermediate void* cast due to http://llvm.org/bugs/show_bug.cgi?id=20670
+   const uint32_t* input32 = reinterpret_cast<const uint32_t*>(reinterpret_cast<const void*>(input8));
+
+   while (blocks)
+      {
+      uint32x4_t MSG0, MSG1, MSG2, MSG3;
+      uint32x4_t TMP0, TMP1;
+
+      // Save current hash
+      ABCD_SAVED = ABCD;
+      E0_SAVED = E0;
+
+      MSG0 = vld1q_u32(input32 + 0);
+      MSG1 = vld1q_u32(input32 + 4);
+      MSG2 = vld1q_u32(input32 + 8);
+      MSG3 = vld1q_u32(input32 + 12);
+
+      MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+      MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+      MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+      MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+      TMP0 = vaddq_u32(MSG0, C0);
+      TMP1 = vaddq_u32(MSG1, C0);
+
+      // Rounds 0-3
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, C0);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      // Rounds 4-7
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, C0);
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      // Rounds 8-11
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, C0);
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      // Rounds 12-15
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, C1);
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      // Rounds 16-19
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, C1);
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      // Rounds 20-23
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, C1);
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      // Rounds 24-27
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, C1);
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      // Rounds 28-31
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, C1);
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      // Rounds 32-35
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, C2);
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      // Rounds 36-39
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, C2);
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      // Rounds 40-43
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, C2);
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      // Rounds 44-47
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, C2);
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      // Rounds 48-51
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, C2);
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      // Rounds 52-55
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, C3);
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+      MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+      // Rounds 56-59
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG0, C3);
+      MSG1 = vsha1su1q_u32(MSG1, MSG0);
+      MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+      // Rounds 60-63
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG1, C3);
+      MSG2 = vsha1su1q_u32(MSG2, MSG1);
+      MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+      // Rounds 64-67
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+      TMP0 = vaddq_u32(MSG2, C3);
+      MSG3 = vsha1su1q_u32(MSG3, MSG2);
+      MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+      // Rounds 68-71
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+      TMP1 = vaddq_u32(MSG3, C3);
+      MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+      // Rounds 72-75
+      E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+      // Rounds 76-79
+      E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+      ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+      // Add state back
+      E0 += E0_SAVED;
+      ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+
+      input32 += 64/4;
+      blocks--;
+      }
+
+   // Save digest
+   vst1q_u32(&digest[0], ABCD);
+   digest[4] = E0;
+   }
+
+}
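For reference, the vsha1cq_u32 / vsha1pq_u32 / vsha1mq_u32 intrinsics used above map onto the three classic SHA-1 round functions, which is why the quad-round groups switch intrinsics at rounds 20, 40 and 60. Scalar equivalents (a sketch for orientation, not part of the patch):

#include <cstdint>

// vsha1cq_u32 -> Ch     (rounds 0-19)
// vsha1pq_u32 -> Parity (rounds 20-39 and 60-79)
// vsha1mq_u32 -> Maj    (rounds 40-59)
inline uint32_t sha1_ch(uint32_t b, uint32_t c, uint32_t d)     { return (b & c) | (~b & d); }
inline uint32_t sha1_parity(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
inline uint32_t sha1_maj(uint32_t b, uint32_t c, uint32_t d)    { return (b & c) | (b & d) | (c & d); }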
diff --git a/src/lib/hash/sha2_32/sha2_32.cpp b/src/lib/hash/sha2_32/sha2_32.cpp
index c4d76b0f6..ab6903fa7 100644
--- a/src/lib/hash/sha2_32/sha2_32.cpp
+++ b/src/lib/hash/sha2_32/sha2_32.cpp
@@ -11,51 +11,40 @@
 
 namespace Botan {
 
-namespace {
-
-namespace SHA2_32 {
-
-/*
-* SHA-256 Rho Function
-*/
-inline uint32_t rho(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t rot3)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^
-           rotate_right(X, rot3));
-   }
-
-/*
-* SHA-256 Sigma Function
-*/
-inline uint32_t sigma(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t shift)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^ (X >> shift));
-   }
-
 /*
 * SHA-256 F1 Function
 *
 * Use a macro as many compilers won't inline a function this big,
 * even though it is much faster if inlined.
 */
-#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) \
-   do {                                                          \
-      H += magic + rho(E, 6, 11, 25) + ((E & F) ^ (~E & G)) + M1; \
-      D += H;                                                     \
-      H += rho(A, 2, 13, 22) + ((A & B) | ((A | B) & C));         \
-      M1 += sigma(M2, 17, 19, 10) + M3 + sigma(M4, 7, 18, 3);     \
+#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) do {               \
+   uint32_t A_rho = rotate_right(A, 2) ^ rotate_right(A, 13) ^ rotate_right(A, 22); \
+   uint32_t E_rho = rotate_right(E, 6) ^ rotate_right(E, 11) ^ rotate_right(E, 25); \
+   uint32_t M2_sigma = rotate_right(M2, 17) ^ rotate_right(M2, 19) ^ (M2 >> 10);    \
+   uint32_t M4_sigma = rotate_right(M4, 7) ^ rotate_right(M4, 18) ^ (M4 >> 3);      \
+   H += magic + E_rho + ((E & F) ^ (~E & G)) + M1;                                  \
+   D += H;                                                                          \
+   H += A_rho + ((A & B) | ((A | B) & C));                                          \
+   M1 += M2_sigma + M3 + M4_sigma;                                                  \
    } while(0);
 
 /*
 * SHA-224 / SHA-256 compression function
 */
-void compress(secure_vector<uint32_t>& digest,
-              const uint8_t input[], size_t blocks)
+void SHA_256::compress_digest(secure_vector<uint32_t>& digest,
+                              const uint8_t input[], size_t blocks)
    {
 #if defined(BOTAN_HAS_SHA2_32_X86)
    if(CPUID::has_intel_sha())
      {
-      return sha2_compress_x86(digest.data(), input, blocks);
+      return SHA_256::compress_digest_x86(digest, input, blocks);
+      }
+#endif
+
+#if defined(BOTAN_HAS_SHA2_32_ARMV8)
+   if(CPUID::has_arm_sha2())
+      {
+      return SHA_256::compress_digest_armv8(digest, input, blocks);
       }
 #endif
 
@@ -160,16 +149,12 @@ void compress(secure_vector<uint32_t>& digest,
       }
    }
 
-}
-
-}
-
 /*
 * SHA-224 compression function
 */
 void SHA_224::compress_n(const uint8_t input[], size_t blocks)
    {
-   SHA2_32::compress(m_digest, input, blocks);
+   SHA_256::compress_digest(m_digest, input, blocks);
    }
 
 /*
@@ -201,7 +186,7 @@ void SHA_224::clear()
 */
 void SHA_256::compress_n(const uint8_t input[], size_t blocks)
    {
-   SHA2_32::compress(m_digest, input, blocks);
+   SHA_256::compress_digest(m_digest, input, blocks);
    }
 
 /*
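The rewritten SHA2_32_F macro above simply inlines the old rho/sigma helpers. The same round written as a plain function for readability (a sketch, not part of the patch; the patch keeps the macro because many compilers won't inline a function this large):

#include <cstdint>

static inline uint32_t rotr32(uint32_t x, int n)
   { return (x >> n) | (x << (32 - n)); }

// One SHA-256 round plus message-schedule update, mirroring SHA2_32_F.
static inline void sha2_32_round(uint32_t A, uint32_t B, uint32_t C, uint32_t& D,
                                 uint32_t E, uint32_t F, uint32_t G, uint32_t& H,
                                 uint32_t& M1, uint32_t M2, uint32_t M3, uint32_t M4,
                                 uint32_t magic)
   {
   const uint32_t A_rho    = rotr32(A, 2) ^ rotr32(A, 13) ^ rotr32(A, 22);  // Sigma0(a)
   const uint32_t E_rho    = rotr32(E, 6) ^ rotr32(E, 11) ^ rotr32(E, 25);  // Sigma1(e)
   const uint32_t M2_sigma = rotr32(M2, 17) ^ rotr32(M2, 19) ^ (M2 >> 10);  // sigma1
   const uint32_t M4_sigma = rotr32(M4, 7) ^ rotr32(M4, 18) ^ (M4 >> 3);    // sigma0
   H += magic + E_rho + ((E & F) ^ (~E & G)) + M1; // T1 = h + Sigma1(e) + Ch(e,f,g) + K + W
   D += H;                                         // new e = d + T1
   H += A_rho + ((A & B) | ((A | B) & C));         // new a = T1 + Sigma0(a) + Maj(a,b,c)
   M1 += M2_sigma + M3 + M4_sigma;                 // extend the message schedule
   }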
diff --git a/src/lib/hash/sha2_32/sha2_32.h b/src/lib/hash/sha2_32/sha2_32.h
index 5a687efbe..ecf2e0ece 100644
--- a/src/lib/hash/sha2_32/sha2_32.h
+++ b/src/lib/hash/sha2_32/sha2_32.h
@@ -49,20 +49,33 @@ class BOTAN_DLL SHA_256 final : public MDx_HashFunction
       SHA_256() : MDx_HashFunction(64, true, true), m_digest(8)
         { clear(); }
 
+      /*
+      * Perform a SHA-256 compression. For internal use
+      */
+      static void compress_digest(secure_vector<uint32_t>& digest,
+                                  const uint8_t input[],
+                                  size_t blocks);
+
    private:
+
+#if defined(BOTAN_HAS_SHA2_32_ARMV8)
+      static void compress_digest_armv8(secure_vector<uint32_t>& digest,
+                                        const uint8_t input[],
+                                        size_t blocks);
+#endif
+
+#if defined(BOTAN_HAS_SHA2_32_X86)
+      static void compress_digest_x86(secure_vector<uint32_t>& digest,
+                                      const uint8_t input[],
+                                      size_t blocks);
+#endif
+
       void compress_n(const uint8_t[], size_t blocks) override;
       void copy_out(uint8_t[]) override;
 
       secure_vector<uint32_t> m_digest;
    };
 
-#if defined(BOTAN_HAS_SHA2_32_X86)
-/*
-* SHA-256 compression using Goldmont x86 extensions. Not for public consumption.
-*/
-void sha2_compress_x86(uint32_t digest[8], const uint8_t input[], size_t blocks);
-#endif
-
 }
 
 #endif
diff --git a/src/lib/hash/sha2_32/sha2_32_armv8/info.txt b/src/lib/hash/sha2_32/sha2_32_armv8/info.txt
new file mode 100644
index 000000000..88e5c2b80
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_armv8/info.txt
@@ -0,0 +1,13 @@
+<defines>
+SHA2_32_ARMV8 -> 20170117
+</defines>
+
+<arch>
+#arm32
+arm64
+</arch>
+
+<cc>
+gcc:4.9
+clang:3.8
+</cc>
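Like the SHA-1 file above, the implementation that follows loads the message as 32-bit words and byte-reverses each lane into the big-endian order SHA-2 requires. One way to express the same load without the intermediate pointer cast (a standalone sketch assuming an AArch64 target, not the patch's code):

#include <arm_neon.h>
#include <stdint.h>

// Load 16 message bytes and byte-swap each 32-bit lane to big-endian.
uint32x4_t load_be_u32x4(const uint8_t* p)
   {
   uint8x16_t bytes = vld1q_u8(p);   // plain byte load, no aliasing cast needed
   bytes = vrev32q_u8(bytes);        // reverse bytes within each 32-bit lane
   return vreinterpretq_u32_u8(bytes);
   }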
diff --git a/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp b/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp
new file mode 100644
index 000000000..9bf05047d
--- /dev/null
+++ b/src/lib/hash/sha2_32/sha2_32_armv8/sha2_32_armv8.cpp
@@ -0,0 +1,202 @@
+/*
+* SHA-256 using CPU instructions in ARMv8
+*
+* Contributed by Jeffrey Walton. Based on public domain code by
+* Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/sha2_32.h>
+#include <arm_neon.h>
+
+namespace Botan {
+
+/*
+* SHA-256 using CPU instructions in ARMv8
+*/
+//static
+BOTAN_FUNC_ISA("+crypto")
+void SHA_256::compress_digest_armv8(secure_vector<uint32_t>& digest, const uint8_t input8[], size_t blocks)
+   {
+   static const uint32_t K[] = {
+      0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+      0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+      0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+      0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+      0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+      0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+      0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+      0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+      0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+      0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+      0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+      0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+      0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+      0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+      0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+      0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+   };
+
+   uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
+   uint32x4_t MSG0, MSG1, MSG2, MSG3;
+   uint32x4_t TMP0, TMP1, TMP2;
+
+   // Load initial values
+   STATE0 = vld1q_u32(&digest[0]);
+   STATE1 = vld1q_u32(&digest[4]);
+
+   // Intermediate void* cast due to http://llvm.org/bugs/show_bug.cgi?id=20670
+   const uint32_t* input32 = reinterpret_cast<const uint32_t*>(reinterpret_cast<const void*>(input8));
+
+   while (blocks)
+      {
+      // Save current state
+      ABEF_SAVE = STATE0;
+      CDGH_SAVE = STATE1;
+
+      MSG0 = vld1q_u32(input32 + 0);
+      MSG1 = vld1q_u32(input32 + 4);
+      MSG2 = vld1q_u32(input32 + 8);
+      MSG3 = vld1q_u32(input32 + 12);
+
+      MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+      MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+      MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+      MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00]));
+
+      // Rounds 0-3
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 4-7
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 8-11
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 12-15
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 16-19
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x14]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 20-23
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 24-27
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 28-31
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 32-35
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 36-39
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 40-43
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 44-47
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 48-51
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 52-55
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Rounds 56-59
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 60-63
+      TMP2 = STATE0;
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Add back to state
+      STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
+      STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
+
+      input32 += 64/4;
+      blocks--;
+      }
+
+   // Save state
+   vst1q_u32(&digest[0], STATE0);
+   vst1q_u32(&digest[4], STATE1);
+   }
+
+}
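The K[] table above is the standard FIPS 180-4 constant table: the first 32 bits of the fractional parts of the cube roots of the first 64 primes. A quick self-contained check of the first entry (a sketch, not part of the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
   {
   // frac(cbrt(2)) * 2^32, truncated, yields the first round constant.
   const double frac = std::cbrt(2.0) - 1.0;
   const uint32_t k0 = static_cast<uint32_t>(frac * 4294967296.0);
   std::printf("K[0] = 0x%08X\n", k0);   // prints K[0] = 0x428A2F98
   return 0;
   }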
diff --git a/src/lib/hash/sha2_32/sha2_32_x86/sha2_32_x86.cpp b/src/lib/hash/sha2_32/sha2_32_x86/sha2_32_x86.cpp
index 8f90ec5a9..aadb48710 100644
--- a/src/lib/hash/sha2_32/sha2_32_x86/sha2_32_x86.cpp
+++ b/src/lib/hash/sha2_32/sha2_32_x86/sha2_32_x86.cpp
@@ -12,7 +12,7 @@
 namespace Botan {
 
 // called from sha2_32.cpp
-void sha2_compress_x86(uint32_t digest[8], const uint8_t input[], size_t blocks)
+void SHA_256::compress_digest_x86(secure_vector<uint32_t>& digest, const uint8_t input[], size_t blocks)
    {
    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
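Callers need no changes to benefit from this merge: the ARMv8 path is selected automatically whenever CPUID reports the SHA extensions. A minimal usage sketch against the public Botan 2 API (assuming a build with the SHA-2 module enabled):

#include <botan/hash.h>
#include <botan/hex.h>
#include <iostream>

int main()
   {
   // create() looks the hash up by name; the ARMv8, SHA-NI or portable
   // compression path is chosen internally at run time.
   auto sha256 = Botan::HashFunction::create("SHA-256");
   if(!sha256)
      return 1; // hash not compiled into this build

   sha256->update("abc");
   std::cout << Botan::hex_encode(sha256->final()) << "\n";
   // Expected: BA7816BF8F01CFEA414140DE5DAE2223B00361A396177A9CB410FF61F20015AD
   return 0;
   }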