diff options
author | lloyd <[email protected]> | 2010-09-21 15:39:02 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2010-09-21 15:39:02 +0000 |
commit | 23836879ecf24cd34850b6532854ccd895825c4d (patch) | |
tree | c4a3dde1c3682697ff78105af55059539596ec35 /src | |
parent | 911c0f587ba21c944cb420f9953ffc5e2bac7fb2 (diff) |
Clean up, hide union accesses with a macro to make it easier to test
alternative methods of getting pieces of the expanded message.
Diffstat (limited to 'src')
-rw-r--r-- | src/hash/sha1_sse2/sha1_sse2.cpp | 132 |
1 file changed, 92 insertions, 40 deletions
diff --git a/src/hash/sha1_sse2/sha1_sse2.cpp b/src/hash/sha1_sse2/sha1_sse2.cpp index 00a0752f6..500c8cf27 100644 --- a/src/hash/sha1_sse2/sha1_sse2.cpp +++ b/src/hash/sha1_sse2/sha1_sse2.cpp @@ -189,82 +189,132 @@ void SHA_160_SSE2::compress_n(const byte input_bytes[], u32bit blocks) __m128i W3 = _mm_loadu_si128(&input[3]); - F1(A, B, C, D, E, P0.u32[0]); F1(E, A, B, C, D, P0.u32[1]); - F1(D, E, A, B, C, P0.u32[2]); F1(C, D, E, A, B, P0.u32[3]); + /* + Using SSE4; slower on Core2 and Nehalem + #define GET_P_32(P, i) _mm_extract_epi32(P.u128, i) + + Much slower on all tested platforms + #define GET_P_32(P,i) _mm_cvtsi128_si32(_mm_srli_si128(P.u128, i*4)) + */ + +#define GET_P_32(P, i) P.u32[i] + + F1(A, B, C, D, E, GET_P_32(P0, 0)); + F1(E, A, B, C, D, GET_P_32(P0, 1)); + F1(D, E, A, B, C, GET_P_32(P0, 2)); + F1(C, D, E, A, B, GET_P_32(P0, 3)); prep00_15(P0, W3); - F1(B, C, D, E, A, P1.u32[0]); F1(A, B, C, D, E, P1.u32[1]); - F1(E, A, B, C, D, P1.u32[2]); F1(D, E, A, B, C, P1.u32[3]); + F1(B, C, D, E, A, GET_P_32(P1, 0)); + F1(A, B, C, D, E, GET_P_32(P1, 1)); + F1(E, A, B, C, D, GET_P_32(P1, 2)); + F1(D, E, A, B, C, GET_P_32(P1, 3)); prep(P1, W0, W1, W2, W3, K00_19); - F1(C, D, E, A, B, P2.u32[0]); F1(B, C, D, E, A, P2.u32[1]); - F1(A, B, C, D, E, P2.u32[2]); F1(E, A, B, C, D, P2.u32[3]); + F1(C, D, E, A, B, GET_P_32(P2, 0)); + F1(B, C, D, E, A, GET_P_32(P2, 1)); + F1(A, B, C, D, E, GET_P_32(P2, 2)); + F1(E, A, B, C, D, GET_P_32(P2, 3)); prep(P2, W1, W2, W3, W0, K20_39); - F1(D, E, A, B, C, P0.u32[0]); F1(C, D, E, A, B, P0.u32[1]); - F1(B, C, D, E, A, P0.u32[2]); F1(A, B, C, D, E, P0.u32[3]); + F1(D, E, A, B, C, GET_P_32(P0, 0)); + F1(C, D, E, A, B, GET_P_32(P0, 1)); + F1(B, C, D, E, A, GET_P_32(P0, 2)); + F1(A, B, C, D, E, GET_P_32(P0, 3)); prep(P0, W2, W3, W0, W1, K20_39); - F1(E, A, B, C, D, P1.u32[0]); F1(D, E, A, B, C, P1.u32[1]); - F1(C, D, E, A, B, P1.u32[2]); F1(B, C, D, E, A, P1.u32[3]); + F1(E, A, B, C, D, GET_P_32(P1, 0)); + F1(D, E, A, B, C, 
GET_P_32(P1, 1)); + F1(C, D, E, A, B, GET_P_32(P1, 2)); + F1(B, C, D, E, A, GET_P_32(P1, 3)); prep(P1, W3, W0, W1, W2, K20_39); - F2(A, B, C, D, E, P2.u32[0]); F2(E, A, B, C, D, P2.u32[1]); - F2(D, E, A, B, C, P2.u32[2]); F2(C, D, E, A, B, P2.u32[3]); + F2(A, B, C, D, E, GET_P_32(P2, 0)); + F2(E, A, B, C, D, GET_P_32(P2, 1)); + F2(D, E, A, B, C, GET_P_32(P2, 2)); + F2(C, D, E, A, B, GET_P_32(P2, 3)); prep(P2, W0, W1, W2, W3, K20_39); - F2(B, C, D, E, A, P0.u32[0]); F2(A, B, C, D, E, P0.u32[1]); - F2(E, A, B, C, D, P0.u32[2]); F2(D, E, A, B, C, P0.u32[3]); + F2(B, C, D, E, A, GET_P_32(P0, 0)); + F2(A, B, C, D, E, GET_P_32(P0, 1)); + F2(E, A, B, C, D, GET_P_32(P0, 2)); + F2(D, E, A, B, C, GET_P_32(P0, 3)); prep(P0, W1, W2, W3, W0, K20_39); - F2(C, D, E, A, B, P1.u32[0]); F2(B, C, D, E, A, P1.u32[1]); - F2(A, B, C, D, E, P1.u32[2]); F2(E, A, B, C, D, P1.u32[3]); + F2(C, D, E, A, B, GET_P_32(P1, 0)); + F2(B, C, D, E, A, GET_P_32(P1, 1)); + F2(A, B, C, D, E, GET_P_32(P1, 2)); + F2(E, A, B, C, D, GET_P_32(P1, 3)); prep(P1, W2, W3, W0, W1, K40_59); - F2(D, E, A, B, C, P2.u32[0]); F2(C, D, E, A, B, P2.u32[1]); - F2(B, C, D, E, A, P2.u32[2]); F2(A, B, C, D, E, P2.u32[3]); + F2(D, E, A, B, C, GET_P_32(P2, 0)); + F2(C, D, E, A, B, GET_P_32(P2, 1)); + F2(B, C, D, E, A, GET_P_32(P2, 2)); + F2(A, B, C, D, E, GET_P_32(P2, 3)); prep(P2, W3, W0, W1, W2, K40_59); - F2(E, A, B, C, D, P0.u32[0]); F2(D, E, A, B, C, P0.u32[1]); - F2(C, D, E, A, B, P0.u32[2]); F2(B, C, D, E, A, P0.u32[3]); + F2(E, A, B, C, D, GET_P_32(P0, 0)); + F2(D, E, A, B, C, GET_P_32(P0, 1)); + F2(C, D, E, A, B, GET_P_32(P0, 2)); + F2(B, C, D, E, A, GET_P_32(P0, 3)); prep(P0, W0, W1, W2, W3, K40_59); - F3(A, B, C, D, E, P1.u32[0]); F3(E, A, B, C, D, P1.u32[1]); - F3(D, E, A, B, C, P1.u32[2]); F3(C, D, E, A, B, P1.u32[3]); + F3(A, B, C, D, E, GET_P_32(P1, 0)); + F3(E, A, B, C, D, GET_P_32(P1, 1)); + F3(D, E, A, B, C, GET_P_32(P1, 2)); + F3(C, D, E, A, B, GET_P_32(P1, 3)); prep(P1, W1, W2, W3, W0, K40_59); - F3(B, C, 
D, E, A, P2.u32[0]); F3(A, B, C, D, E, P2.u32[1]); - F3(E, A, B, C, D, P2.u32[2]); F3(D, E, A, B, C, P2.u32[3]); + F3(B, C, D, E, A, GET_P_32(P2, 0)); + F3(A, B, C, D, E, GET_P_32(P2, 1)); + F3(E, A, B, C, D, GET_P_32(P2, 2)); + F3(D, E, A, B, C, GET_P_32(P2, 3)); prep(P2, W2, W3, W0, W1, K40_59); - F3(C, D, E, A, B, P0.u32[0]); F3(B, C, D, E, A, P0.u32[1]); - F3(A, B, C, D, E, P0.u32[2]); F3(E, A, B, C, D, P0.u32[3]); + F3(C, D, E, A, B, GET_P_32(P0, 0)); + F3(B, C, D, E, A, GET_P_32(P0, 1)); + F3(A, B, C, D, E, GET_P_32(P0, 2)); + F3(E, A, B, C, D, GET_P_32(P0, 3)); prep(P0, W3, W0, W1, W2, K60_79); - F3(D, E, A, B, C, P1.u32[0]); F3(C, D, E, A, B, P1.u32[1]); - F3(B, C, D, E, A, P1.u32[2]); F3(A, B, C, D, E, P1.u32[3]); + F3(D, E, A, B, C, GET_P_32(P1, 0)); + F3(C, D, E, A, B, GET_P_32(P1, 1)); + F3(B, C, D, E, A, GET_P_32(P1, 2)); + F3(A, B, C, D, E, GET_P_32(P1, 3)); prep(P1, W0, W1, W2, W3, K60_79); - F3(E, A, B, C, D, P2.u32[0]); F3(D, E, A, B, C, P2.u32[1]); - F3(C, D, E, A, B, P2.u32[2]); F3(B, C, D, E, A, P2.u32[3]); + F3(E, A, B, C, D, GET_P_32(P2, 0)); + F3(D, E, A, B, C, GET_P_32(P2, 1)); + F3(C, D, E, A, B, GET_P_32(P2, 2)); + F3(B, C, D, E, A, GET_P_32(P2, 3)); prep(P2, W1, W2, W3, W0, K60_79); - F4(A, B, C, D, E, P0.u32[0]); F4(E, A, B, C, D, P0.u32[1]); - F4(D, E, A, B, C, P0.u32[2]); F4(C, D, E, A, B, P0.u32[3]); + F4(A, B, C, D, E, GET_P_32(P0, 0)); + F4(E, A, B, C, D, GET_P_32(P0, 1)); + F4(D, E, A, B, C, GET_P_32(P0, 2)); + F4(C, D, E, A, B, GET_P_32(P0, 3)); prep(P0, W2, W3, W0, W1, K60_79); - F4(B, C, D, E, A, P1.u32[0]); F4(A, B, C, D, E, P1.u32[1]); - F4(E, A, B, C, D, P1.u32[2]); F4(D, E, A, B, C, P1.u32[3]); + F4(B, C, D, E, A, GET_P_32(P1, 0)); + F4(A, B, C, D, E, GET_P_32(P1, 1)); + F4(E, A, B, C, D, GET_P_32(P1, 2)); + F4(D, E, A, B, C, GET_P_32(P1, 3)); prep(P1, W3, W0, W1, W2, K60_79); - F4(C, D, E, A, B, P2.u32[0]); F4(B, C, D, E, A, P2.u32[1]); - F4(A, B, C, D, E, P2.u32[2]); F4(E, A, B, C, D, P2.u32[3]); + F4(C, D, E, A, B, 
GET_P_32(P2, 0)); + F4(B, C, D, E, A, GET_P_32(P2, 1)); + F4(A, B, C, D, E, GET_P_32(P2, 2)); + F4(E, A, B, C, D, GET_P_32(P2, 3)); - F4(D, E, A, B, C, P0.u32[0]); F4(C, D, E, A, B, P0.u32[1]); - F4(B, C, D, E, A, P0.u32[2]); F4(A, B, C, D, E, P0.u32[3]); + F4(D, E, A, B, C, GET_P_32(P0, 0)); + F4(C, D, E, A, B, GET_P_32(P0, 1)); + F4(B, C, D, E, A, GET_P_32(P0, 2)); + F4(A, B, C, D, E, GET_P_32(P0, 3)); - F4(E, A, B, C, D, P1.u32[0]); F4(D, E, A, B, C, P1.u32[1]); - F4(C, D, E, A, B, P1.u32[2]); F4(B, C, D, E, A, P1.u32[3]); + F4(E, A, B, C, D, GET_P_32(P1, 0)); + F4(D, E, A, B, C, GET_P_32(P1, 1)); + F4(C, D, E, A, B, GET_P_32(P1, 2)); + F4(B, C, D, E, A, GET_P_32(P1, 3)); A = (digest[0] += A); B = (digest[1] += B); @@ -274,6 +324,8 @@ void SHA_160_SSE2::compress_n(const byte input_bytes[], u32bit blocks) input += (HASH_BLOCK_SIZE / 16); } + +#undef GET_P_32 } } |