Unroll the loops in SHA_160::hash that unpack the input and perform the expansion.

While I would prefer to have the compiler do this, using GCC 4.1.2 it is 4% faster on a Core2 Q6600 with the loops partially unrolled.
author: lloyd <[email protected]> 2008-03-09 19:10:57 +0000
committer: lloyd <[email protected]> 2008-03-09 19:10:57 +0000
commit: 78788d72f74ab1e6f0f3f37d4aa60a6df3d26bc8 (patch)
tree: e63ddbbc68d0c9ea3ec3f11b2b80c693ffc38bab /src
parent: f2bd1be860136bebd63f487b996f77f148c0aae3 (diff)
1 files changed, 15 insertions, 4 deletions
diff --git a/src/sha160.cpp b/src/sha160.cpp
index 359d0a790..5d9afb684 100644
--- a/src/sha160.cpp
+++ b/src/sha160.cpp
@@ -54,10 +54,21 @@ inline void F4(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
 *************************************************/
 void SHA_160::hash(const byte input[])
    {
-   for(u32bit j = 0; j != 16; ++j)
-      W[j] = load_be<u32bit>(input, j);
-   for(u32bit j = 16; j != 80; ++j)
-      W[j] = rotate_left((W[j-3] ^ W[j-8] ^ W[j-14] ^ W[j-16]), 1);
+   for(u32bit j = 0; j != 16; j += 4)
+      {
+      W[j  ] = load_be<u32bit>(input, j);
+      W[j+1] = load_be<u32bit>(input, j+1);
+      W[j+2] = load_be<u32bit>(input, j+2);
+      W[j+3] = load_be<u32bit>(input, j+3);
+      }
+
+   for(u32bit j = 16; j != 80; j += 4)
+      {
+      W[j  ] = rotate_left((W[j-3] ^ W[j-8] ^ W[j-14] ^ W[j-16]), 1);
+      W[j+1] = rotate_left((W[j-2] ^ W[j-7] ^ W[j-13] ^ W[j-15]), 1);
+      W[j+2] = rotate_left((W[j-1] ^ W[j-6] ^ W[j-12] ^ W[j-14]), 1);
+      W[j+3] = rotate_left((W[j  ] ^ W[j-5] ^ W[j-11] ^ W[j-13]), 1);
+      }
 
    u32bit A = digest[0], B = digest[1], C = digest[2],
           D = digest[3], E = digest[4];
author	lloyd <[email protected]>	2008-03-09 19:10:57 +0000
committer	lloyd <[email protected]>	2008-03-09 19:10:57 +0000
commit	78788d72f74ab1e6f0f3f37d4aa60a6df3d26bc8 (patch)
tree	e63ddbbc68d0c9ea3ec3f11b2b80c693ffc38bab /src
parent	f2bd1be860136bebd63f487b996f77f148c0aae3 (diff)