Write functions to handle loading and saving words a block at a time, taking into

account endian differences. The current code does not take advantage of the knowledge of which endianness we are running on; an optimization suggested by Yves Jerschow is to use (unsafe) casts to speed up the load/store operations. This turns out to provide large performance increases (30% or more) in some cases. Even without the unsafe casts, this version seems to average a few percent faster, probably because the longer loading loops have been partially or fully unrolled. This also makes the code implementing low-level algorithms like ciphers and hashes a bit more succinct.
author: lloyd <[email protected]> 2007-05-31 03:25:19 +0000
committer: lloyd <[email protected]> 2007-05-31 03:25:19 +0000
commit: 55608e7dd1aa593944f967f2549564e4f42b654e (patch)
tree: ec2ec03a762a6dac82eb608487d5394370135624 /src/square.cpp
parent: 22ecdc45a0efa4c444d0b7010b7cd743aeb68c57 (diff)
1 files changed, 16 insertions, 18 deletions
diff --git a/src/square.cpp b/src/square.cpp
index 7d7cf1da5..988e56ef5 100644
--- a/src/square.cpp
+++ b/src/square.cpp
@@ -117,7 +117,7 @@ void Square::key(const byte key[], u32bit)
    {
    SecureBuffer<u32bit, 36> XEK, XDK;
    for(u32bit j = 0; j != 4; ++j)
-      XEK[j] = make_u32bit(key[4*j], key[4*j+1], key[4*j+2], key[4*j+3]);
+      XEK[j] = load_be<u32bit>(key, j);
    for(u32bit j = 0; j != 8; ++j)
       {
       XEK[4*j+4] = XEK[4*j  ] ^ rotate_left(XEK[4*j+3], 8) ^ (0x01000000 << j);
@@ -149,27 +149,25 @@ void Square::transform(u32bit round_key[4])
       { 0x03, 0x02, 0x01, 0x01 },
       { 0x01, 0x03, 0x02, 0x01 },
       { 0x01, 0x01, 0x03, 0x02 } };
-   SecureBuffer<byte, 4> A[4], B[4];
-   for(u32bit j = 0; j != 4; ++j)
-      for(u32bit k = 0; k != 4; ++k)
-         A[j][k] = get_byte(k, round_key[j]);
+
    for(u32bit j = 0; j != 4; ++j)
+      {
+      SecureBuffer<byte, 4> A, B;
+
+      store_be(round_key[j], A);
+
       for(u32bit k = 0; k != 4; ++k)
          for(u32bit l = 0; l != 4; ++l)
-            B[j][k] ^= mul(A[j][l], G[l][k]);
-   for(u32bit j = 0; j != 4; ++j)
-      round_key[j] = make_u32bit(B[j][0], B[j][1], B[j][2], B[j][3]);
-   }
+            {
+            const byte a = A[l];
+            const byte b = G[l][k];
 
-/*************************************************
-* Multiply in GF(2^8)                            *
-*************************************************/
-byte Square::mul(byte a, byte b)
-   {
-   if(a && b)
-      return ALog[(Log[a] + Log[b]) % 255];
-   else
-      return 0;
+            if(a && b)
+               B[k] ^= ALog[(Log[a] + Log[b]) % 255];
+            }
+
+      round_key[j] = load_be<u32bit>(B.begin(), 0);
+      }
    }
 
 /*************************************************
author	lloyd <[email protected]>	2007-05-31 03:25:19 +0000
committer	lloyd <[email protected]>	2007-05-31 03:25:19 +0000
commit	55608e7dd1aa593944f967f2549564e4f42b654e (patch)
tree	ec2ec03a762a6dac82eb608487d5394370135624 /src/square.cpp
parent	22ecdc45a0efa4c444d0b7010b7cd743aeb68c57 (diff)