Modify the implementation of multiplication mod 65537 used in IDEA to

be branch-free. This reduces performance noticeably on my Core2 (from 32 MiB/s to a bit over 27 MiB/s), but so it goes. The IDEA implementation using SSE2 is already branch-free here, and runs at about 135 MiB/s on my machine. Also add more IDEA tests, generated by OpenSSL.
author: lloyd <[email protected]> 2010-04-30 16:25:01 +0000
committer: lloyd <[email protected]> 2010-04-30 16:25:01 +0000
commit: 1e10b45b171fde455d32ed34a3aafa0bf90f3b4e (patch)
tree: 8444300de580a32744ba32aec5a9e77c5d66e607 /src
parent: 18d5d5fa3f58d2ecd15ac130eda909f44d9c6f71 (diff)
1 files changed, 13 insertions, 10 deletions
diff --git a/src/block/idea/idea.cpp b/src/block/idea/idea.cpp
index 15ff7c0ec..0c5dfed42 100644
--- a/src/block/idea/idea.cpp
+++ b/src/block/idea/idea.cpp
@@ -1,6 +1,6 @@
 /*
 * IDEA
-* (C) 1999-2007 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
 *
 * Distributed under the terms of the Botan license
 */
@@ -17,15 +17,18 @@ namespace {
 */
 inline u16bit mul(u16bit x, u16bit y)
    {
-   if(x && y)
-      {
-      u32bit T = static_cast<u32bit>(x) * y;
-      x = static_cast<u16bit>(T >> 16);
-      y = static_cast<u16bit>(T & 0xFFFF);
-      return static_cast<u16bit>(y - x + ((y < x) ? 1 : 0));
-      }
-   else
-      return static_cast<u16bit>(1 - x - y);
+   const u32bit P = static_cast<u32bit>(x) * y;
+
+   // P ? 0xFFFF : 0
+   const u16bit P_mask = !P - 1;
+
+   const u32bit P_hi = P >> 16;
+   const u32bit P_lo = P & 0xFFFF;
+
+   const u16bit r_1 = (P_lo - P_hi) + (P_lo < P_hi);
+   const u16bit r_2 = 1 - x - y;
+
+   return (r_1 & P_mask) | (r_2 & ~P_mask);
    }
 
 /*
author	lloyd <[email protected]>	2010-04-30 16:25:01 +0000
committer	lloyd <[email protected]>	2010-04-30 16:25:01 +0000
commit	1e10b45b171fde455d32ed34a3aafa0bf90f3b4e (patch)
tree	8444300de580a32744ba32aec5a9e77c5d66e607 /src
parent	18d5d5fa3f58d2ecd15ac130eda909f44d9c6f71 (diff)