author    | lloyd <[email protected]> | 2013-07-30 18:13:00 +0000
committer | lloyd <[email protected]> | 2013-07-30 18:13:00 +0000
commit    | 929a271f0c8e1eed79527d0663d75cd371b9841a
tree      | c0c4d4027ed04c53e6a425107b1b7fcd2bc04803
parent    | 1e420da500081dc11d60affc73933e980285d59e
Add a generic 64x64->128 multiplication op.
Use it to merge mp_msvc64 (was using MSVC _umul128 intrinsic) and
mp_asm64 (was using inline asm) into mp_word64, which calls the new
mul64x64_128 function. That function wraps any available compiler
intrinsics or CPU instructions.
-rwxr-xr-x | configure.py | 12
-rw-r--r-- | src/build-data/arch/alpha.txt | 1
-rw-r--r-- | src/build-data/arch/ia64.txt | 2
-rw-r--r-- | src/build-data/arch/mips64.txt | 2
-rw-r--r-- | src/build-data/arch/ppc64.txt | 1
-rw-r--r-- | src/build-data/arch/s390x.txt | 1
-rw-r--r-- | src/build-data/arch/sparc64.txt | 1
-rw-r--r-- | src/build-data/arch/x86_64.txt | 1
-rw-r--r-- | src/math/mp/info.txt | 2
-rw-r--r-- | src/math/mp/mp_asm64/info.txt | 24
-rw-r--r-- | src/math/mp/mp_asm64/mp_asm.h | 120
-rw-r--r-- | src/math/mp/mp_msvc64/mp_asm.h | 61
-rw-r--r-- | src/math/mp/mp_word64/info.txt (renamed from src/math/mp/mp_msvc64/info.txt) | 15
-rw-r--r-- | src/math/mp/mp_word64/mp_asm.h | 57
-rw-r--r-- | src/utils/info.txt | 1
-rw-r--r-- | src/utils/mul128.h | 123
16 files changed, 210 insertions, 214 deletions
diff --git a/configure.py b/configure.py
index 338af8766..87ec8aa4a 100755
--- a/configure.py
+++ b/configure.py
@@ -645,13 +645,17 @@ class ArchInfo(object):
                       ['aliases', 'submodels', 'submodel_aliases', 'isa_extensions'],
                       { 'endian': None,
                         'family': None,
-                        'unaligned': 'no'
+                        'unaligned': 'no',
+                        'wordsize': None
                         })
 
         self.submodel_aliases = force_to_dict(self.submodel_aliases)
 
         self.unaligned_ok = (1 if self.unaligned == 'ok' else 0)
 
+        if self.wordsize is not None:
+            self.wordsize = int(self.wordsize)
+
     """
     Return a list of all submodels for this arch, ordered longest to shortest
@@ -697,6 +701,12 @@ class ArchInfo(object):
         if self.family is not None:
             macros.append('TARGET_CPU_IS_%s_FAMILY' % (self.family.upper()))
 
+        if self.wordsize is not None:
+            macros.append('TARGET_CPU_NATIVE_WORD_SIZE %d' % (self.wordsize))
+
+        if self.wordsize == 64:
+            macros.append('TARGET_CPU_HAS_NATIVE_64BIT')
+
         macros.append('TARGET_UNALIGNED_MEMORY_ACCESS_OK %d' % (unaligned_ok))
 
         return macros
diff --git a/src/build-data/arch/alpha.txt b/src/build-data/arch/alpha.txt
index 2bf72edef..233691b9e 100644
--- a/src/build-data/arch/alpha.txt
+++ b/src/build-data/arch/alpha.txt
@@ -1,4 +1,5 @@
 endian little
+wordsize 64
 
 <aliases>
 axp
diff --git a/src/build-data/arch/ia64.txt b/src/build-data/arch/ia64.txt
index 55967d5ab..46b40eff8 100644
--- a/src/build-data/arch/ia64.txt
+++ b/src/build-data/arch/ia64.txt
@@ -1,3 +1,5 @@
+wordsize 64
+
 <aliases>
 itanium
 itanic
diff --git a/src/build-data/arch/mips64.txt b/src/build-data/arch/mips64.txt
index d6f481346..9a56a0334 100644
--- a/src/build-data/arch/mips64.txt
+++ b/src/build-data/arch/mips64.txt
@@ -1,3 +1,5 @@
+wordsize 64
+
 <aliases>
 mips64el
 </aliases>
diff --git a/src/build-data/arch/ppc64.txt b/src/build-data/arch/ppc64.txt
index 07436c19d..fa1dab674 100644
--- a/src/build-data/arch/ppc64.txt
+++ b/src/build-data/arch/ppc64.txt
@@ -1,6 +1,7 @@
 endian big
 family ppc
+wordsize 64
 
 <aliases>
 powerpc64
diff --git a/src/build-data/arch/s390x.txt b/src/build-data/arch/s390x.txt
index 6f4271607..0fec592b4 100644
--- a/src/build-data/arch/s390x.txt
+++ b/src/build-data/arch/s390x.txt
@@ -1,5 +1,6 @@
 endian big
 unaligned ok
+wordsize 64
 
 <submodels>
 s390x
diff --git a/src/build-data/arch/sparc64.txt b/src/build-data/arch/sparc64.txt
index 3a6acd6c3..62dd69be9 100644
--- a/src/build-data/arch/sparc64.txt
+++ b/src/build-data/arch/sparc64.txt
@@ -1,5 +1,6 @@
 family sparc
+wordsize 64
 
 <submodels>
 ultrasparc
diff --git a/src/build-data/arch/x86_64.txt b/src/build-data/arch/x86_64.txt
index 608249101..e3e6f18e1 100644
--- a/src/build-data/arch/x86_64.txt
+++ b/src/build-data/arch/x86_64.txt
@@ -1,5 +1,6 @@
 endian little
 unaligned ok
+wordsize 64
 
 family x86
diff --git a/src/math/mp/info.txt b/src/math/mp/info.txt
index bf7f40d3c..531eee4e4 100644
--- a/src/math/mp/info.txt
+++ b/src/math/mp/info.txt
@@ -19,5 +19,5 @@ mp_core.h
 </header:internal>
 
 <requires>
-mp_x86_64|mp_msvc64|mp_asm64|mp_x86_32|mp_x86_32_msvc|mp_generic
+mp_x86_64|mp_word64|mp_x86_32|mp_x86_32_msvc|mp_generic
 </requires>
diff --git a/src/math/mp/mp_asm64/info.txt b/src/math/mp/mp_asm64/info.txt
deleted file mode 100644
index 9af7c4ae7..000000000
--- a/src/math/mp/mp_asm64/info.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-mp_bits 64
-
-load_on dep
-
-<header:internal>
-mp_asm.h
-mp_generic:mp_asmi.h
-</header:internal>
-
-<arch>
-alpha
-ia64
-mips64
-ppc64
-sparc64
-</arch>
-
-# The inline asm only works with gcc, but it looks like (at least on
-# UltraSPARC), using 64-bit words and the sythensized multiply is a 5 to 25%
-# win, so it's probably worth using elsewhere.
-<cc>
-gcc
-sunstudio
-</cc>
diff --git a/src/math/mp/mp_asm64/mp_asm.h b/src/math/mp/mp_asm64/mp_asm.h
deleted file mode 100644
index 625ea1c4f..000000000
--- a/src/math/mp/mp_asm64/mp_asm.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
-* MPI Multiply-Add Core
-* (C) 1999-2007 Jack Lloyd
-*
-* Distributed under the terms of the Botan license
-*/
-
-#ifndef BOTAN_MP_MADD_H__
-#define BOTAN_MP_MADD_H__
-
-#include <botan/mp_types.h>
-
-namespace Botan {
-
-#if (BOTAN_MP_WORD_BITS != 64)
-   #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64
-#endif
-
-#if defined(BOTAN_TARGET_ARCH_IS_ALPHA)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                      \
-   asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b));    \
-   z1 = a * b;                                              \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                      \
-   asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b));  \
-   z1 = a * b;                                              \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                            \
-   asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc");  \
-   z1 = a * b;                                                    \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                        \
-   typedef unsigned int uint128_t __attribute__((mode(TI)));  \
-   uint128_t r = (uint128_t)a * b;                            \
-   z0 = (r >> 64) & 0xFFFFFFFFFFFFFFFF;                       \
-   z1 = (r      ) & 0xFFFFFFFFFFFFFFFF;                       \
-} while(0);
-
-#else
-
-// Do a 64x64->128 multiply using four 64x64->64 multiplies
-// plus some adds and shifts. Last resort for CPUs like UltraSPARC,
-// with 64-bit registers/ALU, but no 64x64->128 multiply.
-inline void bigint_2word_mul(word a, word b, word* z1, word* z0)
-   {
-   const size_t MP_HWORD_BITS = BOTAN_MP_WORD_BITS / 2;
-   const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1;
-
-   const word a_hi = (a >> MP_HWORD_BITS);
-   const word a_lo = (a & MP_HWORD_MASK);
-   const word b_hi = (b >> MP_HWORD_BITS);
-   const word b_lo = (b & MP_HWORD_MASK);
-
-   word x0 = a_hi * b_hi;
-   word x1 = a_lo * b_hi;
-   word x2 = a_hi * b_lo;
-   word x3 = a_lo * b_lo;
-
-   x2 += x3 >> (MP_HWORD_BITS);
-   x2 += x1;
-
-   if(x2 < x1) // timing channel
-      x0 += ((word)1 << MP_HWORD_BITS);
-
-   *z0 = x0 + (x2 >> MP_HWORD_BITS);
-   *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK);
-   }
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0)
-
-#endif
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd2(word a, word b, word* c)
-   {
-   word z0 = 0, z1 = 0;
-
-   BOTAN_WORD_MUL(a, b, z1, z0);
-
-   z1 += *c;
-   z0 += (z1 < *c);
-
-   *c = z0;
-   return z1;
-   }
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd3(word a, word b, word c, word* d)
-   {
-   word z0 = 0, z1 = 0;
-
-   BOTAN_WORD_MUL(a, b, z1, z0);
-
-   z1 += c;
-   z0 += (z1 < c);
-
-   z1 += *d;
-   z0 += (z1 < *d);
-
-   *d = z0;
-   return z1;
-   }
-
-}
-
-#endif
diff --git a/src/math/mp/mp_msvc64/mp_asm.h b/src/math/mp/mp_msvc64/mp_asm.h
deleted file mode 100644
index 8e4535c35..000000000
--- a/src/math/mp/mp_msvc64/mp_asm.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-* Multiply-Add for 64-bit MSVC
-* (C) 2010 Jack Lloyd
-*
-* Distributed under the terms of the Botan license
-*/
-
-#ifndef BOTAN_MP_ASM_H__
-#define BOTAN_MP_ASM_H__
-
-#include <botan/mp_types.h>
-#include <intrin.h>
-
-#if (BOTAN_MP_WORD_BITS != 64)
-   #error The mp_msvc64 module requires that BOTAN_MP_WORD_BITS == 64
-#endif
-
-#pragma intrinsic(_umul128)
-
-namespace Botan {
-
-extern "C" {
-
-/*
-* Word Multiply
-*/
-inline word word_madd2(word a, word b, word* c)
-   {
-   word hi, lo;
-   lo = _umul128(a, b, &hi);
-
-   lo += *c;
-   hi += (lo < *c); // carry?
-
-   *c = hi;
-   return lo;
-   }
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd3(word a, word b, word c, word* d)
-   {
-   word hi, lo;
-   lo = _umul128(a, b, &hi);
-
-   lo += c;
-   hi += (lo < c); // carry?
-
-   lo += *d;
-   hi += (lo < *d); // carry?
-
-   *d = hi;
-   return lo;
-   }
-
-}
-
-}
-
-#endif
diff --git a/src/math/mp/mp_msvc64/info.txt b/src/math/mp/mp_word64/info.txt
index fa7d90fed..a12221f4e 100644
--- a/src/math/mp/mp_msvc64/info.txt
+++ b/src/math/mp/mp_word64/info.txt
@@ -1,17 +1,18 @@
-load_on dep
-
 mp_bits 64
 
+load_on dep
+
 <header:internal>
 mp_asm.h
 mp_generic:mp_asmi.h
 </header:internal>
 
 <arch>
-x86_64
+alpha
 ia64
+mips64
+ppc64
+s390x
+sparc64
+x86_64
 </arch>
-
-<cc>
-msvc
-</cc>
diff --git a/src/math/mp/mp_word64/mp_asm.h b/src/math/mp/mp_word64/mp_asm.h
new file mode 100644
index 000000000..76d2bb918
--- /dev/null
+++ b/src/math/mp/mp_word64/mp_asm.h
@@ -0,0 +1,57 @@
+/*
+* MPI Multiply-Add Core
+* (C) 1999-2007 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_MADD_H__
+#define BOTAN_MP_MADD_H__
+
+#include <botan/mp_types.h>
+#include <botan/internal/mul128.h>
+
+namespace Botan {
+
+#if (BOTAN_MP_WORD_BITS != 64)
+   #error The mp_word64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+/*
+* Word Multiply/Add
+*/
+inline word word_madd2(word a, word b, word* c)
+   {
+   word z0 = 0, z1 = 0;
+
+   mul64x64_128(a, b, &z1, &z0);
+
+   z1 += *c;
+   z0 += (z1 < *c);
+
+   *c = z0;
+   return z1;
+   }
+
+/*
+* Word Multiply/Add
+*/
+inline word word_madd3(word a, word b, word c, word* d)
+   {
+   word z0 = 0, z1 = 0;
+
+   mul64x64_128(a, b, &z1, &z0);
+
+   z1 += c;
+   z0 += (z1 < c);
+
+   z1 += *d;
+   z0 += (z1 < *d);
+
+   *d = z0;
+   return z1;
+   }
+
+}
+
+#endif
diff --git a/src/utils/info.txt b/src/utils/info.txt
index b8e72f42e..1d77b87a7 100644
--- a/src/utils/info.txt
+++ b/src/utils/info.txt
@@ -15,6 +15,7 @@ zero_mem.cpp
 
 <header:internal>
 bit_ops.h
+mul128.h
 prefetch.h
 rounding.h
 semaphore.h
diff --git a/src/utils/mul128.h b/src/utils/mul128.h
new file mode 100644
index 000000000..83d6f5aa6
--- /dev/null
+++ b/src/utils/mul128.h
@@ -0,0 +1,123 @@
+/*
+* 64x64->128 bit multiply operation
+* (C) 2013 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_UTIL_MUL128_H__
+#define BOTAN_UTIL_MUL128_H__
+
+#include <botan/types.h>
+
+namespace Botan {
+
+#if defined(__SIZEOF_INT128__)
+   #define BOTAN_TARGET_HAS_NATIVE_UINT128
+   typedef unsigned __int128 uint128_t;
+
+#elif (BOTAN_GCC_VERSION > 440) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+   #define BOTAN_TARGET_HAS_NATIVE_UINT128
+   typedef unsigned int uint128_t __attribute__((mode(TI)));
+#endif
+
+}
+
+#if defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi)      \
+   do {                                      \
+      const uint128_t r = (uint128_t)a * b;  \
+      *hi = (r >> 64) & 0xFFFFFFFFFFFFFFFF;  \
+      *lo = (r      ) & 0xFFFFFFFFFFFFFFFF;  \
+   } while(0)
+
+#elif defined(BOTAN_BUILD_COMPILER_IS_MSVC) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+
+#include <intrin.h>
+#pragma intrinsic(_umul128)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) \
+   do { *lo = _umul128(a, b, hi); } while(0)
+
+#elif defined(BOTAN_USE_GCC_INLINE_ASM)
+
+#if defined(BOTAN_TARGET_ARCH_IS_X86_64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                           \
+   asm("mulq %3" : "=d" (*hi), "=a" (*lo) : "a" (a), "rm" (b) : "cc"); \
+   } while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_ALPHA)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {              \
+   asm("umulh %1,%2,%0" : "=r" (*hi) : "r" (a), "r" (b)); \
+   *lo = a * b;                                           \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                \
+   asm("xmpy.hu %0=%1,%2" : "=f" (*hi) : "f" (a), "f" (b)); \
+   *lo = a * b;                                             \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                       \
+   asm("mulhdu %0,%1,%2" : "=r" (*hi) : "r" (a), "r" (b) : "cc");  \
+   *lo = a * b;                                                    \
+} while(0)
+
+#endif
+
+#endif
+
+namespace Botan {
+
+/**
+* Perform a 64x64->128 bit multiplication
+*/
+inline void mul64x64_128(u64bit a, u64bit b, u64bit* lo, u64bit* hi)
+   {
+#if defined(BOTAN_FAST_64X64_MUL)
+   BOTAN_FAST_64X64_MUL(a, b, lo, hi);
+#else
+
+   /*
+   * Do a 64x64->128 multiply using four 32x32->64 multiplies plus
+   * some adds and shifts. Last resort for CPUs like UltraSPARC (with
+   * 64-bit registers/ALU, but no 64x64->128 multiply) or 32-bit CPUs.
+   */
+   const size_t HWORD_BITS = 32;
+   const u32bit HWORD_MASK = 0xFFFFFFFF;
+
+   const u32bit a_hi = (a >> HWORD_BITS);
+   const u32bit a_lo = (a & HWORD_MASK);
+   const u32bit b_hi = (b >> HWORD_BITS);
+   const u32bit b_lo = (b & HWORD_MASK);
+
+   u64bit x0 = static_cast<u64bit>(a_hi) * b_hi;
+   u64bit x1 = static_cast<u64bit>(a_lo) * b_hi;
+   u64bit x2 = static_cast<u64bit>(a_hi) * b_lo;
+   u64bit x3 = static_cast<u64bit>(a_lo) * b_lo;
+
+   // this cannot overflow as (2^32-1)^2 + 2^32-1 < 2^64-1
+   x2 += x3 >> HWORD_BITS;
+
+   // this one can overflow
+   x2 += x1;
+
+   // propagate the carry if any
+   x0 += static_cast<u64bit>(static_cast<bool>(x2 < x1)) << HWORD_BITS;
+
+   *hi = x0 + (x2 >> HWORD_BITS);
+   *lo = ((x2 & HWORD_MASK) << HWORD_BITS) + (x3 & HWORD_MASK);
+#endif
+   }
+
+}
+
+#endif