author    | lloyd <[email protected]> | 2013-07-30 18:13:00 +0000
committer | lloyd <[email protected]> | 2013-07-30 18:13:00 +0000
commit    | 929a271f0c8e1eed79527d0663d75cd371b9841a
tree      | c0c4d4027ed04c53e6a425107b1b7fcd2bc04803
parent    | 1e420da500081dc11d60affc73933e980285d59e
Add a generic 64x64->128 multiplication op.
Use it to merge mp_msvc64 (was using MSVC _umul128 intrinsic) and
mp_asm64 (was using inline asm) into mp_word64, which calls the new
mul64x64_128 function. That function wraps any available compiler
intrinsics or CPU instructions.
-rwxr-xr-x | configure.py | 12
-rw-r--r-- | src/build-data/arch/alpha.txt | 1
-rw-r--r-- | src/build-data/arch/ia64.txt | 2
-rw-r--r-- | src/build-data/arch/mips64.txt | 2
-rw-r--r-- | src/build-data/arch/ppc64.txt | 1
-rw-r--r-- | src/build-data/arch/s390x.txt | 1
-rw-r--r-- | src/build-data/arch/sparc64.txt | 1
-rw-r--r-- | src/build-data/arch/x86_64.txt | 1
-rw-r--r-- | src/math/mp/info.txt | 2
-rw-r--r-- | src/math/mp/mp_asm64/info.txt | 24
-rw-r--r-- | src/math/mp/mp_asm64/mp_asm.h | 120
-rw-r--r-- | src/math/mp/mp_msvc64/mp_asm.h | 61
-rw-r--r-- | src/math/mp/mp_word64/info.txt (renamed from src/math/mp/mp_msvc64/info.txt) | 15
-rw-r--r-- | src/math/mp/mp_word64/mp_asm.h | 57
-rw-r--r-- | src/utils/info.txt | 1
-rw-r--r-- | src/utils/mul128.h | 123
16 files changed, 210 insertions, 214 deletions
diff --git a/configure.py b/configure.py
index 338af8766..87ec8aa4a 100755
--- a/configure.py
+++ b/configure.py
@@ -645,13 +645,17 @@ class ArchInfo(object):
                       ['aliases', 'submodels', 'submodel_aliases', 'isa_extensions'],
                       { 'endian': None,
                         'family': None,
-                        'unaligned': 'no'
+                        'unaligned': 'no',
+                        'wordsize': None
                         })
 
         self.submodel_aliases = force_to_dict(self.submodel_aliases)
 
         self.unaligned_ok = (1 if self.unaligned == 'ok' else 0)
 
+        if self.wordsize is not None:
+            self.wordsize = int(self.wordsize)
+
     """
     Return a list of all submodels for this arch, ordered longest to shortest
@@ -697,6 +701,12 @@ class ArchInfo(object):
         if self.family is not None:
             macros.append('TARGET_CPU_IS_%s_FAMILY' % (self.family.upper()))
 
+        if self.wordsize is not None:
+            macros.append('TARGET_CPU_NATIVE_WORD_SIZE %d' % (self.wordsize))
+
+        if self.wordsize == 64:
+            macros.append('TARGET_CPU_HAS_NATIVE_64BIT')
+
         macros.append('TARGET_UNALIGNED_MEMORY_ACCESS_OK %d' % (unaligned_ok))
 
         return macros
diff --git a/src/build-data/arch/alpha.txt b/src/build-data/arch/alpha.txt
index 2bf72edef..233691b9e 100644
--- a/src/build-data/arch/alpha.txt
+++ b/src/build-data/arch/alpha.txt
@@ -1,4 +1,5 @@
 endian little
+wordsize 64
 
 <aliases>
 axp
diff --git a/src/build-data/arch/ia64.txt b/src/build-data/arch/ia64.txt
index 55967d5ab..46b40eff8 100644
--- a/src/build-data/arch/ia64.txt
+++ b/src/build-data/arch/ia64.txt
@@ -1,3 +1,5 @@
+wordsize 64
+
 <aliases>
 itanium
 itanic
diff --git a/src/build-data/arch/mips64.txt b/src/build-data/arch/mips64.txt
index d6f481346..9a56a0334 100644
--- a/src/build-data/arch/mips64.txt
+++ b/src/build-data/arch/mips64.txt
@@ -1,3 +1,5 @@
+wordsize 64
+
 <aliases>
 mips64el
 </aliases>
diff --git a/src/build-data/arch/ppc64.txt b/src/build-data/arch/ppc64.txt
index 07436c19d..fa1dab674 100644
--- a/src/build-data/arch/ppc64.txt
+++ b/src/build-data/arch/ppc64.txt
@@ -1,6 +1,7 @@
 endian big
 family ppc
+wordsize 64
 
 <aliases>
 powerpc64
diff --git a/src/build-data/arch/s390x.txt b/src/build-data/arch/s390x.txt
index 6f4271607..0fec592b4 100644
--- a/src/build-data/arch/s390x.txt
+++ b/src/build-data/arch/s390x.txt
@@ -1,5 +1,6 @@
 endian big
 unaligned ok
+wordsize 64
 
 <submodels>
 s390x
diff --git a/src/build-data/arch/sparc64.txt b/src/build-data/arch/sparc64.txt
index 3a6acd6c3..62dd69be9 100644
--- a/src/build-data/arch/sparc64.txt
+++ b/src/build-data/arch/sparc64.txt
@@ -1,5 +1,6 @@
 family sparc
+wordsize 64
 
 <submodels>
 ultrasparc
diff --git a/src/build-data/arch/x86_64.txt b/src/build-data/arch/x86_64.txt
index 608249101..e3e6f18e1 100644
--- a/src/build-data/arch/x86_64.txt
+++ b/src/build-data/arch/x86_64.txt
@@ -1,5 +1,6 @@
 endian little
 unaligned ok
+wordsize 64
 
 family x86
diff --git a/src/math/mp/info.txt b/src/math/mp/info.txt
index bf7f40d3c..531eee4e4 100644
--- a/src/math/mp/info.txt
+++ b/src/math/mp/info.txt
@@ -19,5 +19,5 @@ mp_core.h
 </header:internal>
 
 <requires>
-mp_x86_64|mp_msvc64|mp_asm64|mp_x86_32|mp_x86_32_msvc|mp_generic
+mp_x86_64|mp_word64|mp_x86_32|mp_x86_32_msvc|mp_generic
 </requires>
diff --git a/src/math/mp/mp_asm64/info.txt b/src/math/mp/mp_asm64/info.txt
deleted file mode 100644
index 9af7c4ae7..000000000
--- a/src/math/mp/mp_asm64/info.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-mp_bits 64
-
-load_on dep
-
-<header:internal>
-mp_asm.h
-mp_generic:mp_asmi.h
-</header:internal>
-
-<arch>
-alpha
-ia64
-mips64
-ppc64
-sparc64
-</arch>
-
-# The inline asm only works with gcc, but it looks like (at least on
-# UltraSPARC), using 64-bit words and the sythensized multiply is a 5 to 25%
-# win, so it's probably worth using elsewhere.
-<cc>
-gcc
-sunstudio
-</cc>
diff --git a/src/math/mp/mp_asm64/mp_asm.h b/src/math/mp/mp_asm64/mp_asm.h
deleted file mode 100644
index 625ea1c4f..000000000
--- a/src/math/mp/mp_asm64/mp_asm.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
-* MPI Multiply-Add Core
-* (C) 1999-2007 Jack Lloyd
-*
-* Distributed under the terms of the Botan license
-*/
-
-#ifndef BOTAN_MP_MADD_H__
-#define BOTAN_MP_MADD_H__
-
-#include <botan/mp_types.h>
-
-namespace Botan {
-
-#if (BOTAN_MP_WORD_BITS != 64)
-   #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64
-#endif
-
-#if defined(BOTAN_TARGET_ARCH_IS_ALPHA)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                      \
-   asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b));    \
-   z1 = a * b;                                              \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                      \
-   asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b));  \
-   z1 = a * b;                                              \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                            \
-   asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc");  \
-   z1 = a * b;                                                    \
-} while(0);
-
-#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64)
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) do {                        \
-   typedef unsigned int uint128_t __attribute__((mode(TI)));  \
-   uint128_t r = (uint128_t)a * b;                            \
-   z0 = (r >> 64) & 0xFFFFFFFFFFFFFFFF;                       \
-   z1 = (r      ) & 0xFFFFFFFFFFFFFFFF;                       \
-} while(0);
-
-#else
-
-// Do a 64x64->128 multiply using four 64x64->64 multiplies
-// plus some adds and shifts. Last resort for CPUs like UltraSPARC,
-// with 64-bit registers/ALU, but no 64x64->128 multiply.
-inline void bigint_2word_mul(word a, word b, word* z1, word* z0)
-   {
-   const size_t MP_HWORD_BITS = BOTAN_MP_WORD_BITS / 2;
-   const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1;
-
-   const word a_hi = (a >> MP_HWORD_BITS);
-   const word a_lo = (a & MP_HWORD_MASK);
-   const word b_hi = (b >> MP_HWORD_BITS);
-   const word b_lo = (b & MP_HWORD_MASK);
-
-   word x0 = a_hi * b_hi;
-   word x1 = a_lo * b_hi;
-   word x2 = a_hi * b_lo;
-   word x3 = a_lo * b_lo;
-
-   x2 += x3 >> (MP_HWORD_BITS);
-   x2 += x1;
-
-   if(x2 < x1) // timing channel
-      x0 += ((word)1 << MP_HWORD_BITS);
-
-   *z0 = x0 + (x2 >> MP_HWORD_BITS);
-   *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK);
-   }
-
-#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0)
-
-#endif
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd2(word a, word b, word* c)
-   {
-   word z0 = 0, z1 = 0;
-
-   BOTAN_WORD_MUL(a, b, z1, z0);
-
-   z1 += *c;
-   z0 += (z1 < *c);
-
-   *c = z0;
-   return z1;
-   }
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd3(word a, word b, word c, word* d)
-   {
-   word z0 = 0, z1 = 0;
-
-   BOTAN_WORD_MUL(a, b, z1, z0);
-
-   z1 += c;
-   z0 += (z1 < c);
-
-   z1 += *d;
-   z0 += (z1 < *d);
-
-   *d = z0;
-   return z1;
-   }
-
-}
-
-#endif
diff --git a/src/math/mp/mp_msvc64/mp_asm.h b/src/math/mp/mp_msvc64/mp_asm.h
deleted file mode 100644
index 8e4535c35..000000000
--- a/src/math/mp/mp_msvc64/mp_asm.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
-* Multiply-Add for 64-bit MSVC
-* (C) 2010 Jack Lloyd
-*
-* Distributed under the terms of the Botan license
-*/
-
-#ifndef BOTAN_MP_ASM_H__
-#define BOTAN_MP_ASM_H__
-
-#include <botan/mp_types.h>
-#include <intrin.h>
-
-#if (BOTAN_MP_WORD_BITS != 64)
-   #error The mp_msvc64 module requires that BOTAN_MP_WORD_BITS == 64
-#endif
-
-#pragma intrinsic(_umul128)
-
-namespace Botan {
-
-extern "C" {
-
-/*
-* Word Multiply
-*/
-inline word word_madd2(word a, word b, word* c)
-   {
-   word hi, lo;
-   lo = _umul128(a, b, &hi);
-
-   lo += *c;
-   hi += (lo < *c); // carry?
-
-   *c = hi;
-   return lo;
-   }
-
-/*
-* Word Multiply/Add
-*/
-inline word word_madd3(word a, word b, word c, word* d)
-   {
-   word hi, lo;
-   lo = _umul128(a, b, &hi);
-
-   lo += c;
-   hi += (lo < c); // carry?
-
-   lo += *d;
-   hi += (lo < *d); // carry?
-
-   *d = hi;
-   return lo;
-   }
-
-}
-
-}
-
-#endif
diff --git a/src/math/mp/mp_msvc64/info.txt b/src/math/mp/mp_word64/info.txt
index fa7d90fed..a12221f4e 100644
--- a/src/math/mp/mp_msvc64/info.txt
+++ b/src/math/mp/mp_word64/info.txt
@@ -1,17 +1,18 @@
-load_on dep
-
 mp_bits 64
 
+load_on dep
+
 <header:internal>
 mp_asm.h
 mp_generic:mp_asmi.h
 </header:internal>
 
 <arch>
-x86_64
+alpha
 ia64
+mips64
+ppc64
+s390x
+sparc64
+x86_64
 </arch>
-
-<cc>
-msvc
-</cc>
diff --git a/src/math/mp/mp_word64/mp_asm.h b/src/math/mp/mp_word64/mp_asm.h
new file mode 100644
index 000000000..76d2bb918
--- /dev/null
+++ b/src/math/mp/mp_word64/mp_asm.h
@@ -0,0 +1,57 @@
+/*
+* MPI Multiply-Add Core
+* (C) 1999-2007 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_MADD_H__
+#define BOTAN_MP_MADD_H__
+
+#include <botan/mp_types.h>
+#include <botan/internal/mul128.h>
+
+namespace Botan {
+
+#if (BOTAN_MP_WORD_BITS != 64)
+   #error The mp_word64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+/*
+* Word Multiply/Add
+*/
+inline word word_madd2(word a, word b, word* c)
+   {
+   word z0 = 0, z1 = 0;
+
+   mul64x64_128(a, b, &z1, &z0);
+
+   z1 += *c;
+   z0 += (z1 < *c);
+
+   *c = z0;
+   return z1;
+   }
+
+/*
+* Word Multiply/Add
+*/
+inline word word_madd3(word a, word b, word c, word* d)
+   {
+   word z0 = 0, z1 = 0;
+
+   mul64x64_128(a, b, &z1, &z0);
+
+   z1 += c;
+   z0 += (z1 < c);
+
+   z1 += *d;
+   z0 += (z1 < *d);
+
+   *d = z0;
+   return z1;
+   }
+
+}
+
+#endif
diff --git a/src/utils/info.txt b/src/utils/info.txt
index b8e72f42e..1d77b87a7 100644
--- a/src/utils/info.txt
+++ b/src/utils/info.txt
@@ -15,6 +15,7 @@ zero_mem.cpp
 
 <header:internal>
 bit_ops.h
+mul128.h
 prefetch.h
 rounding.h
 semaphore.h
diff --git a/src/utils/mul128.h b/src/utils/mul128.h
new file mode 100644
index 000000000..83d6f5aa6
--- /dev/null
+++ b/src/utils/mul128.h
@@ -0,0 +1,123 @@
+/*
+* 64x64->128 bit multiply operation
+* (C) 2013 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_UTIL_MUL128_H__
+#define BOTAN_UTIL_MUL128_H__
+
+#include <botan/types.h>
+
+namespace Botan {
+
+#if defined(__SIZEOF_INT128__)
+   #define BOTAN_TARGET_HAS_NATIVE_UINT128
+   typedef unsigned __int128 uint128_t;
+
+#elif (BOTAN_GCC_VERSION > 440) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+   #define BOTAN_TARGET_HAS_NATIVE_UINT128
+   typedef unsigned int uint128_t __attribute__((mode(TI)));
+#endif
+
+}
+
+#if defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi)      \
+   do {                                      \
+      const uint128_t r = (uint128_t)a * b;  \
+      *hi = (r >> 64) & 0xFFFFFFFFFFFFFFFF;  \
+      *lo = (r      ) & 0xFFFFFFFFFFFFFFFF;  \
+   } while(0)
+
+#elif defined(BOTAN_BUILD_COMPILER_IS_MSVC) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+
+#include <intrin.h>
+#pragma intrinsic(_umul128)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) \
+   do { *lo = _umul128(a, b, hi); } while(0)
+
+#elif defined(BOTAN_USE_GCC_INLINE_ASM)
+
+#if defined(BOTAN_TARGET_ARCH_IS_X86_64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                           \
+   asm("mulq %3" : "=d" (*hi), "=a" (*lo) : "a" (a), "rm" (b) : "cc"); \
+   } while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_ALPHA)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {              \
+   asm("umulh %1,%2,%0" : "=r" (*hi) : "r" (a), "r" (b)); \
+   *lo = a * b;                                           \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                \
+   asm("xmpy.hu %0=%1,%2" : "=f" (*hi) : "f" (a), "f" (b)); \
+   *lo = a * b;                                             \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
+
+#define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do {                       \
+   asm("mulhdu %0,%1,%2" : "=r" (*hi) : "r" (a), "r" (b) : "cc");  \
+   *lo = a * b;                                                    \
+} while(0)
+
+#endif
+
+#endif
+
+namespace Botan {
+
+/**
+* Perform a 64x64->128 bit multiplication
+*/
+inline void mul64x64_128(u64bit a, u64bit b, u64bit* lo, u64bit* hi)
+   {
+#if defined(BOTAN_FAST_64X64_MUL)
+   BOTAN_FAST_64X64_MUL(a, b, lo, hi);
+#else
+
+   /*
+   * Do a 64x64->128 multiply using four 32x32->64 multiplies plus
+   * some adds and shifts. Last resort for CPUs like UltraSPARC (with
+   * 64-bit registers/ALU, but no 64x64->128 multiply) or 32-bit CPUs.
+   */
+   const size_t HWORD_BITS = 32;
+   const u32bit HWORD_MASK = 0xFFFFFFFF;
+
+   const u32bit a_hi = (a >> HWORD_BITS);
+   const u32bit a_lo = (a & HWORD_MASK);
+   const u32bit b_hi = (b >> HWORD_BITS);
+   const u32bit b_lo = (b & HWORD_MASK);
+
+   u64bit x0 = static_cast<u64bit>(a_hi) * b_hi;
+   u64bit x1 = static_cast<u64bit>(a_lo) * b_hi;
+   u64bit x2 = static_cast<u64bit>(a_hi) * b_lo;
+   u64bit x3 = static_cast<u64bit>(a_lo) * b_lo;
+
+   // this cannot overflow as (2^32-1)^2 + 2^32-1 < 2^64-1
+   x2 += x3 >> HWORD_BITS;
+
+   // this one can overflow
+   x2 += x1;
+
+   // propagate the carry if any
+   x0 += static_cast<u64bit>(static_cast<bool>(x2 < x1)) << HWORD_BITS;
+
+   *hi = x0 + (x2 >> HWORD_BITS);
+   *lo = ((x2 & HWORD_MASK) << HWORD_BITS) + (x3 & HWORD_MASK);
+#endif
+   }
+
+}
+
+#endif