/* * 64x64->128 bit multiply operation * (C) 2013 Jack Lloyd * * Distributed under the terms of the Botan license */ #ifndef BOTAN_UTIL_MUL128_H__ #define BOTAN_UTIL_MUL128_H__ #include namespace Botan { #if defined(__SIZEOF_INT128__) #define BOTAN_TARGET_HAS_NATIVE_UINT128 typedef unsigned __int128 uint128_t; #elif (BOTAN_GCC_VERSION > 440) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT) #define BOTAN_TARGET_HAS_NATIVE_UINT128 typedef unsigned int uint128_t __attribute__((mode(TI))); #endif } #if defined(BOTAN_TARGET_HAS_NATIVE_UINT128) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) \ do { \ const uint128_t r = (uint128_t)a * b; \ *hi = (r >> 64) & 0xFFFFFFFFFFFFFFFF; \ *lo = (r ) & 0xFFFFFFFFFFFFFFFF; \ } while(0) #elif defined(BOTAN_BUILD_COMPILER_IS_MSVC) && defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT) #include #pragma intrinsic(_umul128) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) \ do { *lo = _umul128(a, b, hi); } while(0) #elif defined(BOTAN_USE_GCC_INLINE_ASM) #if defined(BOTAN_TARGET_ARCH_IS_X86_64) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do { \ asm("mulq %3" : "=d" (*hi), "=a" (*lo) : "a" (a), "rm" (b) : "cc"); \ } while(0) #elif defined(BOTAN_TARGET_ARCH_IS_ALPHA) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do { \ asm("umulh %1,%2,%0" : "=r" (*hi) : "r" (a), "r" (b)); \ *lo = a * b; \ } while(0) #elif defined(BOTAN_TARGET_ARCH_IS_IA64) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do { \ asm("xmpy.hu %0=%1,%2" : "=f" (*hi) : "f" (a), "f" (b)); \ *lo = a * b; \ } while(0) #elif defined(BOTAN_TARGET_ARCH_IS_PPC64) #define BOTAN_FAST_64X64_MUL(a,b,lo,hi) do { \ asm("mulhdu %0,%1,%2" : "=r" (*hi) : "r" (a), "r" (b) : "cc"); \ *lo = a * b; \ } while(0) #endif #endif namespace Botan { /** * Perform a 64x64->128 bit multiplication */ inline void mul64x64_128(u64bit a, u64bit b, u64bit* lo, u64bit* hi) { #if defined(BOTAN_FAST_64X64_MUL) BOTAN_FAST_64X64_MUL(a, b, lo, hi); #else /* * Do a 64x64->128 multiply using four 32x32->64 multiplies plus * some adds and shifts. Last resort for CPUs like UltraSPARC (with * 64-bit registers/ALU, but no 64x64->128 multiply) or 32-bit CPUs. */ const size_t HWORD_BITS = 32; const u32bit HWORD_MASK = 0xFFFFFFFF; const u32bit a_hi = (a >> HWORD_BITS); const u32bit a_lo = (a & HWORD_MASK); const u32bit b_hi = (b >> HWORD_BITS); const u32bit b_lo = (b & HWORD_MASK); u64bit x0 = static_cast(a_hi) * b_hi; u64bit x1 = static_cast(a_lo) * b_hi; u64bit x2 = static_cast(a_hi) * b_lo; u64bit x3 = static_cast(a_lo) * b_lo; // this cannot overflow as (2^32-1)^2 + 2^32-1 < 2^64-1 x2 += x3 >> HWORD_BITS; // this one can overflow x2 += x1; // propagate the carry if any x0 += static_cast(static_cast(x2 < x1)) << HWORD_BITS; *hi = x0 + (x2 >> HWORD_BITS); *lo = ((x2 & HWORD_MASK) << HWORD_BITS) + (x3 & HWORD_MASK); #endif } } #endif