diff options
Diffstat (limited to 'lib/math/mp')
-rw-r--r-- | lib/math/mp/info.txt | 23 | ||||
-rw-r--r-- | lib/math/mp/mp_asm.cpp | 184 | ||||
-rw-r--r-- | lib/math/mp/mp_comba.cpp | 920 | ||||
-rw-r--r-- | lib/math/mp/mp_core.h | 175 | ||||
-rw-r--r-- | lib/math/mp/mp_generic/info.txt | 6 | ||||
-rw-r--r-- | lib/math/mp/mp_generic/mp_asmi.h | 207 | ||||
-rw-r--r-- | lib/math/mp/mp_generic/mp_madd.h | 73 | ||||
-rw-r--r-- | lib/math/mp/mp_karat.cpp | 303 | ||||
-rw-r--r-- | lib/math/mp/mp_misc.cpp | 79 | ||||
-rw-r--r-- | lib/math/mp/mp_monty.cpp | 101 | ||||
-rw-r--r-- | lib/math/mp/mp_mulop.cpp | 77 | ||||
-rw-r--r-- | lib/math/mp/mp_shift.cpp | 138 | ||||
-rw-r--r-- | lib/math/mp/mp_types.h | 46 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_32/info.txt | 18 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_32/mp_asmi.h | 240 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_32/mp_madd.h | 67 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_32_msvc/info.txt | 16 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_32_msvc/mp_asmi.h | 542 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_64/info.txt | 18 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_64/mp_asmi.h | 248 | ||||
-rw-r--r-- | lib/math/mp/mp_x86_64/mp_madd.h | 69 |
21 files changed, 3550 insertions, 0 deletions
diff --git a/lib/math/mp/info.txt b/lib/math/mp/info.txt new file mode 100644 index 000000000..a47475f7b --- /dev/null +++ b/lib/math/mp/info.txt @@ -0,0 +1,23 @@ +define BIGINT_MP 20131128 + +<source> +mp_asm.cpp +mp_comba.cpp +mp_karat.cpp +mp_monty.cpp +mp_mulop.cpp +mp_misc.cpp +mp_shift.cpp +</source> + +<header:public> +mp_types.h +</header:public> + +<header:internal> +mp_core.h +</header:internal> + +<requires> +mp_x86_64|mp_x86_32|mp_x86_32_msvc|mp_generic +</requires> diff --git a/lib/math/mp/mp_asm.cpp b/lib/math/mp/mp_asm.cpp new file mode 100644 index 000000000..a3caba620 --- /dev/null +++ b/lib/math/mp/mp_asm.cpp @@ -0,0 +1,184 @@ +/* +* Lowest Level MPI Algorithms +* (C) 1999-2010 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +/* NOTE(review): mp_core.h is included twice below; the second include looks redundant. */ +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_asmi.h> +#include <botan/internal/mp_core.h> +#include <botan/exceptn.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/* +* Two Operand Addition, No Carry +* +* Adds y[0..y_size) into x in place: first in chunks of 8 words through +* word8_add2, then word by word through word_add (both defined elsewhere), +* and finally keeps feeding the carry through x[y_size..x_size). +* The last carry is returned to the caller instead of being stored, so +* x[x_size] is never touched here. +* NOTE(review): the loop bounds are tested with !=, so this presumably +* requires x_size >= y_size - confirm against callers. +*/ +word bigint_add2_nc(word x[], size_t x_size, const word y[], size_t y_size) + { + word carry = 0; + + const size_t blocks = y_size - (y_size % 8); + + for(size_t i = 0; i != blocks; i += 8) + carry = word8_add2(x + i, y + i, carry); + + for(size_t i = blocks; i != y_size; ++i) + x[i] = word_add(x[i], y[i], &carry); + + for(size_t i = y_size; i != x_size; ++i) + x[i] = word_add(x[i], 0, &carry); + + return carry; + } + +/* +* Three Operand Addition, No Carry +* +* Writes the word-wise sum of x and y into z and returns the final carry. +* When x_size < y_size the function calls itself once with the operands +* swapped, so the body below always runs with x as the longer (or equal) +* operand. Words z[0..x_size) of that longer operand are written; the +* carry-out word itself is not stored by this function. +*/ +word bigint_add3_nc(word z[], const word x[], size_t x_size, + const word y[], size_t y_size) + { + if(x_size < y_size) + { return bigint_add3_nc(z, y, y_size, x, x_size); } + + word carry = 0; + + const size_t blocks = y_size - (y_size % 8); + + for(size_t i = 0; i != blocks; i += 8) + carry = word8_add3(z + i, x + i, y + i, carry); + + for(size_t i = blocks; i != y_size; ++i) + z[i] = word_add(x[i], y[i], &carry); + + for(size_t i = y_size; i != x_size; ++i) + z[i] = 
word_add(x[i], 0, &carry); + + return carry; + } + +/* +* Two Operand Addition +* +* In-place x += y built on bigint_add2_nc. If that call reports a carry, +* 1 is added into x[x_size], so the x buffer must be at least +* x_size + 1 words long. +* NOTE(review): the carry is added (+=) rather than assigned, so the prior +* content of x[x_size] matters - presumably callers keep it zero; confirm. +*/ +void bigint_add2(word x[], size_t x_size, const word y[], size_t y_size) + { + if(bigint_add2_nc(x, x_size, y, y_size)) + x[x_size] += 1; + } + +/* +* Three Operand Addition +* +* z = x + y built on bigint_add3_nc. The returned carry is added (+=) into +* z[max(x_size, y_size)], so z must hold at least max(x_size, y_size) + 1 +* words and the previous value of that top word contributes to the result. +* NOTE(review): presumably callers pass a zero-initialised z - confirm. +*/ +void bigint_add3(word z[], const word x[], size_t x_size, + const word y[], size_t y_size) + { + z[(x_size > y_size ? x_size : y_size)] += + bigint_add3_nc(z, x, x_size, y, y_size); + } + +/* +* Two Operand Subtraction +* +* In-place x -= y: chunks of 8 words through word8_sub2, the remaining +* words of y through word_sub, then the borrow is propagated through +* x[y_size..x_size). Returns the final borrow. +* NOTE(review): loop bounds use !=, so x_size >= y_size is presumably +* required - confirm against callers. +*/ +word bigint_sub2(word x[], size_t x_size, const word y[], size_t y_size) + { + word borrow = 0; + + const size_t blocks = y_size - (y_size % 8); + + for(size_t i = 0; i != blocks; i += 8) + borrow = word8_sub2(x + i, y + i, borrow); + + for(size_t i = blocks; i != y_size; ++i) + x[i] = word_sub(x[i], y[i], &borrow); + + for(size_t i = y_size; i != x_size; ++i) + x[i] = word_sub(x[i], 0, &borrow); + + return borrow; + } + +/* +* Two Operand Subtraction x = y - x +* +* Overwrites x[0..y_size) with y - x; only y_size words of either operand +* are read. Throws Internal_Error if a borrow is left after the last word. +* NOTE(review): a leftover borrow means x > y (x == y subtracts cleanly), +* so the "x >= y" wording of the message looks slightly off. +* NOTE(review): the throw sits inside an extern "C" block - confirm that +* no C caller can reach this path. +*/ +void bigint_sub2_rev(word x[], const word y[], size_t y_size) + { + word borrow = 0; + + const size_t blocks = y_size - (y_size % 8); + + for(size_t i = 0; i != blocks; i += 8) + borrow = word8_sub2_rev(x + i, y + i, borrow); + + for(size_t i = blocks; i != y_size; ++i) + x[i] = word_sub(y[i], x[i], &borrow); + + if(borrow) + throw Internal_Error("bigint_sub2_rev: x >= y"); + } + +/* +* Three Operand Subtraction +* +* z = x - y over x_size words, returning the final borrow. Unlike +* bigint_add3_nc there is no operand swap here. +* NOTE(review): loop bounds use !=, so x_size >= y_size is presumably +* required - confirm against callers. +*/ +word bigint_sub3(word z[], const word x[], size_t x_size, + const word y[], size_t y_size) + { + word borrow = 0; + + const size_t blocks = y_size - (y_size % 8); + + for(size_t i = 0; i != blocks; i += 8) + borrow = word8_sub3(z + i, x + i, y + i, borrow); + + for(size_t i = blocks; i != y_size; ++i) + z[i] = word_sub(x[i], y[i], &borrow); + + for(size_t i = y_size; i != x_size; ++i) + z[i] = word_sub(x[i], 0, &borrow); + + return borrow; + } + +/* +* Two Operand Linear Multiply +* +* In-place multiplication of the x_size-word x by the single word y. +* The final carry is assigned to x[x_size], so x must hold at least +* x_size + 1 words. +*/ +void bigint_linmul2(word x[], size_t x_size, word y) + { + const size_t blocks = x_size - (x_size % 8); + + word carry = 0; + + 
for(size_t i = 0; i != blocks; i += 8) + carry = word8_linmul2(x + i, y, carry); + + for(size_t i = blocks; i != x_size; ++i) + x[i] = word_madd2(x[i], y, &carry); + + x[x_size] = carry; + } + +/* +* Three Operand Linear Multiply +* +* Writes the product of the x_size-word x and the single word y into z: +* chunks of 8 words through word8_linmul3, the rest through word_madd2. +* The final carry is assigned (not added) to z[x_size], so z must hold +* at least x_size + 1 words; x itself is only read. +*/ +void bigint_linmul3(word z[], const word x[], size_t x_size, word y) + { + const size_t blocks = x_size - (x_size % 8); + + word carry = 0; + + for(size_t i = 0; i != blocks; i += 8) + carry = word8_linmul3(z + i, x + i, y, carry); + + for(size_t i = blocks; i != x_size; ++i) + z[i] = word_madd2(x[i], y, &carry); + + z[x_size] = carry; + } + +} + +} diff --git a/lib/math/mp/mp_comba.cpp b/lib/math/mp/mp_comba.cpp new file mode 100644 index 000000000..99dcda176 --- /dev/null +++ b/lib/math/mp/mp_comba.cpp @@ -0,0 +1,920 @@ +/* +* Comba Multiplication and Squaring +* (C) 1999-2007,2011 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_asmi.h> + +namespace Botan { + +extern "C" { + +/* +* Comba 4x4 Squaring +* +* Fully unrolled: fills all 8 words of z from the 4 words of x. Three +* accumulator words (w0, w1, w2) are passed to word3_muladd / +* word3_muladd_2 in a rotating order; after the calls for one output +* position the lowest accumulator is stored to z[k] and reset to 0. +* Pairs x[i], x[j] with i != j appear once, through word3_muladd_2; the +* x[i], x[i] terms go through word3_muladd. +* NOTE(review): word3_muladd_2 presumably accumulates twice the product - +* confirm in mp_asmi.h. +*/ +void bigint_comba_sqr4(word z[8], const word x[4]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]); + z[ 1] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]); + z[ 2] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]); + z[ 4] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; + z[ 7] = w1; + } + +/* +* Comba 4x4 Multiplication +* +* Fully unrolled: fills all 8 words of z from the 4-word inputs x and y. +* Each z[k] is stored after the word3_muladd calls for every pair +* x[i], y[j] with i + j == k, using the same rotating three-word +* accumulator scheme as the squaring routine. +*/ +void bigint_comba_mul4(word z[8], const word x[4], const word y[4]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[ 0] = w0; w0 = 0; + + 
word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]); + z[ 1] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]); + z[ 2] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[ 3] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]); + z[ 4] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + z[ 6] = w0; + z[ 7] = w1; + } + +/* +* Comba 6x6 Squaring +*/ +void bigint_comba_sqr6(word z[12], const word x[6]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]); + z[ 1] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]); + z[ 2] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 4]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]); + z[ 4] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 5]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 4]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]); + z[ 5] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 5]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 4]); + z[ 7] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 5]); + word3_muladd(&w1, 
&w0, &w2, x[ 4], x[ 4]); + z[ 8] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 5], x[ 5]); + z[10] = w1; + z[11] = w2; + } + +/* +* Comba 6x6 Multiplication +*/ +void bigint_comba_mul6(word z[12], const word x[6], const word y[6]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]); + z[ 1] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]); + z[ 2] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[ 3] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 0]); + z[ 4] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 0]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + z[ 6] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 2]); + z[ 7] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 4]); + word3_muladd(&w1, &w0, 
&w2, x[ 5], y[ 3]); + z[ 8] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + z[ 9] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 5]); + z[10] = w1; + z[11] = w2; + } + +/* +* Comba 8x8 Squaring +*/ +void bigint_comba_sqr8(word z[16], const word x[8]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]); + z[ 1] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]); + z[ 2] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 4]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]); + z[ 4] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 5]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 4]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]); + z[ 5] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 6]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 5]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 4]); + z[ 7] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 6]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], x[ 4]); + z[ 8] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], x[ 5]); + 
z[10] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 4], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[ 6]); + z[11] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]); + z[12] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 6], x[ 7]); + z[13] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 7], x[ 7]); + z[14] = w2; + z[15] = w0; + } + +/* +* Comba 8x8 Multiplication +*/ +void bigint_comba_mul8(word z[16], const word x[8], const word y[8]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]); + z[ 1] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]); + z[ 2] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[ 3] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 0]); + z[ 4] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 0]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]); + z[ 6] = w0; w0 = 0; + + word3_muladd(&w0, 
&w2, &w1, x[ 0], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 0]); + z[ 7] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 1]); + z[ 8] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]); + z[ 9] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 3]); + z[10] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 4]); + z[11] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]); + z[12] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 6]); + z[13] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 7]); + z[14] = w2; + z[15] = w0; + } + +/* +* Comba 16x16 Squaring +*/ +void bigint_comba_sqr16(word z[32], const word x[16]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = 0; + + 
word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]); + z[ 1] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]); + z[ 2] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 4]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]); + z[ 4] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 5]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 4]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]); + z[ 5] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 6]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 5]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 4]); + z[ 7] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 8]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 6]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], x[ 4]); + z[ 8] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[10]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 9]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 8]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], x[ 5]); + z[10] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[11]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[10]); + word3_muladd_2(&w1, &w0, &w2, 
x[ 2], x[ 9]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 8]); + word3_muladd_2(&w1, &w0, &w2, x[ 4], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[ 6]); + z[11] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]); + z[12] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[13]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[12]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[11]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[10]); + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[ 9]); + word3_muladd_2(&w0, &w2, &w1, x[ 5], x[ 8]); + word3_muladd_2(&w0, &w2, &w1, x[ 6], x[ 7]); + z[13] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[14]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[13]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[12]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[11]); + word3_muladd_2(&w1, &w0, &w2, x[ 4], x[10]); + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[ 9]); + word3_muladd_2(&w1, &w0, &w2, x[ 6], x[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 7], x[ 7]); + z[14] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 8]); + z[15] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[15]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[14]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[13]); + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[12]); + word3_muladd_2(&w0, &w2, &w1, x[ 5], x[11]); + word3_muladd_2(&w0, &w2, &w1, x[ 6], x[10]); + word3_muladd_2(&w0, &w2, &w1, x[ 
7], x[ 9]); + word3_muladd(&w0, &w2, &w1, x[ 8], x[ 8]); + z[16] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[15]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[14]); + word3_muladd_2(&w1, &w0, &w2, x[ 4], x[13]); + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[12]); + word3_muladd_2(&w1, &w0, &w2, x[ 6], x[11]); + word3_muladd_2(&w1, &w0, &w2, x[ 7], x[10]); + word3_muladd_2(&w1, &w0, &w2, x[ 8], x[ 9]); + z[17] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], x[ 9]); + z[18] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[15]); + word3_muladd_2(&w0, &w2, &w1, x[ 5], x[14]); + word3_muladd_2(&w0, &w2, &w1, x[ 6], x[13]); + word3_muladd_2(&w0, &w2, &w1, x[ 7], x[12]); + word3_muladd_2(&w0, &w2, &w1, x[ 8], x[11]); + word3_muladd_2(&w0, &w2, &w1, x[ 9], x[10]); + z[19] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[15]); + word3_muladd_2(&w1, &w0, &w2, x[ 6], x[14]); + word3_muladd_2(&w1, &w0, &w2, x[ 7], x[13]); + word3_muladd_2(&w1, &w0, &w2, x[ 8], x[12]); + word3_muladd_2(&w1, &w0, &w2, x[ 9], x[11]); + word3_muladd(&w1, &w0, &w2, x[10], x[10]); + z[20] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[11]); + z[21] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 7], x[15]); + word3_muladd_2(&w0, &w2, &w1, x[ 8], x[14]); + word3_muladd_2(&w0, &w2, &w1, x[ 9], x[13]); + word3_muladd_2(&w0, &w2, &w1, x[10], x[12]); + word3_muladd(&w0, &w2, &w1, x[11], x[11]); + z[22] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 8], x[15]); + word3_muladd_2(&w1, &w0, &w2, x[ 9], 
x[14]); + word3_muladd_2(&w1, &w0, &w2, x[10], x[13]); + word3_muladd_2(&w1, &w0, &w2, x[11], x[12]); + z[23] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[13]); + word3_muladd(&w2, &w1, &w0, x[12], x[12]); + z[24] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[10], x[15]); + word3_muladd_2(&w0, &w2, &w1, x[11], x[14]); + word3_muladd_2(&w0, &w2, &w1, x[12], x[13]); + z[25] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[11], x[15]); + word3_muladd_2(&w1, &w0, &w2, x[12], x[14]); + word3_muladd(&w1, &w0, &w2, x[13], x[13]); + z[26] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[12], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[13], x[14]); + z[27] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[13], x[15]); + word3_muladd(&w0, &w2, &w1, x[14], x[14]); + z[28] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[14], x[15]); + z[29] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], x[15]); + z[30] = w0; + z[31] = w1; + } + +/* +* Comba 16x16 Multiplication +*/ +void bigint_comba_mul16(word z[32], const word x[16], const word y[16]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]); + z[ 1] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]); + z[ 2] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[ 3] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 0]); + z[ 4] = w1; w1 = 0; + + 
word3_muladd(&w1, &w0, &w2, x[ 0], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 0]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]); + z[ 6] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 0]); + z[ 7] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 0]); + z[ 8] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 0]); + z[ 9] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[10]); + word3_muladd(&w0, &w2, &w1, x[ 1], 
y[ 9]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 9], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[10], y[ 0]); + z[10] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[11]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[10]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 9]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 9], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[10], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[11], y[ 0]); + z[11] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 0]); + z[12] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[13]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[12]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[11]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[10]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 9]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 6]); + word3_muladd(&w0, &w2, &w1, 
x[ 8], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 9], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[10], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[11], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[12], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[13], y[ 0]); + z[13] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[14]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[13]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[12]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[11]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[10]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 9]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 9], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[10], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[11], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[12], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[13], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[14], y[ 0]); + z[14] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 0]); + z[15] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 1], y[15]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[14]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[13]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[12]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[11]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[10]); + word3_muladd(&w0, 
&w2, &w1, x[ 7], y[ 9]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[ 9], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[10], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[11], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[12], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[13], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[14], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[15], y[ 1]); + z[16] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 2], y[15]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[14]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[13]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[12]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[11]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[10]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 9]); + word3_muladd(&w1, &w0, &w2, x[ 9], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[10], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[11], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[12], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[13], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[14], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[15], y[ 2]); + z[17] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 3], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 3]); + z[18] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 4], y[15]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[14]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[13]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[12]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[11]); + word3_muladd(&w0, &w2, &w1, x[ 9], y[10]); + word3_muladd(&w0, &w2, &w1, x[10], y[ 9]); + 
word3_muladd(&w0, &w2, &w1, x[11], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[12], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[13], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[14], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[15], y[ 4]); + z[19] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 5], y[15]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[14]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[13]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[12]); + word3_muladd(&w1, &w0, &w2, x[ 9], y[11]); + word3_muladd(&w1, &w0, &w2, x[10], y[10]); + word3_muladd(&w1, &w0, &w2, x[11], y[ 9]); + word3_muladd(&w1, &w0, &w2, x[12], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[13], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[14], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[15], y[ 5]); + z[20] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 6], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[12]); + word3_muladd(&w2, &w1, &w0, x[10], y[11]); + word3_muladd(&w2, &w1, &w0, x[11], y[10]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 6]); + z[21] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 7], y[15]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[14]); + word3_muladd(&w0, &w2, &w1, x[ 9], y[13]); + word3_muladd(&w0, &w2, &w1, x[10], y[12]); + word3_muladd(&w0, &w2, &w1, x[11], y[11]); + word3_muladd(&w0, &w2, &w1, x[12], y[10]); + word3_muladd(&w0, &w2, &w1, x[13], y[ 9]); + word3_muladd(&w0, &w2, &w1, x[14], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[15], y[ 7]); + z[22] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 8], y[15]); + word3_muladd(&w1, &w0, &w2, x[ 9], y[14]); + word3_muladd(&w1, &w0, &w2, x[10], y[13]); + word3_muladd(&w1, &w0, &w2, x[11], y[12]); + word3_muladd(&w1, &w0, &w2, x[12], y[11]); + word3_muladd(&w1, &w0, &w2, x[13], y[10]); + word3_muladd(&w1, &w0, &w2, x[14], y[ 9]); + 
word3_muladd(&w1, &w0, &w2, x[15], y[ 8]); + z[23] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 9], y[15]); + word3_muladd(&w2, &w1, &w0, x[10], y[14]); + word3_muladd(&w2, &w1, &w0, x[11], y[13]); + word3_muladd(&w2, &w1, &w0, x[12], y[12]); + word3_muladd(&w2, &w1, &w0, x[13], y[11]); + word3_muladd(&w2, &w1, &w0, x[14], y[10]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 9]); + z[24] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[10], y[15]); + word3_muladd(&w0, &w2, &w1, x[11], y[14]); + word3_muladd(&w0, &w2, &w1, x[12], y[13]); + word3_muladd(&w0, &w2, &w1, x[13], y[12]); + word3_muladd(&w0, &w2, &w1, x[14], y[11]); + word3_muladd(&w0, &w2, &w1, x[15], y[10]); + z[25] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[11], y[15]); + word3_muladd(&w1, &w0, &w2, x[12], y[14]); + word3_muladd(&w1, &w0, &w2, x[13], y[13]); + word3_muladd(&w1, &w0, &w2, x[14], y[12]); + word3_muladd(&w1, &w0, &w2, x[15], y[11]); + z[26] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[12], y[15]); + word3_muladd(&w2, &w1, &w0, x[13], y[14]); + word3_muladd(&w2, &w1, &w0, x[14], y[13]); + word3_muladd(&w2, &w1, &w0, x[15], y[12]); + z[27] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[13], y[15]); + word3_muladd(&w0, &w2, &w1, x[14], y[14]); + word3_muladd(&w0, &w2, &w1, x[15], y[13]); + z[28] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[14], y[15]); + word3_muladd(&w1, &w0, &w2, x[15], y[14]); + z[29] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], y[15]); + z[30] = w0; + z[31] = w1; + } + +} + +} diff --git a/lib/math/mp/mp_core.h b/lib/math/mp/mp_core.h new file mode 100644 index 000000000..c25cb994f --- /dev/null +++ b/lib/math/mp/mp_core.h @@ -0,0 +1,175 @@ +/* +* MPI Algorithms +* (C) 1999-2010 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_MP_CORE_OPS_H__ +#define BOTAN_MP_CORE_OPS_H__ + +#include <botan/mp_types.h> + +namespace Botan { + +/* +* The size of the word type, in bits +*/ +const size_t 
MP_WORD_BITS = BOTAN_MP_WORD_BITS; + +extern "C" { + +/** +* Two operand addition +* @param x the first operand (and output) +* @param x_size size of x +* @param y the second operand +* @param y_size size of y (must be <= x_size; the earlier text said >=, which contradicts bigint_add2_nc, whose loops index x up to y_size and then continue on to x_size; TODO confirm bigint_add2 shares that precondition) +*/ +void bigint_add2(word x[], size_t x_size, + const word y[], size_t y_size); + +/** +* Three operand addition +*/ +void bigint_add3(word z[], + const word x[], size_t x_size, + const word y[], size_t y_size); + +/** +* Two operand addition with carry out: x is updated in place with y added to it (y_size may not exceed x_size) and the final carry is returned +*/ +word bigint_add2_nc(word x[], size_t x_size, const word y[], size_t y_size); + +/** +* Three operand addition with carry out: z receives the sum of x and y; the operands may be passed in either size order because the shorter one is swapped into the y position +*/ +word bigint_add3_nc(word z[], + const word x[], size_t x_size, + const word y[], size_t y_size); + +/** +* Two operand subtraction +*/ +word bigint_sub2(word x[], size_t x_size, + const word y[], size_t y_size); + +/** +* Two operand subtraction, x = y - x; assumes y >= x +*/ +void bigint_sub2_rev(word x[], const word y[], size_t y_size); + +/** +* Three operand subtraction +*/ +word bigint_sub3(word z[], + const word x[], size_t x_size, + const word y[], size_t y_size); + +/* +* Shift Operations: the 1 variants shift x in place, the 2 variants write the shifted x into y; the total shift is word_shift whole words followed by bit_shift bits +*/ +void bigint_shl1(word x[], size_t x_size, + size_t word_shift, size_t bit_shift); + +void bigint_shr1(word x[], size_t x_size, + size_t word_shift, size_t bit_shift); + +void bigint_shl2(word y[], const word x[], size_t x_size, + size_t word_shift, size_t bit_shift); + +void bigint_shr2(word y[], const word x[], size_t x_size, + size_t word_shift, size_t bit_shift); + +/* +* Simple O(N^2) Multiplication and Squaring: z is cleared first and must hold x_size plus y_size words (2*x_size words for squaring) +*/ +void bigint_simple_mul(word z[], + const word x[], size_t x_size, + const word y[], size_t y_size); + +void bigint_simple_sqr(word z[], const word x[], size_t x_size); + +/* +* Linear Multiply +*/ +void bigint_linmul2(word x[], size_t x_size, word y); +void bigint_linmul3(word z[], const word x[], size_t x_size, word y); + +/** +* Montgomery Reduction +* @param z integer to reduce, of size exactly 2*(p_size+1). 
+ Output is in the first p_size+1 words, higher + words are set to zero. +* @param p modulus +* @param p_size size of p +* @param p_dash Montgomery value +* @param workspace array of at least 2*(p_size+1) words +*/ +void bigint_monty_redc(word z[], + const word p[], size_t p_size, + word p_dash, + word workspace[]); + +/* +* Montgomery Multiplication: z = x*y is formed by bigint_mul and then reduced in place by bigint_monty_redc, with the same workspace handed to both +*/ +void bigint_monty_mul(word z[], size_t z_size, + const word x[], size_t x_size, size_t x_sw, + const word y[], size_t y_size, size_t y_sw, + const word p[], size_t p_size, word p_dash, + word workspace[]); + +/* +* Montgomery Squaring: z = x*x is formed by bigint_sqr and then reduced in place by bigint_monty_redc +*/ +void bigint_monty_sqr(word z[], size_t z_size, + const word x[], size_t x_size, size_t x_sw, + const word p[], size_t p_size, word p_dash, + word workspace[]); + +/** +* Compare x and y: returns 1 if x > y, -1 if x < y and 0 if equal; the sizes may differ, surplus high words only count when nonzero +*/ +s32bit bigint_cmp(const word x[], size_t x_size, + const word y[], size_t y_size); + +/** +* Compute ((n1<<bits) + n0) / d; n1 is first reduced mod d, so the result is the low word of the quotient (d must be nonzero) +*/ +word bigint_divop(word n1, word n0, word d); + +/** +* Compute ((n1<<bits) + n0) % d (built on bigint_divop, so d must be nonzero) +*/ +word bigint_modop(word n1, word n0, word d); + +/* +* Comba Multiplication / Squaring: fixed operand sizes, the output holds twice as many words as each input +*/ +void bigint_comba_mul4(word z[8], const word x[4], const word y[4]); +void bigint_comba_mul6(word z[12], const word x[6], const word y[6]); +void bigint_comba_mul8(word z[16], const word x[8], const word y[8]); +void bigint_comba_mul16(word z[32], const word x[16], const word y[16]); + +void bigint_comba_sqr4(word out[8], const word in[4]); +void bigint_comba_sqr6(word out[12], const word in[6]); +void bigint_comba_sqr8(word out[16], const word in[8]); +void bigint_comba_sqr16(word out[32], const word in[16]); + +} + +/* +* High Level Multiplication/Squaring Interfaces: dispatch on the x_sw and y_sw word counts to linmul, comba, Karatsuba or schoolbook code; workspace may be null, in which case Karatsuba is never selected +*/ +void bigint_mul(word z[], size_t z_size, word workspace[], + const word x[], size_t x_size, size_t x_sw, + const word y[], size_t y_size, size_t y_sw); + +void bigint_sqr(word z[], size_t z_size, word workspace[], + const word x[], size_t x_size, size_t x_sw); + +} + +#endif diff --git a/lib/math/mp/mp_generic/info.txt 
b/lib/math/mp/mp_generic/info.txt new file mode 100644 index 000000000..c87dd00ca --- /dev/null +++ b/lib/math/mp/mp_generic/info.txt @@ -0,0 +1,6 @@ +load_on dep + +<header:internal> +mp_madd.h +mp_asmi.h +</header:internal> diff --git a/lib/math/mp/mp_generic/mp_asmi.h b/lib/math/mp/mp_generic/mp_asmi.h new file mode 100644 index 000000000..018055696 --- /dev/null +++ b/lib/math/mp/mp_generic/mp_asmi.h @@ -0,0 +1,207 @@ +/* +* Lowest Level MPI Algorithms +* (C) 1999-2010 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/internal/mp_madd.h> + +namespace Botan { + +extern "C" { + +/* +* Word Addition: returns the sum of x, y and the incoming *carry truncated to one word, and leaves the carry out (0 or 1) in *carry +*/ +inline word word_add(word x, word y, word* carry) + { + word z = x + y; + word c1 = (z < x); + z += *carry; + *carry = c1 | (z < *carry); + return z; + } + +/* +* Eight Word Block Addition, Two Argument: each x[i] is replaced by the sum of x[i] and y[i] for i in 0..7 with the carry chained from word to word; returns the final carry +*/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + x[0] = word_add(x[0], y[0], &carry); + x[1] = word_add(x[1], y[1], &carry); + x[2] = word_add(x[2], y[2], &carry); + x[3] = word_add(x[3], y[3], &carry); + x[4] = word_add(x[4], y[4], &carry); + x[5] = word_add(x[5], y[5], &carry); + x[6] = word_add(x[6], y[6], &carry); + x[7] = word_add(x[7], y[7], &carry); + return carry; + } + +/* +* Eight Word Block Addition, Three Argument: each z[i] receives the sum of x[i] and y[i] for i in 0..7 with the carry chained from word to word; returns the final carry +*/ +inline word word8_add3(word z[8], const word x[8], + const word y[8], word carry) + { + z[0] = word_add(x[0], y[0], &carry); + z[1] = word_add(x[1], y[1], &carry); + z[2] = word_add(x[2], y[2], &carry); + z[3] = word_add(x[3], y[3], &carry); + z[4] = word_add(x[4], y[4], &carry); + z[5] = word_add(x[5], y[5], &carry); + z[6] = word_add(x[6], y[6], &carry); + z[7] = word_add(x[7], y[7], &carry); + return carry; + } + +/* +* Word Subtraction: returns x minus y minus the incoming *carry (used as a borrow) in wrapping word arithmetic, and leaves the borrow out in *carry +*/ +inline word word_sub(word x, word y, word* carry) + { + word t0 = x - y; + word c1 = (t0 > x); + word z = t0 - *carry; + *carry = c1 | (z > 
t0); + return z; + } + +/* +* Eight Word Block Subtraction, Two Argument: each x[i] is replaced by x[i] minus y[i] for i in 0..7 with the borrow chained from word to word; returns the final borrow +*/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + x[0] = word_sub(x[0], y[0], &carry); + x[1] = word_sub(x[1], y[1], &carry); + x[2] = word_sub(x[2], y[2], &carry); + x[3] = word_sub(x[3], y[3], &carry); + x[4] = word_sub(x[4], y[4], &carry); + x[5] = word_sub(x[5], y[5], &carry); + x[6] = word_sub(x[6], y[6], &carry); + x[7] = word_sub(x[7], y[7], &carry); + return carry; + } + +/* +* Eight Word Block Subtraction, Two Argument, Reversed: each x[i] is replaced by y[i] minus x[i] (operands swapped relative to word8_sub2); returns the final borrow +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + x[0] = word_sub(y[0], x[0], &carry); + x[1] = word_sub(y[1], x[1], &carry); + x[2] = word_sub(y[2], x[2], &carry); + x[3] = word_sub(y[3], x[3], &carry); + x[4] = word_sub(y[4], x[4], &carry); + x[5] = word_sub(y[5], x[5], &carry); + x[6] = word_sub(y[6], x[6], &carry); + x[7] = word_sub(y[7], x[7], &carry); + return carry; + } + +/* +* Eight Word Block Subtraction, Three Argument: each z[i] receives x[i] minus y[i] for i in 0..7 with the borrow chained from word to word; returns the final borrow +*/ +inline word word8_sub3(word z[8], const word x[8], + const word y[8], word carry) + { + z[0] = word_sub(x[0], y[0], &carry); + z[1] = word_sub(x[1], y[1], &carry); + z[2] = word_sub(x[2], y[2], &carry); + z[3] = word_sub(x[3], y[3], &carry); + z[4] = word_sub(x[4], y[4], &carry); + z[5] = word_sub(x[5], y[5], &carry); + z[6] = word_sub(x[6], y[6], &carry); + z[7] = word_sub(x[7], y[7], &carry); + return carry; + } + +/* +* Eight Word Block Linear Multiplication, In Place: each x[i] is replaced by the low word of x[i] times y added to the running carry word; returns the final carry word +*/ +inline word word8_linmul2(word x[8], word y, word carry) + { + x[0] = word_madd2(x[0], y, &carry); + x[1] = word_madd2(x[1], y, &carry); + x[2] = word_madd2(x[2], y, &carry); + x[3] = word_madd2(x[3], y, &carry); + x[4] = word_madd2(x[4], y, &carry); + x[5] = word_madd2(x[5], y, &carry); + x[6] = word_madd2(x[6], y, &carry); + x[7] = word_madd2(x[7], y, &carry); + return carry; + } + +/* +* Eight Word Block Linear Multiplication, Three Argument: each z[i] receives the low word of x[i] times y added to the running carry word; returns the final carry word +*/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + z[0] = 
word_madd2(x[0], y, &carry); + z[1] = word_madd2(x[1], y, &carry); + z[2] = word_madd2(x[2], y, &carry); + z[3] = word_madd2(x[3], y, &carry); + z[4] = word_madd2(x[4], y, &carry); + z[5] = word_madd2(x[5], y, &carry); + z[6] = word_madd2(x[6], y, &carry); + z[7] = word_madd2(x[7], y, &carry); + return carry; + } + +/* +* Eight Word Block Multiply/Add: each z[i] is replaced by the low word of x[i] times y added to the old z[i] and the running carry word, for i in 0..7; returns the final carry word +*/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + z[0] = word_madd3(x[0], y, z[0], &carry); + z[1] = word_madd3(x[1], y, z[1], &carry); + z[2] = word_madd3(x[2], y, z[2], &carry); + z[3] = word_madd3(x[3], y, z[3], &carry); + z[4] = word_madd3(x[4], y, z[4], &carry); + z[5] = word_madd3(x[5], y, z[5], &carry); + z[6] = word_madd3(x[6], y, z[6], &carry); + z[7] = word_madd3(x[7], y, z[7], &carry); + return carry; + } + +/* +* Multiply-Add Accumulator: adds the two-word product a*b into the three-word accumulator w2:w1:w0 (w0 is the least significant word) +*/ +inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b) + { + word carry = *w0; + *w0 = word_madd2(a, b, &carry); + *w1 += carry; + *w2 += (*w1 < carry) ? 
1 : 0; + } + +/* +* Multiply-Add Accumulator, Doubled: adds 2*a*b into the three-word accumulator w2:w1:w0; the two-word product is shifted left by one bit (the bit shifted out becomes a third word) before being added +*/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b) + { + word carry = 0; + a = word_madd2(a, b, &carry); + b = carry; + + word top = (b >> (BOTAN_MP_WORD_BITS-1)); + b <<= 1; + b |= (a >> (BOTAN_MP_WORD_BITS-1)); + a <<= 1; + + carry = 0; + *w0 = word_add(*w0, a, &carry); + *w1 = word_add(*w1, b, &carry); + *w2 = word_add(*w2, top, &carry); + } + +} + +} + +#endif diff --git a/lib/math/mp/mp_generic/mp_madd.h b/lib/math/mp/mp_generic/mp_madd.h new file mode 100644 index 000000000..17713f55f --- /dev/null +++ b/lib/math/mp/mp_generic/mp_madd.h @@ -0,0 +1,73 @@ +/* +* Lowest Level MPI Algorithms +* (C) 1999-2008,2013 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_MP_WORD_MULADD_H__ +#define BOTAN_MP_WORD_MULADD_H__ + +#include <botan/mp_types.h> + +namespace Botan { + +extern "C" { + +/* +* Word Multiply/Add: returns the low word of a*b added to *c and stores the high word of that result back into *c; uses the dword type when BOTAN_HAS_MP_DWORD is defined, otherwise mul64x64_128 with manual carry handling +*/ +inline word word_madd2(word a, word b, word* c) + { +#if defined(BOTAN_HAS_MP_DWORD) + const dword s = static_cast<dword>(a) * b + *c; + *c = static_cast<word>(s >> BOTAN_MP_WORD_BITS); + return static_cast<word>(s); +#else + static_assert(BOTAN_MP_WORD_BITS == 64, "Unexpected word size"); + + word hi = 0, lo = 0; + + mul64x64_128(a, b, &lo, &hi); + + lo += *c; + hi += (lo < *c); // carry? + + *c = hi; + return lo; +#endif + } + +/* +* Word Multiply/Add, Two Addends: returns the low word of a*b added to both c and *d, and stores the high word of that result back into *d +*/ +inline word word_madd3(word a, word b, word c, word* d) + { +#if defined(BOTAN_HAS_MP_DWORD) + const dword s = static_cast<dword>(a) * b + c + *d; + *d = static_cast<word>(s >> BOTAN_MP_WORD_BITS); + return static_cast<word>(s); +#else + static_assert(BOTAN_MP_WORD_BITS == 64, "Unexpected word size"); + + word hi = 0, lo = 0; + + mul64x64_128(a, b, &lo, &hi); + + lo += c; + hi += (lo < c); // carry? + + lo += *d; + hi += (lo < *d); // carry? 
+ + *d = hi; + return lo; +#endif + } + +} + +} + +#endif diff --git a/lib/math/mp/mp_karat.cpp b/lib/math/mp/mp_karat.cpp new file mode 100644 index 000000000..b549a05c8 --- /dev/null +++ b/lib/math/mp/mp_karat.cpp @@ -0,0 +1,303 @@ +/* +* Karatsuba Multiplication/Squaring +* (C) 1999-2010 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_asmi.h> +#include <botan/mem_ops.h> + +namespace Botan { + +namespace { + +static const size_t KARATSUBA_MULTIPLY_THRESHOLD = 32; +static const size_t KARATSUBA_SQUARE_THRESHOLD = 32; + +/* +* Karatsuba Multiplication Operation: z = x*y for two N-word operands (z receives 2*N words); when N is odd or below KARATSUBA_MULTIPLY_THRESHOLD a comba or schoolbook routine is used instead, otherwise the routine clears workspace[0 .. 2*N) and recurses on the N/2-word halves +*/ +void karatsuba_mul(word z[], const word x[], const word y[], size_t N, + word workspace[]) + { + if(N < KARATSUBA_MULTIPLY_THRESHOLD || N % 2) + { + if(N == 6) + return bigint_comba_mul6(z, x, y); + else if(N == 8) + return bigint_comba_mul8(z, x, y); + else if(N == 16) + return bigint_comba_mul16(z, x, y); + else + return bigint_simple_mul(z, x, N, y, N); + } + + const size_t N2 = N / 2; + + const word* x0 = x; + const word* x1 = x + N2; + const word* y0 = y; + const word* y1 = y + N2; + word* z0 = z; + word* z1 = z + N; + + const s32bit cmp0 = bigint_cmp(x0, N2, x1, N2); + const s32bit cmp1 = bigint_cmp(y1, N2, y0, N2); + + clear_mem(workspace, 2*N); + + //if(cmp0 && cmp1) + { + if(cmp0 > 0) + bigint_sub3(z0, x0, N2, x1, N2); + else + bigint_sub3(z0, x1, N2, x0, N2); + + if(cmp1 > 0) + bigint_sub3(z1, y1, N2, y0, N2); + else + bigint_sub3(z1, y0, N2, y1, N2); + + karatsuba_mul(workspace, z0, z1, N2, workspace+N); + } + + karatsuba_mul(z0, x0, y0, N2, workspace+N); + karatsuba_mul(z1, x1, y1, N2, workspace+N); + + const word ws_carry = bigint_add3_nc(workspace + N, z0, N, z1, N); + word z_carry = bigint_add2_nc(z + N2, N, workspace + N, N); + + z_carry += bigint_add2_nc(z + N + N2, N2, &ws_carry, 1); + bigint_add2_nc(z + N + N2, N2, &z_carry, 1); + + if((cmp0 == cmp1) || (cmp0 == 0) || (cmp1 == 0)) + bigint_add2(z 
+ N2, 2*N-N2, workspace, N); + else + bigint_sub2(z + N2, 2*N-N2, workspace, N); + } + +/* +* Karatsuba Squaring Operation: z = x*x for an N-word x, recursing on the N/2-word halves when N is even and at least KARATSUBA_SQUARE_THRESHOLD, else falling back to a comba or schoolbook squaring +*/ +void karatsuba_sqr(word z[], const word x[], size_t N, word workspace[]) + { + if(N < KARATSUBA_SQUARE_THRESHOLD || N % 2) + { + if(N == 6) + return bigint_comba_sqr6(z, x); + else if(N == 8) + return bigint_comba_sqr8(z, x); + else if(N == 16) + return bigint_comba_sqr16(z, x); + else + return bigint_simple_sqr(z, x, N); + } + + const size_t N2 = N / 2; + + const word* x0 = x; + const word* x1 = x + N2; + word* z0 = z; + word* z1 = z + N; + + const s32bit cmp = bigint_cmp(x0, N2, x1, N2); + + clear_mem(workspace, 2*N); + + //if(cmp) + { + if(cmp > 0) + bigint_sub3(z0, x0, N2, x1, N2); + else + bigint_sub3(z0, x1, N2, x0, N2); + + karatsuba_sqr(workspace, z0, N2, workspace+N); + } + + karatsuba_sqr(z0, x0, N2, workspace+N); + karatsuba_sqr(z1, x1, N2, workspace+N); + + const word ws_carry = bigint_add3_nc(workspace + N, z0, N, z1, N); + word z_carry = bigint_add2_nc(z + N2, N, workspace + N, N); + + z_carry += bigint_add2_nc(z + N + N2, N2, &ws_carry, 1); + bigint_add2_nc(z + N + N2, N2, &z_carry, 1); + + /* + * This is only actually required if cmp is != 0, however + * if cmp==0 then workspace[0:N] == 0 and avoiding the jump + * hides a timing channel. + */ + bigint_sub2(z + N2, 2*N-N2, workspace, N); + } + +/* +* Pick a good size for the Karatsuba multiply: returns an even N usable for both operands (x_sw <= N <= x_size, likewise for y, and 2*N <= z_size inside the search loop), or 0 when there is none +*/ +size_t karatsuba_size(size_t z_size, + size_t x_size, size_t x_sw, + size_t y_size, size_t y_sw) + { + if(x_sw > x_size || x_sw > y_size || y_sw > x_size || y_sw > y_size) + return 0; + + if(((x_size == x_sw) && (x_size % 2)) || + ((y_size == y_sw) && (y_size % 2))) + return 0; + + const size_t start = (x_sw > y_sw) ? x_sw : y_sw; + const size_t end = (x_size < y_size) ? 
x_size : y_size; + + if(start == end) + { + if(start % 2) + return 0; + return start; + } + + for(size_t j = start; j <= end; ++j) + { + if(j % 2) + continue; + + if(2*j > z_size) + return 0; + + if(x_sw <= j && j <= x_size && y_sw <= j && j <= y_size) + { + if(j % 4 == 2 && + (j+2) <= x_size && (j+2) <= y_size && 2*(j+2) <= z_size) + return j+2; + return j; + } + } + + return 0; + } + +/* +* Pick a good size for the Karatsuba squaring: scans upward from x_sw for the first even size j; gives 0 if x_sw equals x_size and is odd, or if 2*j exceeds z_size; prefers j rounded up to a multiple of 4 when that still fits in x_size and z_size +*/ +size_t karatsuba_size(size_t z_size, size_t x_size, size_t x_sw) + { + if(x_sw == x_size) + { + if(x_sw % 2) + return 0; + return x_sw; + } + + for(size_t j = x_sw; j <= x_size; ++j) + { + if(j % 2) + continue; + + if(2*j > z_size) + return 0; + + if(j % 4 == 2 && (j+2) <= x_size && 2*(j+2) <= z_size) + return j+2; + return j; + } + + return 0; + } + +} + +/* +* Multiplication Algorithm Dispatcher: uses bigint_linmul3 when x_sw or y_sw is 1, a fixed-size comba routine when both operands fit 4, 6, 8 or 16 words and the buffers are large enough, Karatsuba when both x_sw and y_sw reach KARATSUBA_MULTIPLY_THRESHOLD, a workspace is supplied and karatsuba_size finds a size, and bigint_simple_mul otherwise +*/ +void bigint_mul(word z[], size_t z_size, word workspace[], + const word x[], size_t x_size, size_t x_sw, + const word y[], size_t y_size, size_t y_sw) + { + if(x_sw == 1) + { + bigint_linmul3(z, y, y_sw, x[0]); + } + else if(y_sw == 1) + { + bigint_linmul3(z, x, x_sw, y[0]); + } + else if(x_sw <= 4 && x_size >= 4 && + y_sw <= 4 && y_size >= 4 && z_size >= 8) + { + bigint_comba_mul4(z, x, y); + } + else if(x_sw <= 6 && x_size >= 6 && + y_sw <= 6 && y_size >= 6 && z_size >= 12) + { + bigint_comba_mul6(z, x, y); + } + else if(x_sw <= 8 && x_size >= 8 && + y_sw <= 8 && y_size >= 8 && z_size >= 16) + { + bigint_comba_mul8(z, x, y); + } + else if(x_sw <= 16 && x_size >= 16 && + y_sw <= 16 && y_size >= 16 && z_size >= 32) + { + bigint_comba_mul16(z, x, y); + } + else if(x_sw < KARATSUBA_MULTIPLY_THRESHOLD || + y_sw < KARATSUBA_MULTIPLY_THRESHOLD || + !workspace) + { + bigint_simple_mul(z, x, x_sw, y, y_sw); + } + else + { + const size_t N = karatsuba_size(z_size, x_size, x_sw, y_size, y_sw); + + if(N) + karatsuba_mul(z, x, y, N, workspace); + else + bigint_simple_mul(z, x, x_sw, y, y_sw); + } + } + +/* +* Squaring Algorithm Dispatcher 
+*/ +void bigint_sqr(word z[], size_t z_size, word workspace[], + const word x[], size_t x_size, size_t x_sw) + { + if(x_sw == 1) + { + bigint_linmul3(z, x, x_sw, x[0]); + } + else if(x_sw <= 4 && x_size >= 4 && z_size >= 8) + { + bigint_comba_sqr4(z, x); + } + else if(x_sw <= 6 && x_size >= 6 && z_size >= 12) + { + bigint_comba_sqr6(z, x); + } + else if(x_sw <= 8 && x_size >= 8 && z_size >= 16) + { + bigint_comba_sqr8(z, x); + } + else if(x_sw <= 16 && x_size >= 16 && z_size >= 32) + { + bigint_comba_sqr16(z, x); + } + else if(x_size < KARATSUBA_SQUARE_THRESHOLD || !workspace) /* NOTE(review): bigint_mul compares x_sw, not x_size, against its threshold; confirm x_size is intended here */ + { + bigint_simple_sqr(z, x, x_sw); + } + else + { + const size_t N = karatsuba_size(z_size, x_size, x_sw); + + if(N) + karatsuba_sqr(z, x, N, workspace); + else + bigint_simple_sqr(z, x, x_sw); + } + } + +} diff --git a/lib/math/mp/mp_misc.cpp b/lib/math/mp/mp_misc.cpp new file mode 100644 index 000000000..0efd5fd19 --- /dev/null +++ b/lib/math/mp/mp_misc.cpp @@ -0,0 +1,79 @@ +/* +* MP Misc Functions +* (C) 1999-2008 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_madd.h> + +namespace Botan { + +extern "C" { + +/* +* Compare two MP integers: returns 1 if x > y, -1 if x < y and 0 if they are equal; x_size and y_size may differ, in which case the surplus high words of the longer operand only matter when nonzero +*/ +s32bit bigint_cmp(const word x[], size_t x_size, + const word y[], size_t y_size) + { + if(x_size < y_size) { return (-bigint_cmp(y, y_size, x, x_size)); } + + while(x_size > y_size) + { + if(x[x_size-1]) + return 1; + x_size--; + } + + for(size_t i = x_size; i > 0; --i) + { + if(x[i-1] > y[i-1]) + return 1; + if(x[i-1] < y[i-1]) + return -1; + } + + return 0; + } + +/* +* Do a 2-word/1-word Division: shift-and-subtract long division of n1:n0 by d, consuming one bit of n0 per iteration; n1 is first reduced mod d, so the returned word is the low word of the full quotient; d must be nonzero because n1 % d is evaluated +*/ +word bigint_divop(word n1, word n0, word d) + { + word high = n1 % d, quotient = 0; + + for(size_t i = 0; i != MP_WORD_BITS; ++i) + { + word high_top_bit = (high & MP_WORD_TOP_BIT); + + high <<= 1; + high |= (n0 >> (MP_WORD_BITS-1-i)) & 1; + quotient <<= 1; + + if(high_top_bit || high >= d) + { + high -= d; + quotient |= 1; + } + } + + return quotient; + } + 
+/* +* Do a 2-word/1-word Modulo: takes the quotient word from bigint_divop, multiplies it by d (keeping the low word only) and subtracts that from n0, all in wrapping word arithmetic +*/ +word bigint_modop(word n1, word n0, word d) + { + word z = bigint_divop(n1, n0, d); + word dummy = 0; + z = word_madd2(z, d, &dummy); + return (n0-z); + } + +} + +} diff --git a/lib/math/mp/mp_monty.cpp b/lib/math/mp/mp_monty.cpp new file mode 100644 index 000000000..095457dbe --- /dev/null +++ b/lib/math/mp/mp_monty.cpp @@ -0,0 +1,101 @@ +/* +* Montgomery Reduction +* (C) 1999-2011 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_madd.h> +#include <botan/internal/mp_asmi.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/* +* Montgomery Reduction Algorithm: for each i below p_size, adds y*p into z starting at word i, with y = z[i]*p_dash (presumably chosen so that word i becomes zero), and propagates the carry upward; afterwards the words z[p_size .. 2*p_size] and that value minus p are both placed in ws, and the final borrow picks which copy goes back into z[0 .. p_size] by indexing rather than branching; the remaining words of z are cleared +*/ +void bigint_monty_redc(word z[], + const word p[], size_t p_size, + word p_dash, word ws[]) + { + const size_t z_size = 2*(p_size+1); + + const size_t blocks_of_8 = p_size - (p_size % 8); + + for(size_t i = 0; i != p_size; ++i) + { + word* z_i = z + i; + + const word y = z_i[0] * p_dash; + + /* + bigint_linmul3(ws, p, p_size, y); + bigint_add2(z_i, z_size - i, ws, p_size+1); + */ + + word carry = 0; + + for(size_t j = 0; j != blocks_of_8; j += 8) + carry = word8_madd3(z_i + j, p + j, y, carry); + + for(size_t j = blocks_of_8; j != p_size; ++j) + z_i[j] = word_madd3(p[j], y, z_i[j], &carry); + + word z_sum = z_i[p_size] + carry; + carry = (z_sum < z_i[p_size]); + z_i[p_size] = z_sum; + + for(size_t j = p_size + 1; carry && j != z_size - i; ++j) + { + ++z_i[j]; + carry = !z_i[j]; + } + } + + word borrow = 0; + for(size_t i = 0; i != p_size; ++i) + ws[i] = word_sub(z[p_size + i], p[i], &borrow); + + ws[p_size] = word_sub(z[p_size+p_size], 0, &borrow); + + copy_mem(ws + p_size + 1, z + p_size, p_size + 1); + + copy_mem(z, ws + borrow*(p_size+1), p_size + 1); + clear_mem(z + p_size + 1, z_size - p_size - 1); + } + /* Montgomery multiplication: z = x*y via bigint_mul, then reduced in place by bigint_monty_redc; ws serves as workspace for both calls */ +void bigint_monty_mul(word z[], size_t z_size, + const word x[], size_t x_size, size_t x_sw, + const word y[], size_t y_size, 
size_t y_sw, + const word p[], size_t p_size, word p_dash, + word ws[]) + { + bigint_mul(&z[0], z_size, &ws[0], + &x[0], x_size, x_sw, + &y[0], y_size, y_sw); + + bigint_monty_redc(&z[0], + &p[0], p_size, p_dash, + &ws[0]); + } + /* Montgomery squaring: z = x*x via bigint_sqr, then reduced in place by bigint_monty_redc; ws serves as workspace for both calls */ +void bigint_monty_sqr(word z[], size_t z_size, + const word x[], size_t x_size, size_t x_sw, + const word p[], size_t p_size, word p_dash, + word ws[]) + { + bigint_sqr(&z[0], z_size, &ws[0], + &x[0], x_size, x_sw); + + bigint_monty_redc(&z[0], + &p[0], p_size, p_dash, + &ws[0]); + } + +} + +} diff --git a/lib/math/mp/mp_mulop.cpp b/lib/math/mp/mp_mulop.cpp new file mode 100644 index 000000000..0c79cc2ef --- /dev/null +++ b/lib/math/mp/mp_mulop.cpp @@ -0,0 +1,77 @@ +/* +* Simple O(N^2) Multiplication and Squaring +* (C) 1999-2008 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/internal/mp_madd.h> +#include <botan/internal/mp_asmi.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/* +* Simple O(N^2) Multiplication: schoolbook product; z is cleared and then receives x*y in its first x_size plus y_size words, one row per word of y, accumulated eight words at a time with word8_madd3 and the remainder with word_madd3 +*/ +void bigint_simple_mul(word z[], const word x[], size_t x_size, + const word y[], size_t y_size) + { + const size_t x_size_8 = x_size - (x_size % 8); + + clear_mem(z, x_size + y_size); + + for(size_t i = 0; i != y_size; ++i) + { + const word y_i = y[i]; + + word carry = 0; + + for(size_t j = 0; j != x_size_8; j += 8) + carry = word8_madd3(z + i + j, x + j, y_i, carry); + + for(size_t j = x_size_8; j != x_size; ++j) + z[i+j] = word_madd3(x[j], y_i, z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +/* +* Simple O(N^2) Squaring +* +* This is exactly the same algorithm as bigint_simple_mul, however +* because C/C++ compilers suck at alias analysis it is good to have +* the version where the compiler knows that x == y +* +* There is an O(n^1.5) squaring algorithm specified in Handbook of +* Applied Cryptography, chapter 14 (NOTE(review): the HAC squaring algorithm appears to remain quadratic and only roughly halve the word multiplications; confirm the O(n^1.5) figure) +* +*/ +void bigint_simple_sqr(word z[], const word x[], size_t x_size) + { + const size_t x_size_8 = x_size 
- (x_size % 8); + + clear_mem(z, 2*x_size); + + for(size_t i = 0; i != x_size; ++i) + { + const word x_i = x[i]; + word carry = 0; + + for(size_t j = 0; j != x_size_8; j += 8) + carry = word8_madd3(z + i + j, x + j, x_i, carry); + + for(size_t j = x_size_8; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x_i, z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +} + +} diff --git a/lib/math/mp/mp_shift.cpp b/lib/math/mp/mp_shift.cpp new file mode 100644 index 000000000..0531658ec --- /dev/null +++ b/lib/math/mp/mp_shift.cpp @@ -0,0 +1,138 @@ +/* +* MP Shift Algorithms +* (C) 1999-2007 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include <botan/internal/mp_core.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/* +* Single Operand Left Shift: shifts x left in place by word_shift whole words and then by bit_shift bits; when bit_shift is nonzero the word at index x_size plus word_shift is read and written as well, so the buffer must extend at least that far +*/ +void bigint_shl1(word x[], size_t x_size, size_t word_shift, size_t bit_shift) + { + if(word_shift) + { + for(size_t j = 1; j != x_size + 1; ++j) + x[(x_size - j) + word_shift] = x[x_size - j]; + clear_mem(x, word_shift); + } + + if(bit_shift) + { + word carry = 0; + for(size_t j = word_shift; j != x_size + word_shift + 1; ++j) + { + word temp = x[j]; + x[j] = (temp << bit_shift) | carry; + carry = (temp >> (MP_WORD_BITS - bit_shift)); + } + } + } + +/* +* Single Operand Right Shift: shifts x right in place by word_shift whole words and then by bit_shift bits; if word_shift exceeds x_size the whole buffer is zeroed instead; the bit shift walks from the top word down, four words per iteration while at least four remain +*/ +void bigint_shr1(word x[], size_t x_size, size_t word_shift, size_t bit_shift) + { + if(x_size < word_shift) + { + clear_mem(x, x_size); + return; + } + + if(word_shift) + { + copy_mem(x, x + word_shift, x_size - word_shift); + clear_mem(x + x_size - word_shift, word_shift); + } + + if(bit_shift) + { + word carry = 0; + + size_t top = x_size - word_shift; + + while(top >= 4) + { + word w = x[top-1]; + x[top-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = x[top-2]; + x[top-2] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = x[top-3]; + x[top-3] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = 
x[top-4]; + x[top-4] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + top -= 4; + } + + while(top) + { + word w = x[top-1]; + x[top-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + top--; + } + } + } + +/* +* Two Operand Left Shift: copies x into y starting at word index word_shift, then shifts that region of y left by bit_shift bits; words of y below word_shift are never written, and the word at index x_size plus word_shift is read before it is written when bit_shift is nonzero (presumably the caller passes a zeroed y; TODO confirm) +*/ +void bigint_shl2(word y[], const word x[], size_t x_size, + size_t word_shift, size_t bit_shift) + { + for(size_t j = 0; j != x_size; ++j) + y[j + word_shift] = x[j]; + if(bit_shift) + { + word carry = 0; + for(size_t j = word_shift; j != x_size + word_shift + 1; ++j) + { + word w = y[j]; + y[j] = (w << bit_shift) | carry; + carry = (w >> (MP_WORD_BITS - bit_shift)); + } + } + } + +/* +* Two Operand Right Shift: writes x shifted right by word_shift words and bit_shift bits into the first x_size minus word_shift words of y; returns without touching y when word_shift exceeds x_size +*/ +void bigint_shr2(word y[], const word x[], size_t x_size, + size_t word_shift, size_t bit_shift) + { + if(x_size < word_shift) return; + + for(size_t j = 0; j != x_size - word_shift; ++j) + y[j] = x[j + word_shift]; + if(bit_shift) + { + word carry = 0; + for(size_t j = x_size - word_shift; j > 0; --j) + { + word w = y[j-1]; + y[j-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + } + } + } + +} + +} diff --git a/lib/math/mp/mp_types.h b/lib/math/mp/mp_types.h new file mode 100644 index 000000000..60282fb83 --- /dev/null +++ b/lib/math/mp/mp_types.h @@ -0,0 +1,46 @@ +/* +* Low Level MPI Types: selects the word typedef from BOTAN_MP_WORD_BITS and, where a type of twice that width exists, a dword typedef together with BOTAN_HAS_MP_DWORD +* (C) 1999-2007 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_MPI_TYPES_H__ +#define BOTAN_MPI_TYPES_H__ + +#include <botan/types.h> +#include <botan/mul128.h> + +namespace Botan { + +#if (BOTAN_MP_WORD_BITS == 8) + typedef byte word; + typedef u16bit dword; + #define BOTAN_HAS_MP_DWORD +#elif (BOTAN_MP_WORD_BITS == 16) + typedef u16bit word; + typedef u32bit dword; + #define BOTAN_HAS_MP_DWORD +#elif (BOTAN_MP_WORD_BITS == 32) + typedef u32bit word; + typedef u64bit dword; + #define BOTAN_HAS_MP_DWORD +#elif (BOTAN_MP_WORD_BITS == 64) + typedef u64bit word; + + #if defined(BOTAN_TARGET_HAS_NATIVE_UINT128) 
+ typedef uint128_t dword; + #define BOTAN_HAS_MP_DWORD + #endif + +#else + #error BOTAN_MP_WORD_BITS must be 8, 16, 32, or 64 +#endif + +const word MP_WORD_MASK = ~static_cast<word>(0); +const word MP_WORD_TOP_BIT = static_cast<word>(1) << (8*sizeof(word) - 1); +const word MP_WORD_MAX = MP_WORD_MASK; + +} + +#endif diff --git a/lib/math/mp/mp_x86_32/info.txt b/lib/math/mp/mp_x86_32/info.txt new file mode 100644 index 000000000..f36abaf62 --- /dev/null +++ b/lib/math/mp/mp_x86_32/info.txt @@ -0,0 +1,18 @@ +load_on dep + +mp_bits 32 + +<header:internal> +mp_madd.h +mp_asmi.h +</header:internal> + +<arch> +x86_32 +</arch> + +<cc> +clang +gcc +icc +</cc> diff --git a/lib/math/mp/mp_x86_32/mp_asmi.h b/lib/math/mp/mp_x86_32/mp_asmi.h new file mode 100644 index 000000000..9b858c8d5 --- /dev/null +++ b/lib/math/mp/mp_x86_32/mp_asmi.h @@ -0,0 +1,240 @@ +/* +* Lowest Level MPI Algorithms +* (C) 1999-2010 Jack Lloyd +* 2006 Luca Piccarreta +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/internal/mp_madd.h> + +namespace Botan { + +extern "C" { + +/* +* Helper Macros for x86 Assembly: each macro expands to instruction strings that address word INDEX of the named operands at byte offset 4*INDEX; the ADDSUB macros borrow the carry operand as a scratch register, while the LINMUL and MULADD macros keep the running carry word in it and use eax and edx for the product +*/ +#ifndef ASM + #define ASM(x) x "\n\t" +#endif + +#define ADDSUB2_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[y]), %[carry]") \ + ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \ + +#define ADDSUB3_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]), %[carry]") \ + ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \ + ASM("movl %[carry], 4*" #INDEX "(%[z])") \ + +#define LINMUL_OP(WRITE_TO, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("movl %%edx,%[carry]") \ + ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])") + +#define MULADD_OP(IGNORED, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("addl 4*" #INDEX "(%[z]),%%eax") 
\
+        ASM("adcl $0,%%edx")                             \
+        ASM("movl %%edx,%[carry]")                       \
+        ASM("movl %%eax, 4*" #INDEX " (%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) /* expands MACRO(ARG, i) for i = 0..7 */ \
+        MACRO(ARG, 0) \
+        MACRO(ARG, 1) \
+        MACRO(ARG, 2) \
+        MACRO(ARG, 3) \
+        MACRO(ARG, 4) \
+        MACRO(ARG, 5) \
+        MACRO(ARG, 6) \
+        MACRO(ARG, 7)
+
+#define ADD_OR_SUBTRACT(CORE_CODE)     /* wraps an adc/sbb chain: carry word -> CF, run CORE_CODE, CF -> carry word */ \
+        ASM("rorl %[carry]")           /* rotate bit 0 of the carry word into CF */ \
+        CORE_CODE                      \
+        ASM("sbbl %[carry],%[carry]")  /* carry = 0 - CF: all ones if CORE_CODE left CF set, else 0 */ \
+        ASM("negl %[carry]")           /* negate, so carry ends up as 1 or 0 */
+
+/*
+* Word Addition: returns x + y + *carry and stores the carry-out (0 or 1) in *carry
+*/
+inline word word_add(word x, word y, word* carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]"))
+      : [x]"=r"(x), [carry]"=r"(*carry)
+      : "0"(x), [y]"rm"(y), "1"(*carry)
+      : "cc");
+   return x;
+   }
+
+/*
+* Eight Word Block Addition, Two Argument: x[i] += y[i] for i = 0..7 with carry propagation; returns the carry-out
+*/
+inline word word8_add2(word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Addition, Three Argument: z[i] = x[i] + y[i] for i = 0..7 with carry propagation; returns the carry-out
+*/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Word Subtraction: returns x - y - *carry and stores the borrow-out (0 or 1) in *carry
+*/
+inline word word_sub(word x, word y, word* carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]"))
+      : [x]"=r"(x), [carry]"=r"(*carry)
+      : "0"(x), [y]"rm"(y), "1"(*carry)
+      : "cc");
+   return x;
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument: x[i] -= y[i] for i = 0..7 with borrow propagation; returns the borrow-out
+*/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument, reversed: x[i] = y[i] - x[i] (the operands are swapped in the constraint list)
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+      : [carry]"=r"(carry)
+      : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Subtraction, Three Argument: z[i] = x[i] - y[i] for i = 0..7 with borrow propagation; returns the borrow-out
+*/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Linear Multiplication, in place: x[i] = low word of x[i]*y + carry, carry = high word; returns the final carry
+*/
+inline word word8_linmul2(word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(LINMUL_OP, "x")
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%eax", "%edx"); /* "memory": the asm stores into x[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Eight Word Block Linear Multiplication: z[i] = low word of x[i]*y + carry, carry = high word; returns the final carry
+*/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(LINMUL_OP, "z")
+      : [carry]"=r"(carry)
+      : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%eax", "%edx"); /* "memory": the asm stores into z[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Eight Word Block Multiply/Add: z[i] = low word of x[i]*y + z[i] + carry, carry = high word; returns the final carry
+*/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(MULADD_OP, "")
+      : [carry]"=r"(carry)
+      : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%eax", "%edx"); /* "memory": the asm reads and stores z[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Multiply-Add Accumulator: adds the two-word product x*y to the three-word accumulator (*w2,*w1,*w0)
+*/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+   {
+   asm(
+      ASM("mull %[y]")
+
+      ASM("addl %[x],%[w0]")
+      ASM("adcl %[y],%[w1]")
+      ASM("adcl $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2), "=a"(x), "=d"(y) /* mull overwrites eax/edx, so tie the x/y inputs to outputs */
+      : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+/*
+* Multiply-Add Accumulator, doubled: adds the product x*y twice (i.e. 2*x*y) to the accumulator (*w2,*w1,*w0)
+*/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+   {
+   asm(
+      ASM("mull %[y]")
+
+      ASM("addl %[x],%[w0]")
+      ASM("adcl %[y],%[w1]")
+      ASM("adcl $0,%[w2]")
+
+      ASM("addl %[x],%[w0]")
+      ASM("adcl %[y],%[w1]")
+      ASM("adcl $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2), "=a"(x), "=d"(y) /* mull overwrites eax/edx, so tie the x/y inputs to outputs */
+      : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+}
+
+}
+
+#endif
diff --git a/lib/math/mp/mp_x86_32/mp_madd.h
b/lib/math/mp/mp_x86_32/mp_madd.h
new file mode 100644
index 000000000..9d60c721d
--- /dev/null
+++ b/lib/math/mp/mp_x86_32/mp_madd.h
@@ -0,0 +1,67 @@
+/*
+* Lowest Level MPI Algorithms (single-word multiply/add, 32-bit x86 inline assembly)
+* (C) 1999-2008 Jack Lloyd
+*     2006 Luca Piccarreta
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_WORD_MULADD_H__
+#define BOTAN_MP_WORD_MULADD_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 32)
+   #error The mp_x86_32 module requires that BOTAN_MP_WORD_BITS == 32
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*
+* Helper Macros for x86 Assembly: ASM(x) appends "\n\t" so that adjacent instruction strings concatenate into one asm template
+*/
+#define ASM(x) x "\n\t"
+
+/*
+* Word Multiply/Add: returns the low word of a*b + *c and stores the high word in *c
+*/
+inline word word_madd2(word a, word b, word* c)
+   {
+   asm(
+      ASM("mull %[b]")
+      ASM("addl %[c],%[a]")
+      ASM("adcl $0,%[carry]")
+
+      : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+      : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+   return a;
+   }
+
+/*
+* Word Multiply/Add: returns the low word of a*b + c + *d and stores the high word in *d
+*/
+inline word word_madd3(word a, word b, word c, word* d)
+   {
+   asm(
+      ASM("mull %[b]")
+
+      ASM("addl %[c],%[a]")
+      ASM("adcl $0,%[carry]")
+
+      ASM("addl %[d],%[a]")
+      ASM("adcl $0,%[carry]")
+
+      : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+      : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+   return a;
+   }
+
+}
+
+}
+
+#endif
diff --git a/lib/math/mp/mp_x86_32_msvc/info.txt b/lib/math/mp/mp_x86_32_msvc/info.txt
new file mode 100644
index 000000000..3029d6a61
--- /dev/null
+++ b/lib/math/mp/mp_x86_32_msvc/info.txt
@@ -0,0 +1,16 @@
+mp_bits 32
+
+load_on dep
+
+<header:internal>
+mp_generic:mp_madd.h
+mp_asmi.h
+</header:internal>
+
+<arch>
+x86_32
+</arch>
+
+<cc>
+msvc
+</cc>
diff --git a/lib/math/mp/mp_x86_32_msvc/mp_asmi.h b/lib/math/mp/mp_x86_32_msvc/mp_asmi.h
new file mode 100644
index 000000000..ef149c920
--- /dev/null
+++ b/lib/math/mp/mp_x86_32_msvc/mp_asmi.h
@@ -0,0 +1,542 @@
+/*
+* Lowest Level MPI Algorithms (MSVC __asm blocks, 32-bit x86)
+* (C) 1999-2010 Jack Lloyd
+*     2006 Luca Piccarreta
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define
BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/internal/mp_madd.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*
+* Word Addition: returns x + y + *carry and stores the carry-out (0 or 1) in *carry
+*/
+inline word word_add(word x, word y, word* carry)
+   {
+   word z = x + y;
+   word c1 = (z < x); // the first addition wrapped around
+   z += *carry;
+   *carry = c1 | (z < *carry); // adding the carry-in wrapped around
+   return z;
+   }
+
+/*
+* Eight Word Block Addition, Two Argument: x[i] += y[i] for i = 0..7; the carry-out (0 or 1) is left in eax as the result
+*/
+inline word word8_add2(word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edx,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //0 - carry: sets CF iff carry is non-zero
+      mov eax,[esi]
+      adc [edx],eax
+      mov eax,[esi+4]
+      adc [edx+4],eax
+      mov eax,[esi+8]
+      adc [edx+8],eax
+      mov eax,[esi+12]
+      adc [edx+12],eax
+      mov eax,[esi+16]
+      adc [edx+16],eax
+      mov eax,[esi+20]
+      adc [edx+20],eax
+      mov eax,[esi+24]
+      adc [edx+24],eax
+      mov eax,[esi+28]
+      adc [edx+28],eax
+      sbb eax,eax //eax = 0 - CF
+      neg eax //eax = final carry as 0 or 1
+      }
+   }
+
+/*
+* Eight Word Block Addition, Three Argument: z[i] = x[i] + y[i] for i = 0..7; the carry-out (0 or 1) is left in eax as the result
+*/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      mov ebx,[z]
+      xor eax,eax
+      sub eax,[carry] //0 - carry: sets CF iff carry is non-zero
+      mov eax,[edi]
+      adc eax,[esi]
+      mov [ebx],eax
+
+      mov eax,[edi+4]
+      adc eax,[esi+4]
+      mov [ebx+4],eax
+
+      mov eax,[edi+8]
+      adc eax,[esi+8]
+      mov [ebx+8],eax
+
+      mov eax,[edi+12]
+      adc eax,[esi+12]
+      mov [ebx+12],eax
+
+      mov eax,[edi+16]
+      adc eax,[esi+16]
+      mov [ebx+16],eax
+
+      mov eax,[edi+20]
+      adc eax,[esi+20]
+      mov [ebx+20],eax
+
+      mov eax,[edi+24]
+      adc eax,[esi+24]
+      mov [ebx+24],eax
+
+      mov eax,[edi+28]
+      adc eax,[esi+28]
+      mov [ebx+28],eax
+
+      sbb eax,eax //eax = 0 - CF
+      neg eax //eax = final carry as 0 or 1
+      }
+   }
+
+/*
+* Word Subtraction: returns x - y - *carry and stores the borrow-out (0 or 1) in *carry
+*/
+inline word word_sub(word x, word y, word* carry)
+   {
+   word t0 = x - y;
+   word c1 = (t0 > x); // x - y wrapped around
+   word z = t0 - *carry;
+   *carry = c1 | (z > t0); // subtracting the borrow-in wrapped around
+   return z;
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument: x[i] -= y[i] for i = 0..7; the borrow-out (0 or 1) is left in eax as the result
+*/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //0 - carry: sets CF iff carry is non-zero
+      mov eax,[edi]
+
sbb eax,[esi]
+      mov [edi],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [edi+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [edi+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [edi+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [edi+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [edi+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [edi+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [edi+28],eax
+      sbb eax,eax
+      neg eax
+      }
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument, reversed: x[i] = y[i] - x[i] for i = 0..7; returns the borrow-out
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+   {
+   x[0] = word_sub(y[0], x[0], &carry);
+   x[1] = word_sub(y[1], x[1], &carry);
+   x[2] = word_sub(y[2], x[2], &carry);
+   x[3] = word_sub(y[3], x[3], &carry);
+   x[4] = word_sub(y[4], x[4], &carry);
+   x[5] = word_sub(y[5], x[5], &carry);
+   x[6] = word_sub(y[6], x[6], &carry);
+   x[7] = word_sub(y[7], x[7], &carry);
+   return carry;
+   }
+
+
+/*
+* Eight Word Block Subtraction, Three Argument: z[i] = x[i] - y[i] for i = 0..7; the borrow-out (0 or 1) is left in eax as the result
+*/
+inline word word8_sub3(word z[8], const word x[8],
+                       const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //0 - carry: sets CF iff carry is non-zero
+      mov ebx,[z] //mov leaves the flags untouched, so CF survives
+      mov eax,[edi]
+      sbb eax,[esi]
+      mov [ebx],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [ebx+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [ebx+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [ebx+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [ebx+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [ebx+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [ebx+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [ebx+28],eax
+      sbb eax,eax //eax = 0 - CF
+      neg eax //eax = final borrow as 0 or 1
+      }
+   }
+
+/*
+* Eight Word Block Linear Multiplication, in place: x[i] = low word of x[i]*y + carry; the final carry is left in eax as the result
+*/
+inline word word8_linmul2(word x[8], word y, word carry)
+   {
+   __asm {
+      mov esi,[x]
+      mov eax,[esi] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,[carry] //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi],eax //store lo word
+
+      mov eax,[esi+4] //load a
+      mul [y]
//edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+4],eax //store lo word
+
+      mov eax,[esi+8] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+8],eax //store lo word
+
+      mov eax,[esi+12] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+12],eax //store lo word
+
+      mov eax,[esi+16] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+16],eax //store lo word
+
+      mov eax,[esi+20] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+20],eax //store lo word
+
+      mov eax,[esi+24] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [esi+24],eax //store lo word
+
+      mov eax,[esi+28] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov [esi+28],eax //store lo word
+
+      mov eax,edx //final carry is left in eax as the result
+      }
+   }
+
+/*
+* Eight Word Block Multiply/Add: z[i] = low word of x[i]*y + z[i] + carry for i = 0..7; the final carry is left in eax as the result
+*/
+inline word word8_muladd(word z[8], const word x[8],
+                         word y, word carry)
+   {
+   __asm {
+      mov esi,[x]
+      mov ebx,[y]
+      mov edi,[z]
+      mov eax,[esi] //load a
+      mul ebx //edx(hi):eax(lo)=a*b
+      add eax,[carry] //sum lo carry
+      adc edx,0 //sum hi carry
+      add eax,[edi] //sum lo z
+      adc edx,0 //sum hi z
+      mov ecx,edx //carry for next block = hi z
+      mov [edi],eax //save lo z
+
+      mov eax,[esi+4]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+4]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+4],eax
+
+      mov eax,[esi+8]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+8]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+8],eax
+
+      mov eax,[esi+12]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+12]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+12],eax
+
+      mov eax,[esi+16]
+      mul ebx
+
add eax,ecx
+      adc edx,0
+      add eax,[edi+16]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+16],eax
+
+      mov eax,[esi+20]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+20]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+20],eax
+
+      mov eax,[esi+24]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+24]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+24],eax
+
+      mov eax,[esi+28]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+28]
+      adc edx,0
+      mov [edi+28],eax
+      mov eax,edx //final carry is left in eax as the result
+      }
+   }
+
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry) // z[i] = low word of x[i]*y + carry, i = 0..7; final carry left in eax
+   {
+   __asm {
+#if 0
+      //disabled MMX (pmuludq) variant, kept for reference: it's slower!!!
+      mov edx,[z]
+      mov eax,[x]
+      movd mm7,[y]
+
+      movd mm0,[eax]
+      movd mm1,[eax+4]
+      movd mm2,[eax+8]
+      pmuludq mm0,mm7
+      pmuludq mm1,mm7
+      pmuludq mm2,mm7
+
+      movd mm6,[carry]
+      paddq mm0,mm6
+      movd [edx],mm0
+
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+4],mm1
+
+      movd mm3,[eax+12]
+      psrlq mm1,32
+      paddq mm2,mm1
+      movd [edx+8],mm2
+
+      pmuludq mm3,mm7
+      movd mm4,[eax+16]
+      psrlq mm2,32
+      paddq mm3,mm2
+      movd [edx+12],mm3
+
+      pmuludq mm4,mm7
+      movd mm5,[eax+20]
+      psrlq mm3,32
+      paddq mm4,mm3
+      movd [edx+16],mm4
+
+      pmuludq mm5,mm7
+      movd mm0,[eax+24]
+      psrlq mm4,32
+      paddq mm5,mm4
+      movd [edx+20],mm5
+
+      pmuludq mm0,mm7
+      movd mm1,[eax+28]
+      psrlq mm5,32
+      paddq mm0,mm5
+      movd [edx+24],mm0
+
+      pmuludq mm1,mm7
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+28],mm1
+      psrlq mm1,32
+
+      movd eax,mm1
+      emms
+#else
+      mov edi,[z]
+      mov esi,[x]
+      mov eax,[esi] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,[carry] //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi],eax //store lo word
+
+      mov eax,[esi+4] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+4],eax //store lo word
+
+      mov eax,[esi+8] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+8],eax //store lo word
+
+      mov eax,[esi+12] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add
eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+12],eax //store lo word
+
+      mov eax,[esi+16] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+16],eax //store lo word
+
+      mov eax,[esi+20] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+20],eax //store lo word
+
+      mov eax,[esi+24] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov ecx,edx //keep hi word as the next carry
+      mov [edi+24],eax //store lo word
+
+      mov eax,[esi+28] //load a
+      mul [y] //edx(hi):eax(lo)=a*b
+      add eax,ecx //sum lo carry
+      adc edx,0 //sum hi carry
+      mov [edi+28],eax //store lo word
+      mov eax,edx //final carry is left in eax as the result
+#endif
+      }
+   }
+
+/*
+* Eight Word Block Multiply/Add: z[i] = word_madd3(x[i], y, z[i], &carry) for i = 0..7; returns the final carry
+*/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+   {
+   z[0] = word_madd3(x[0], y, z[0], &carry);
+   z[1] = word_madd3(x[1], y, z[1], &carry);
+   z[2] = word_madd3(x[2], y, z[2], &carry);
+   z[3] = word_madd3(x[3], y, z[3], &carry);
+   z[4] = word_madd3(x[4], y, z[4], &carry);
+   z[5] = word_madd3(x[5], y, z[5], &carry);
+   z[6] = word_madd3(x[6], y, z[6], &carry);
+   z[7] = word_madd3(x[7], y, z[7], &carry);
+   return carry;
+   }
+
+/*
+* Multiply-Add Accumulator: folds a*b into (*w2,*w1,*w0) via word_madd2, then ripples the carry into *w1 and *w2
+*/
+inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
+   {
+   word carry = *w0;
+   *w0 = word_madd2(a, b, &carry);
+   *w1 += carry;
+   *w2 += (*w1 < carry) ?
1 : 0;
+   }
+
+/*
+* Multiply-Add Accumulator, doubled: shifts the word_madd2(a, b) result left by one bit and adds it to (*w2,*w1,*w0)
+*/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
+   {
+   word carry = 0;
+   a = word_madd2(a, b, &carry);
+   b = carry;
+
+   word top = (b >> (BOTAN_MP_WORD_BITS-1)); // bit shifted out of the high word
+   b <<= 1;
+   b |= (a >> (BOTAN_MP_WORD_BITS-1)); // bit shifted out of the low word moves into the high word
+   a <<= 1;
+
+   carry = 0;
+   *w0 = word_add(*w0, a, &carry);
+   *w1 = word_add(*w1, b, &carry);
+   *w2 = word_add(*w2, top, &carry);
+   }
+
+}
+
+}
+
+#endif
diff --git a/lib/math/mp/mp_x86_64/info.txt b/lib/math/mp/mp_x86_64/info.txt
new file mode 100644
index 000000000..75c42ddc1
--- /dev/null
+++ b/lib/math/mp/mp_x86_64/info.txt
@@ -0,0 +1,18 @@
+load_on dep
+
+mp_bits 64
+
+<header:internal>
+mp_madd.h
+mp_asmi.h
+</header:internal>
+
+<arch>
+x86_64
+</arch>
+
+<cc>
+clang
+gcc
+icc
+</cc>
diff --git a/lib/math/mp/mp_x86_64/mp_asmi.h b/lib/math/mp/mp_x86_64/mp_asmi.h
new file mode 100644
index 000000000..b2f1202e3
--- /dev/null
+++ b/lib/math/mp/mp_x86_64/mp_asmi.h
@@ -0,0 +1,248 @@
+/*
+* Lowest Level MPI Algorithms (GCC-style inline assembly, x86-64)
+* (C) 1999-2010 Jack Lloyd
+*     2006 Luca Piccarreta
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/internal/mp_madd.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*
+* Helper Macros for x86-64 Assembly: each *_OP macro expands to the asm text for the 64-bit word at byte offset 8*INDEX
+*/
+#ifndef ASM
+  #define ASM(x) x "\n\t"
+#endif
+
+#define ADDSUB2_OP(OPERATION, INDEX) /* x[INDEX] = x[INDEX] OPERATION y[INDEX]; %[carry] is used as scratch */ \
+        ASM("movq 8*" #INDEX "(%[y]), %[carry]")         \
+        ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])")   \
+
+#define ADDSUB3_OP(OPERATION, INDEX) /* z[INDEX] = x[INDEX] OPERATION y[INDEX]; %[carry] is used as scratch */ \
+        ASM("movq 8*" #INDEX "(%[x]), %[carry]")         \
+        ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]")   \
+        ASM("movq %[carry], 8*" #INDEX "(%[z])")         \
+
+#define LINMUL_OP(WRITE_TO, INDEX) /* WRITE_TO[INDEX] = low word of x[INDEX]*y + carry; carry = high word */ \
+        ASM("movq 8*" #INDEX "(%[x]),%%rax")             \
+        ASM("mulq %[y]")                                 \
+        ASM("addq %[carry],%%rax")                       \
+        ASM("adcq $0,%%rdx")                             \
+        ASM("movq %%rdx,%[carry]")                       \
+        ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) /* z[INDEX] = low word of x[INDEX]*y + carry + z[INDEX]; carry = high word */ \
+        ASM("movq 8*" #INDEX
"(%[x]),%%rax")             \
+        ASM("mulq %[y]")                                 \
+        ASM("addq %[carry],%%rax")                       \
+        ASM("adcq $0,%%rdx")                             \
+        ASM("addq 8*" #INDEX "(%[z]),%%rax")             \
+        ASM("adcq $0,%%rdx")                             \
+        ASM("movq %%rdx,%[carry]")                       \
+        ASM("movq %%rax, 8*" #INDEX " (%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) /* expands MACRO(ARG, i) for i = 0..7 */ \
+        MACRO(ARG, 0) \
+        MACRO(ARG, 1) \
+        MACRO(ARG, 2) \
+        MACRO(ARG, 3) \
+        MACRO(ARG, 4) \
+        MACRO(ARG, 5) \
+        MACRO(ARG, 6) \
+        MACRO(ARG, 7)
+
+#define ADD_OR_SUBTRACT(CORE_CODE)     /* wraps an adc/sbb chain: carry word -> CF, run CORE_CODE, CF -> carry word */ \
+        ASM("rorq %[carry]")           /* rotate bit 0 of the carry word into CF */ \
+        CORE_CODE                      \
+        ASM("sbbq %[carry],%[carry]")  /* carry = 0 - CF: all ones if CORE_CODE left CF set, else 0 */ \
+        ASM("negq %[carry]")           /* negate, so carry ends up as 1 or 0 */
+
+/*
+* Word Addition: returns x + y + *carry and stores the carry-out (0 or 1) in *carry
+*/
+inline word word_add(word x, word y, word* carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]"))
+      : [x]"=r"(x), [carry]"=r"(*carry)
+      : "0"(x), [y]"rm"(y), "1"(*carry)
+      : "cc");
+   return x;
+   }
+
+/*
+* Eight Word Block Addition, Two Argument: x[i] += y[i] for i = 0..7 with carry propagation; returns the carry-out
+*/
+inline word word8_add2(word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Addition, Three Argument: z[i] = x[i] + y[i] for i = 0..7 with carry propagation; returns the carry-out
+*/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Word Subtraction: returns x - y - *carry and stores the borrow-out (0 or 1) in *carry
+*/
+inline word word_sub(word x, word y, word* carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]"))
+      : [x]"=r"(x), [carry]"=r"(*carry)
+      : "0"(x), [y]"rm"(y), "1"(*carry)
+      : "cc");
+   return x;
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument: x[i] -= y[i] for i = 0..7 with borrow propagation; returns the borrow-out
+*/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument, reversed: x[i] = y[i] - x[i] (the operands are swapped in the constraint list)
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word
carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+      : [carry]"=r"(carry)
+      : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Subtraction, Three Argument: z[i] = x[i] - y[i] for i = 0..7 with borrow propagation; returns the borrow-out
+*/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   asm(
+      ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+      : "cc", "memory");
+   return carry;
+   }
+
+/*
+* Eight Word Block Linear Multiplication, in place: x[i] = low word of x[i]*y + carry, carry = high word; returns the final carry
+*/
+inline word word8_linmul2(word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(LINMUL_OP, "x")
+      : [carry]"=r"(carry)
+      : [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%rax", "%rdx"); /* "memory": the asm stores into x[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Eight Word Block Linear Multiplication: z[i] = low word of x[i]*y + carry, carry = high word; returns the final carry
+*/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(LINMUL_OP, "z")
+      : [carry]"=r"(carry)
+      : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%rax", "%rdx"); /* "memory": the asm stores into z[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Eight Word Block Multiply/Add: z[i] = low word of x[i]*y + z[i] + carry, carry = high word; returns the final carry
+*/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+   {
+   asm(
+      DO_8_TIMES(MULADD_OP, "")
+      : [carry]"=r"(carry)
+      : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+      : "cc", "memory", "%rax", "%rdx"); /* "memory": the asm reads and stores z[], which is not an output operand */
+   return carry;
+   }
+
+/*
+* Multiply-Add Accumulator: adds the two-word product x*y to the three-word accumulator (*w2,*w1,*w0)
+*/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+   {
+   asm(
+      ASM("mulq %[y]")
+
+      ASM("addq %[x],%[w0]")
+      ASM("adcq %[y],%[w1]")
+      ASM("adcq $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2), "=a"(x), "=d"(y) /* mulq overwrites rax/rdx, so tie the x/y inputs to outputs */
+      : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+/*
+* Multiply-Add Accumulator, doubled: adds the product x*y twice (i.e. 2*x*y) to the accumulator (*w2,*w1,*w0)
+*/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+   {
+   asm(
+      ASM("mulq %[y]")
+
+      ASM("addq %[x],%[w0]")
+      ASM("adcq %[y],%[w1]")
+      ASM("adcq $0,%[w2]")
+
+      ASM("addq %[x],%[w0]")
+      ASM("adcq %[y],%[w1]")
+      ASM("adcq $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2), "=a"(x), "=d"(y) /* mulq overwrites rax/rdx, so tie the x/y inputs to outputs */
+      :
[x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+
+#undef ASM
+#undef DO_8_TIMES
+#undef ADD_OR_SUBTRACT
+#undef ADDSUB2_OP
+#undef ADDSUB3_OP
+#undef LINMUL_OP
+#undef MULADD_OP
+
+}
+
+}
+#endif
diff --git a/lib/math/mp/mp_x86_64/mp_madd.h b/lib/math/mp/mp_x86_64/mp_madd.h
new file mode 100644
index 000000000..4c0d79931
--- /dev/null
+++ b/lib/math/mp/mp_x86_64/mp_madd.h
@@ -0,0 +1,69 @@
+/*
+* Lowest Level MPI Algorithms (single-word multiply/add, x86-64 inline assembly)
+* (C) 1999-2008 Jack Lloyd
+*     2006 Luca Piccarreta
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_MP_WORD_MULADD_H__
+#define BOTAN_MP_WORD_MULADD_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 64)
+   #error The mp_x86_64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*
+* Helper Macros for x86-64 Assembly: ASM(x) appends "\n\t" so that adjacent instruction strings concatenate into one asm template
+*/
+#define ASM(x) x "\n\t"
+
+/*
+* Word Multiply/Add: returns the low word of a*b + *c and stores the high word in *c
+*/
+inline word word_madd2(word a, word b, word* c)
+   {
+   asm(
+      ASM("mulq %[b]")
+      ASM("addq %[c],%[a]")
+      ASM("adcq $0,%[carry]")
+
+      : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+      : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+   return a;
+   }
+
+/*
+* Word Multiply/Add: returns the low word of a*b + c + *d and stores the high word in *d
+*/
+inline word word_madd3(word a, word b, word c, word* d)
+   {
+   asm(
+      ASM("mulq %[b]")
+
+      ASM("addq %[c],%[a]")
+      ASM("adcq $0,%[carry]")
+
+      ASM("addq %[d],%[a]")
+      ASM("adcq $0,%[carry]")
+
+      : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+      : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+   return a;
+   }
+
+#undef ASM
+
+}
+
+}
+
+#endif |