diff options
author | lloyd <[email protected]> | 2008-09-30 22:41:49 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2008-09-30 22:41:49 +0000 |
commit | 13d08cbe978c4cd0de01aa0120c39470508cbbcb (patch) | |
tree | ff93739131cbca0dbdf23a31cd4b7611faf5aa6e /src/math/bigint | |
parent | 8854fe339f2e1f81091ba65c042824e8cc62cbbc (diff) |
Rearrange BigInt directories:
math/bigint - BigInt implementation
math/numbertheory - Math stuff built on top of BigInt
Coming soon: math/gfp (parts of pk/ecdsa)
Update deps in the pk files
Diffstat (limited to 'src/math/bigint')
41 files changed, 5677 insertions, 0 deletions
diff --git a/src/math/bigint/big_code.cpp b/src/math/bigint/big_code.cpp new file mode 100644 index 000000000..e7a5e4946 --- /dev/null +++ b/src/math/bigint/big_code.cpp @@ -0,0 +1,152 @@ +/************************************************* +* BigInt Encoding/Decoding Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <botan/divide.h> +#include <botan/charset.h> +#include <botan/hex.h> + +namespace Botan { + +/************************************************* +* Encode a BigInt * +*************************************************/ +void BigInt::encode(byte output[], const BigInt& n, Base base) + { + if(base == Binary) + n.binary_encode(output); + else if(base == Hexadecimal) + { + SecureVector<byte> binary(n.encoded_size(Binary)); + n.binary_encode(binary); + for(u32bit j = 0; j != binary.size(); ++j) + Hex_Encoder::encode(binary[j], output + 2*j); + } + else if(base == Octal) + { + BigInt copy = n; + const u32bit output_size = n.encoded_size(Octal); + for(u32bit j = 0; j != output_size; ++j) + { + output[output_size - 1 - j] = Charset::digit2char(copy % 8); + copy /= 8; + } + } + else if(base == Decimal) + { + BigInt copy = n; + BigInt remainder; + copy.set_sign(Positive); + const u32bit output_size = n.encoded_size(Decimal); + for(u32bit j = 0; j != output_size; ++j) + { + divide(copy, 10, copy, remainder); + output[output_size - 1 - j] = + Charset::digit2char(remainder.word_at(0)); + if(copy.is_zero()) + break; + } + } + else + throw Invalid_Argument("Unknown BigInt encoding method"); + } + +/************************************************* +* Encode a BigInt * +*************************************************/ +SecureVector<byte> BigInt::encode(const BigInt& n, Base base) + { + SecureVector<byte> output(n.encoded_size(base)); + encode(output, n, base); + if(base != Binary) + for(u32bit j = 0; j != output.size(); ++j) + if(output[j] == 0) + output[j] = '0'; + return output; + } + +/************************************************* +* Encode a BigInt, with leading 0s if needed * +*************************************************/ +SecureVector<byte> BigInt::encode_1363(const BigInt& n, u32bit bytes) + { + const u32bit n_bytes = n.bytes(); + if(n_bytes > bytes) + throw Encoding_Error("encode_1363: n is too large to encode properly"); + + const u32bit leading_0s = bytes - n_bytes; + + SecureVector<byte> output(bytes); + encode(output + leading_0s, n, Binary); + return output; + } + +/************************************************* +* Decode a BigInt * +*************************************************/ +BigInt BigInt::decode(const MemoryRegion<byte>& buf, Base base) + { + return BigInt::decode(buf, buf.size(), base); + } + +/************************************************* +* Decode a BigInt * +*************************************************/ +BigInt BigInt::decode(const byte buf[], u32bit length, Base base) + { + BigInt r; + if(base == Binary) + r.binary_decode(buf, length); + else if(base == Hexadecimal) + { + SecureVector<byte> hex; + for(u32bit j = 0; j != length; ++j) + if(Hex_Decoder::is_valid(buf[j])) + hex.append(buf[j]); + + u32bit offset = (hex.size() % 2); + SecureVector<byte> binary(hex.size() / 2 + offset); + + if(offset) + { + byte temp[2] = { '0', hex[0] }; + binary[0] = Hex_Decoder::decode(temp); + } + + for(u32bit j = offset; j != binary.size(); ++j) + binary[j] = Hex_Decoder::decode(hex+2*j-offset); + r.binary_decode(binary, binary.size()); + } + else if(base == Decimal || base == Octal) + { + const u32bit RADIX = ((base == Decimal) ? 10 : 8); + for(u32bit j = 0; j != length; ++j) + { + if(Charset::is_space(buf[j])) + continue; + + if(!Charset::is_digit(buf[j])) + throw Invalid_Argument("BigInt::decode: " + "Invalid character in decimal input"); + + byte x = Charset::char2digit(buf[j]); + if(x >= RADIX) + { + if(RADIX == 10) + throw Invalid_Argument("BigInt: Invalid decimal string"); + else + throw Invalid_Argument("BigInt: Invalid octal string"); + } + + r *= RADIX; + r += x; + } + } + else + throw Invalid_Argument("Unknown BigInt decoding method"); + return r; + } + +} diff --git a/src/math/bigint/big_io.cpp b/src/math/bigint/big_io.cpp new file mode 100644 index 000000000..3c201e8b2 --- /dev/null +++ b/src/math/bigint/big_io.cpp @@ -0,0 +1,53 @@ +/************************************************* +* BigInt Input/Output Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <iostream> + +namespace Botan { + +/************************************************* +* Write the BigInt into a stream * +*************************************************/ +std::ostream& operator<<(std::ostream& stream, const BigInt& n) + { + BigInt::Base base = BigInt::Decimal; + if(stream.flags() & std::ios::hex) + base = BigInt::Hexadecimal; + else if(stream.flags() & std::ios::oct) + base = BigInt::Octal; + + if(n == 0) + stream.write("0", 1); + else + { + if(n < 0) + stream.write("-", 1); + SecureVector<byte> buffer = BigInt::encode(n, base); + u32bit skip = 0; + while(buffer[skip] == '0' && skip < buffer.size()) + ++skip; + stream.write(reinterpret_cast<const char*>(buffer.begin()) + skip, + buffer.size() - skip); + } + if(!stream.good()) + throw Stream_IO_Error("BigInt output operator has failed"); + return stream; + } + +/************************************************* +* Read the BigInt from a stream * +*************************************************/ +std::istream& operator>>(std::istream& stream, BigInt& n) + { + std::string str; + std::getline(stream, str); + if(stream.bad() || (stream.fail() && !stream.eof())) + throw Stream_IO_Error("BigInt input operator has failed"); + n = BigInt(str); + return stream; + } + +} diff --git a/src/math/bigint/big_ops2.cpp b/src/math/bigint/big_ops2.cpp new file mode 100644 index 000000000..ef083f394 --- /dev/null +++ b/src/math/bigint/big_ops2.cpp @@ -0,0 +1,222 @@ +/************************************************* +* BigInt Assignment Operators Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <botan/mp_core.h> +#include <botan/bit_ops.h> +#include <algorithm> + +namespace Botan { + +/************************************************* +* Addition Operator * +*************************************************/ +BigInt& BigInt::operator+=(const BigInt& y) + { + const u32bit x_sw = sig_words(), y_sw = y.sig_words(); + + const u32bit reg_size = std::max(x_sw, y_sw) + 1; + grow_to(reg_size); + + if(sign() == y.sign()) + bigint_add2(get_reg(), reg_size - 1, y.data(), y_sw); + else + { + s32bit relative_size = bigint_cmp(data(), x_sw, y.data(), y_sw); + + if(relative_size < 0) + { + SecureVector<word> z(reg_size - 1); + bigint_sub3(z, y.data(), reg_size - 1, data(), x_sw); + copy_mem(get_reg().begin(), z.begin(), z.size()); + set_sign(y.sign()); + } + else if(relative_size == 0) + { + get_reg().clear(); + set_sign(Positive); + } + else if(relative_size > 0) + bigint_sub2(get_reg(), x_sw, y.data(), y_sw); + } + + return (*this); + } + +/************************************************* +* Subtraction Operator * +*************************************************/ +BigInt& BigInt::operator-=(const BigInt& y) + { + const u32bit x_sw = sig_words(), y_sw = y.sig_words(); + + s32bit relative_size = bigint_cmp(data(), x_sw, y.data(), y_sw); + + const u32bit reg_size = std::max(x_sw, y_sw) + 1; + grow_to(reg_size); + + if(relative_size < 0) + { + if(sign() == y.sign()) + { + SecureVector<word> z(reg_size - 1); + bigint_sub3(z, y.data(), reg_size - 1, data(), x_sw); + copy_mem(get_reg().begin(), z.begin(), z.size()); + } + else + bigint_add2(get_reg(), reg_size - 1, y.data(), y_sw); + + set_sign(y.reverse_sign()); + } + else if(relative_size == 0) + { + if(sign() == y.sign()) + { + get_reg().clear(); + set_sign(Positive); + } + else + bigint_shl1(get_reg(), x_sw, 0, 1); + } + else if(relative_size > 0) + { + if(sign() == y.sign()) + bigint_sub2(get_reg(), x_sw, y.data(), y_sw); + else + bigint_add2(get_reg(), reg_size - 1, y.data(), y_sw); + } + + return (*this); + } + +/************************************************* +* Multiplication Operator * +*************************************************/ +BigInt& BigInt::operator*=(const BigInt& y) + { + const u32bit x_sw = sig_words(), y_sw = y.sig_words(); + set_sign((sign() == y.sign()) ? Positive : Negative); + + if(x_sw == 0 || y_sw == 0) + { + get_reg().clear(); + set_sign(Positive); + } + else if(x_sw == 1 && y_sw) + { + grow_to(y_sw + 2); + bigint_linmul3(get_reg(), y.data(), y_sw, word_at(0)); + } + else if(y_sw == 1 && x_sw) + { + grow_to(x_sw + 2); + bigint_linmul2(get_reg(), x_sw, y.word_at(0)); + } + else + { + grow_to(size() + y.size()); + + SecureVector<word> z(data(), x_sw); + SecureVector<word> workspace(size()); + + bigint_mul(get_reg(), size(), workspace, + z, z.size(), x_sw, + y.data(), y.size(), y_sw); + } + + return (*this); + } + +/************************************************* +* Division Operator * +*************************************************/ +BigInt& BigInt::operator/=(const BigInt& y) + { + if(y.sig_words() == 1 && power_of_2(y.word_at(0))) + (*this) >>= (y.bits() - 1); + else + (*this) = (*this) / y; + return (*this); + } + +/************************************************* +* Modulo Operator * +*************************************************/ +BigInt& BigInt::operator%=(const BigInt& mod) + { + return (*this = (*this) % mod); + } + +/************************************************* +* Modulo Operator * +*************************************************/ +word BigInt::operator%=(word mod) + { + if(mod == 0) + throw BigInt::DivideByZero(); + if(power_of_2(mod)) + { + word result = (word_at(0) & (mod - 1)); + clear(); + grow_to(2); + get_reg()[0] = result; + return result; + } + + word remainder = 0; + + for(u32bit j = sig_words(); j > 0; --j) + remainder = bigint_modop(remainder, word_at(j-1), mod); + clear(); + grow_to(2); + + if(remainder && sign() == BigInt::Negative) + get_reg()[0] = mod - remainder; + else + get_reg()[0] = remainder; + + set_sign(BigInt::Positive); + + return word_at(0); + } + +/************************************************* +* Left Shift Operator * +*************************************************/ +BigInt& BigInt::operator<<=(u32bit shift) + { + if(shift) + { + const u32bit shift_words = shift / MP_WORD_BITS, + shift_bits = shift % MP_WORD_BITS, + words = sig_words(); + + grow_to(words + shift_words + (shift_bits ? 1 : 0)); + bigint_shl1(get_reg(), words, shift_words, shift_bits); + } + + return (*this); + } + +/************************************************* +* Right Shift Operator * +*************************************************/ +BigInt& BigInt::operator>>=(u32bit shift) + { + if(shift) + { + const u32bit shift_words = shift / MP_WORD_BITS, + shift_bits = shift % MP_WORD_BITS; + + bigint_shr1(get_reg(), sig_words(), shift_words, shift_bits); + + if(is_zero()) + set_sign(Positive); + } + + return (*this); + } + +} diff --git a/src/math/bigint/big_ops3.cpp b/src/math/bigint/big_ops3.cpp new file mode 100644 index 000000000..ff24eab1c --- /dev/null +++ b/src/math/bigint/big_ops3.cpp @@ -0,0 +1,188 @@ +/************************************************* +* BigInt Binary Operators Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <botan/divide.h> +#include <botan/mp_core.h> +#include <botan/bit_ops.h> +#include <algorithm> + +namespace Botan { + +/************************************************* +* Addition Operator * +*************************************************/ +BigInt operator+(const BigInt& x, const BigInt& y) + { + const u32bit x_sw = x.sig_words(), y_sw = y.sig_words(); + + BigInt z(x.sign(), std::max(x_sw, y_sw) + 1); + + if((x.sign() == y.sign())) + bigint_add3(z.get_reg(), x.data(), x_sw, y.data(), y_sw); + else + { + s32bit relative_size = bigint_cmp(x.data(), x_sw, y.data(), y_sw); + + if(relative_size < 0) + { + bigint_sub3(z.get_reg(), y.data(), y_sw, x.data(), x_sw); + z.set_sign(y.sign()); + } + else if(relative_size == 0) + z.set_sign(BigInt::Positive); + else if(relative_size > 0) + bigint_sub3(z.get_reg(), x.data(), x_sw, y.data(), y_sw); + } + + return z; + } + +/************************************************* +* Subtraction Operator * +*************************************************/ +BigInt operator-(const BigInt& x, const BigInt& y) + { + const u32bit x_sw = x.sig_words(), y_sw = y.sig_words(); + + s32bit relative_size = bigint_cmp(x.data(), x_sw, y.data(), y_sw); + + BigInt z(BigInt::Positive, std::max(x_sw, y_sw) + 1); + + if(relative_size < 0) + { + if(x.sign() == y.sign()) + bigint_sub3(z.get_reg(), y.data(), y_sw, x.data(), x_sw); + else + bigint_add3(z.get_reg(), x.data(), x_sw, y.data(), y_sw); + z.set_sign(y.reverse_sign()); + } + else if(relative_size == 0) + { + if(x.sign() != y.sign()) + bigint_shl2(z.get_reg(), x.data(), x_sw, 0, 1); + } + else if(relative_size > 0) + { + if(x.sign() == y.sign()) + bigint_sub3(z.get_reg(), x.data(), x_sw, y.data(), y_sw); + else + bigint_add3(z.get_reg(), x.data(), x_sw, y.data(), y_sw); + z.set_sign(x.sign()); + } + return z; + } + +/************************************************* +* Multiplication Operator * +*************************************************/ +BigInt operator*(const BigInt& x, const BigInt& y) + { + const u32bit x_sw = x.sig_words(), y_sw = y.sig_words(); + + BigInt z(BigInt::Positive, x.size() + y.size()); + + if(x_sw == 1 && y_sw) + bigint_linmul3(z.get_reg(), y.data(), y_sw, x.word_at(0)); + else if(y_sw == 1 && x_sw) + bigint_linmul3(z.get_reg(), x.data(), x_sw, y.word_at(0)); + else if(x_sw && y_sw) + { + SecureVector<word> workspace(z.size()); + bigint_mul(z.get_reg(), z.size(), workspace, + x.data(), x.size(), x_sw, + y.data(), y.size(), y_sw); + } + + if(x_sw && y_sw && x.sign() != y.sign()) + z.flip_sign(); + return z; + } + +/************************************************* +* Division Operator * +*************************************************/ +BigInt operator/(const BigInt& x, const BigInt& y) + { + BigInt q, r; + divide(x, y, q, r); + return q; + } + +/************************************************* +* Modulo Operator * +*************************************************/ +BigInt operator%(const BigInt& n, const BigInt& mod) + { + if(mod.is_zero()) + throw BigInt::DivideByZero(); + if(mod.is_negative()) + throw Invalid_Argument("BigInt::operator%: modulus must be > 0"); + if(n.is_positive() && mod.is_positive() && n < mod) + return n; + + BigInt q, r; + divide(n, mod, q, r); + return r; + } + +/************************************************* +* Modulo Operator * +*************************************************/ +word operator%(const BigInt& n, word mod) + { + if(mod == 0) + throw BigInt::DivideByZero(); + if(power_of_2(mod)) + return (n.word_at(0) & (mod - 1)); + + word remainder = 0; + + for(u32bit j = n.sig_words(); j > 0; --j) + remainder = bigint_modop(remainder, n.word_at(j-1), mod); + + if(remainder && n.sign() == BigInt::Negative) + return mod - remainder; + return remainder; + } + +/************************************************* +* Left Shift Operator * +*************************************************/ +BigInt operator<<(const BigInt& x, u32bit shift) + { + if(shift == 0) + return x; + + const u32bit shift_words = shift / MP_WORD_BITS, + shift_bits = shift % MP_WORD_BITS; + + const u32bit x_sw = x.sig_words(); + + BigInt y(x.sign(), x_sw + shift_words + (shift_bits ? 1 : 0)); + bigint_shl2(y.get_reg(), x.data(), x_sw, shift_words, shift_bits); + return y; + } + +/************************************************* +* Right Shift Operator * +*************************************************/ +BigInt operator>>(const BigInt& x, u32bit shift) + { + if(shift == 0) + return x; + if(x.bits() <= shift) + return 0; + + const u32bit shift_words = shift / MP_WORD_BITS, + shift_bits = shift % MP_WORD_BITS, + x_sw = x.sig_words(); + + BigInt y(x.sign(), x_sw - shift_words); + bigint_shr2(y.get_reg(), x.data(), x_sw, shift_words, shift_bits); + return y; + } + +} diff --git a/src/math/bigint/big_rand.cpp b/src/math/bigint/big_rand.cpp new file mode 100644 index 000000000..055873642 --- /dev/null +++ b/src/math/bigint/big_rand.cpp @@ -0,0 +1,59 @@ +/************************************************* +* BigInt Random Generation Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <botan/parsing.h> + +namespace Botan { + +/************************************************* +* Construct a BigInt of a specific form * +*************************************************/ +BigInt::BigInt(NumberType type, u32bit bits) + { + set_sign(Positive); + + if(type == Power2) + set_bit(bits); + else + throw Invalid_Argument("BigInt(NumberType): Unknown type"); + } + +/************************************************* +* Randomize this number * +*************************************************/ +void BigInt::randomize(RandomNumberGenerator& rng, + u32bit bitsize) + { + set_sign(Positive); + + if(bitsize == 0) + clear(); + else + { + SecureVector<byte> array((bitsize + 7) / 8); + rng.randomize(array, array.size()); + if(bitsize % 8) + array[0] &= 0xFF >> (8 - (bitsize % 8)); + array[0] |= 0x80 >> ((bitsize % 8) ? (8 - bitsize % 8) : 0); + binary_decode(array, array.size()); + } + } + +/************************************************* +* Generate a random integer within given range * +*************************************************/ +BigInt BigInt::random_integer(RandomNumberGenerator& rng, + const BigInt& min, const BigInt& max) + { + BigInt range = max - min; + + if(range <= 0) + throw Invalid_Argument("random_integer: invalid min/max values"); + + return (min + (BigInt(rng, range.bits() + 2) % range)); + } + +} diff --git a/src/math/bigint/bigint.cpp b/src/math/bigint/bigint.cpp new file mode 100644 index 000000000..e3c7931e6 --- /dev/null +++ b/src/math/bigint/bigint.cpp @@ -0,0 +1,367 @@ +/************************************************* +* BigInt Base Source File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/bigint.h> +#include <botan/mp_core.h> +#include <botan/loadstor.h> +#include <botan/parsing.h> +#include <botan/util.h> + +namespace Botan { + +/************************************************* +* Construct a BigInt from a regular number * +*************************************************/ +BigInt::BigInt(u64bit n) + { + set_sign(Positive); + + if(n == 0) + return; + + const u32bit limbs_needed = sizeof(u64bit) / sizeof(word); + + reg.create(4*limbs_needed); + for(u32bit j = 0; j != limbs_needed; ++j) + reg[j] = ((n >> (j*MP_WORD_BITS)) & MP_WORD_MASK); + } + +/************************************************* +* Construct a BigInt of the specified size * +*************************************************/ +BigInt::BigInt(Sign s, u32bit size) + { + reg.create(round_up(size, 8)); + signedness = s; + } + +/************************************************* +* Construct a BigInt from a "raw" BigInt * +*************************************************/ +BigInt::BigInt(const BigInt& b) + { + const u32bit b_words = b.sig_words(); + + if(b_words) + { + reg.create(round_up(b_words, 8)); + reg.copy(b.data(), b_words); + set_sign(b.sign()); + } + else + { + reg.create(2); + set_sign(Positive); + } + } + +/************************************************* +* Construct a BigInt from a string * +*************************************************/ +BigInt::BigInt(const std::string& str) + { + Base base = Decimal; + u32bit markers = 0; + bool negative = false; + if(str.length() > 0 && str[0] == '-') { markers += 1; negative = true; } + + if(str.length() > markers + 2 && str[markers ] == '0' && + str[markers + 1] == 'x') + { markers += 2; base = Hexadecimal; } + else if(str.length() > markers + 1 && str[markers] == '0') + { markers += 1; base = Octal; } + + *this = decode(reinterpret_cast<const byte*>(str.data()) + markers, + str.length() - markers, base); + + if(negative) set_sign(Negative); + else set_sign(Positive); + } + +/************************************************* +* Construct a BigInt from an encoded BigInt * +*************************************************/ +BigInt::BigInt(const byte input[], u32bit length, Base base) + { + set_sign(Positive); + *this = decode(input, length, base); + } + +/************************************************* +* Construct a BigInt from an encoded BigInt * +*************************************************/ +BigInt::BigInt(RandomNumberGenerator& rng, u32bit bits) + { + set_sign(Positive); + randomize(rng, bits); + } + +/************************************************* +* Swap this BigInt with another * +*************************************************/ +void BigInt::swap(BigInt& other) + { + reg.swap(other.reg); + std::swap(signedness, other.signedness); + } + +/************************************************* +* Grow the internal storage * +*************************************************/ +void BigInt::grow_reg(u32bit n) + { + reg.grow_to(round_up(size() + n, 8)); + } + +/************************************************* +* Grow the internal storage * +*************************************************/ +void BigInt::grow_to(u32bit n) + { + if(n > size()) + reg.grow_to(round_up(n, 8)); + } + +/************************************************* +* Comparison Function * +*************************************************/ +s32bit BigInt::cmp(const BigInt& n, bool check_signs) const + { + if(check_signs) + { + if(n.is_positive() && this->is_negative()) return -1; + if(n.is_negative() && this->is_positive()) return 1; + if(n.is_negative() && this->is_negative()) + return (-bigint_cmp(data(), sig_words(), n.data(), n.sig_words())); + } + return bigint_cmp(data(), sig_words(), n.data(), n.sig_words()); + } + +/************************************************* +* Convert this number to a u32bit, if possible * +*************************************************/ +u32bit BigInt::to_u32bit() const + { + if(is_negative()) + throw Encoding_Error("BigInt::to_u32bit: Number is negative"); + if(bits() >= 32) + throw Encoding_Error("BigInt::to_u32bit: Number is too big to convert"); + + u32bit out = 0; + for(u32bit j = 0; j != 4; ++j) + out = (out << 8) | byte_at(3-j); + return out; + } + +/************************************************* +* Return byte n of this number * +*************************************************/ +byte BigInt::byte_at(u32bit n) const + { + const u32bit WORD_BYTES = sizeof(word); + u32bit word_num = n / WORD_BYTES, byte_num = n % WORD_BYTES; + if(word_num >= size()) + return 0; + else + return get_byte(WORD_BYTES - byte_num - 1, reg[word_num]); + } + +/************************************************* +* Return bit n of this number * +*************************************************/ +bool BigInt::get_bit(u32bit n) const + { + return ((word_at(n / MP_WORD_BITS) >> (n % MP_WORD_BITS)) & 1); + } + +/************************************************* +* Return bits {offset...offset+length} * +*************************************************/ +u32bit BigInt::get_substring(u32bit offset, u32bit length) const + { + if(length > 32) + throw Invalid_Argument("BigInt::get_substring: Substring size too big"); + + u64bit piece = 0; + for(u32bit j = 0; j != 8; ++j) + piece = (piece << 8) | byte_at((offset / 8) + (7-j)); + + u64bit mask = (1 << length) - 1; + u32bit shift = (offset % 8); + + return static_cast<u32bit>((piece >> shift) & mask); + } + +/************************************************* +* Set bit number n * +*************************************************/ +void BigInt::set_bit(u32bit n) + { + const u32bit which = n / MP_WORD_BITS; + const word mask = static_cast<word>(1) << (n % MP_WORD_BITS); + if(which >= size()) grow_to(which + 1); + reg[which] |= mask; + } + +/************************************************* +* Clear bit number n * +*************************************************/ +void BigInt::clear_bit(u32bit n) + { + const u32bit which = n / MP_WORD_BITS; + const word mask = static_cast<word>(1) << (n % MP_WORD_BITS); + if(which < size()) + reg[which] &= ~mask; + } + +/************************************************* +* Clear all but the lowest n bits * +*************************************************/ +void BigInt::mask_bits(u32bit n) + { + if(n == 0) { clear(); return; } + if(n >= bits()) return; + + const u32bit top_word = n / MP_WORD_BITS; + const word mask = (static_cast<word>(1) << (n % MP_WORD_BITS)) - 1; + + if(top_word < size()) + for(u32bit j = top_word + 1; j != size(); ++j) + reg[j] = 0; + + reg[top_word] &= mask; + } + +/************************************************* +* Count how many bytes are being used * +*************************************************/ +u32bit BigInt::bytes() const + { + return (bits() + 7) / 8; + } + +/************************************************* +* Count how many bits are being used * +*************************************************/ +u32bit BigInt::bits() const + { + if(sig_words() == 0) + return 0; + + u32bit full_words = sig_words() - 1, top_bits = MP_WORD_BITS; + word top_word = word_at(full_words), mask = MP_WORD_TOP_BIT; + + while(top_bits && ((top_word & mask) == 0)) + { mask >>= 1; top_bits--; } + + return (full_words * MP_WORD_BITS + top_bits); + } + +/************************************************* +* Calcluate the size in a certain base * +*************************************************/ +u32bit BigInt::encoded_size(Base base) const + { + static const double LOG_2_BASE_10 = 0.30102999566; + + if(base == Binary) + return bytes(); + else if(base == Hexadecimal) + return 2*bytes(); + else if(base == Octal) + return ((bits() + 2) / 3); + else if(base == Decimal) + return static_cast<u32bit>((bits() * LOG_2_BASE_10) + 1); + else + throw Invalid_Argument("Unknown base for BigInt encoding"); + } + +/************************************************* +* Set the sign * +*************************************************/ +void BigInt::set_sign(Sign s) + { + if(is_zero()) + signedness = Positive; + else + signedness = s; + } + +/************************************************* +* Reverse the value of the sign flag * +*************************************************/ +void BigInt::flip_sign() + { + set_sign(reverse_sign()); + } + +/************************************************* +* Return the opposite value of the current sign * +*************************************************/ +BigInt::Sign BigInt::reverse_sign() const + { + if(sign() == Positive) + return Negative; + return Positive; + } + +/************************************************* +* Return the negation of this number * +*************************************************/ +BigInt BigInt::operator-() const + { + BigInt x = (*this); + x.flip_sign(); + return x; + } + +/************************************************* +* Return the absolute value of this number * +*************************************************/ +BigInt BigInt::abs() const + { + BigInt x = (*this); + x.set_sign(Positive); + return x; + } + +/************************************************* +* Encode this number into bytes * +*************************************************/ +void BigInt::binary_encode(byte output[]) const + { + const u32bit sig_bytes = bytes(); + for(u32bit j = 0; j != sig_bytes; ++j) + output[sig_bytes-j-1] = byte_at(j); + } + +/************************************************* +* Set this number to the value in buf * +*************************************************/ +void BigInt::binary_decode(const byte buf[], u32bit length) + { + const u32bit WORD_BYTES = sizeof(word); + + reg.create(round_up((length / WORD_BYTES) + 1, 8)); + + for(u32bit j = 0; j != length / WORD_BYTES; ++j) + { + u32bit top = length - WORD_BYTES*j; + for(u32bit k = WORD_BYTES; k > 0; --k) + reg[j] = (reg[j] << 8) | buf[top - k]; + } + for(u32bit j = 0; j != length % WORD_BYTES; ++j) + reg[length / WORD_BYTES] = (reg[length / WORD_BYTES] << 8) | buf[j]; + } + +/************************************************* +* Set this number to the value in buf * +*************************************************/ +void BigInt::binary_decode(const MemoryRegion<byte>& buf) + { + binary_decode(buf, buf.size()); + } + +} diff --git a/src/math/bigint/bigint.h b/src/math/bigint/bigint.h new file mode 100644 index 000000000..bb1d7ef7b --- /dev/null +++ b/src/math/bigint/bigint.h @@ -0,0 +1,184 @@ +/************************************************* +* BigInt Header File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_BIGINT_H__ +#define BOTAN_BIGINT_H__ + +#include <botan/rng.h> +#include <botan/secmem.h> +#include <botan/mp_types.h> +#include <iosfwd> + +namespace Botan { + +/************************************************* +* BigInt * +*************************************************/ +class BOTAN_DLL BigInt + { + public: + enum Base { Octal = 8, Decimal = 10, Hexadecimal = 16, Binary = 256 }; + enum Sign { Negative = 0, Positive = 1 }; + enum NumberType { Power2 }; + + struct DivideByZero : public Exception + { DivideByZero() : Exception("BigInt divide by zero") {} }; + + BigInt& operator+=(const BigInt&); + BigInt& operator-=(const BigInt&); + + BigInt& operator*=(const BigInt&); + BigInt& operator/=(const BigInt&); + BigInt& operator%=(const BigInt&); + word operator%=(word); + BigInt& operator<<=(u32bit); + BigInt& operator>>=(u32bit); + + BigInt& operator++() { return (*this += 1); } + BigInt& operator--() { return (*this -= 1); } + BigInt operator++(int) { BigInt x = (*this); ++(*this); return x; } + BigInt operator--(int) { BigInt x = (*this); --(*this); return x; } + + BigInt operator-() const; + bool operator !() const { return (!is_nonzero()); } + + s32bit cmp(const BigInt&, bool = true) const; + bool is_even() const { return (get_bit(0) == 0); } + bool is_odd() const { return (get_bit(0) == 1); } + + bool is_zero() const + { + const u32bit sw = sig_words(); + + for(u32bit i = 0; i != sw; ++i) + if(reg[i]) + return false; + return true; + } + + bool is_nonzero() const { return (!is_zero()); } + + void set_bit(u32bit); + void clear_bit(u32bit); + void mask_bits(u32bit); + + bool get_bit(u32bit) const; + u32bit get_substring(u32bit, u32bit) const; + byte byte_at(u32bit) const; + + // same as operator[], remove this + word word_at(u32bit n) const + { return ((n < size()) ? get_reg()[n] : 0); } + + u32bit to_u32bit() const; + + bool is_negative() const { return (sign() == Negative); } + bool is_positive() const { return (sign() == Positive); } + Sign sign() const { return (signedness); } + Sign reverse_sign() const; + void flip_sign(); + void set_sign(Sign); + BigInt abs() const; + + u32bit size() const { return get_reg().size(); } + + u32bit sig_words() const + { + const word* x = reg.begin(); + u32bit sig = reg.size(); + + while(sig && (x[sig-1] == 0)) + sig--; + return sig; + } + + u32bit bytes() const; + u32bit bits() const; + + const word* data() const { return reg.begin(); } + SecureVector<word>& get_reg() { return reg; } + const SecureVector<word>& get_reg() const { return reg; } + + void grow_reg(u32bit); + void grow_to(u32bit); + + word& operator[](u32bit i) { return reg[i]; } + word operator[](u32bit i) const { return reg[i]; } + void clear() { get_reg().clear(); } + + void randomize(RandomNumberGenerator& rng, u32bit n); + + void binary_encode(byte[]) const; + void binary_decode(const byte[], u32bit); + void binary_decode(const MemoryRegion<byte>&); + u32bit encoded_size(Base = Binary) const; + + static BigInt random_integer(RandomNumberGenerator&, + const BigInt&, const BigInt&); + + static SecureVector<byte> encode(const BigInt&, Base = Binary); + static void encode(byte[], const BigInt&, Base = Binary); + static BigInt decode(const byte[], u32bit, Base = Binary); + static BigInt decode(const MemoryRegion<byte>&, Base = Binary); + static SecureVector<byte> encode_1363(const BigInt&, u32bit); + + void swap(BigInt&); + + BigInt() { signedness = Positive; } + BigInt(u64bit); + BigInt(const BigInt&); + BigInt(const std::string&); + BigInt(const byte[], u32bit, Base = Binary); + BigInt(RandomNumberGenerator& rng, u32bit bits); + BigInt(Sign, u32bit); + BigInt(NumberType, u32bit); + private: + SecureVector<word> reg; + Sign signedness; + }; + +/************************************************* +* Arithmetic Operators * +*************************************************/ +BigInt BOTAN_DLL operator+(const BigInt&, const BigInt&); +BigInt BOTAN_DLL operator-(const BigInt&, const BigInt&); +BigInt BOTAN_DLL operator*(const BigInt&, const BigInt&); +BigInt BOTAN_DLL operator/(const BigInt&, const BigInt&); +BigInt BOTAN_DLL operator%(const BigInt&, const BigInt&); +word BOTAN_DLL operator%(const BigInt&, word); +BigInt BOTAN_DLL operator<<(const BigInt&, u32bit); +BigInt BOTAN_DLL operator>>(const BigInt&, u32bit); + +/************************************************* +* Comparison Operators * +*************************************************/ +inline bool operator==(const BigInt& a, const BigInt& b) + { return (a.cmp(b) == 0); } +inline bool operator!=(const BigInt& a, const BigInt& b) + { return (a.cmp(b) != 0); } +inline bool operator<=(const BigInt& a, const BigInt& b) + { return (a.cmp(b) <= 0); } +inline bool operator>=(const BigInt& a, const BigInt& b) + { return (a.cmp(b) >= 0); } +inline bool operator<(const BigInt& a, const BigInt& b) + { return (a.cmp(b) < 0); } +inline bool operator>(const BigInt& a, const BigInt& b) + { return (a.cmp(b) > 0); } + +/************************************************* +* I/O Operators * +*************************************************/ +BOTAN_DLL std::ostream& operator<<(std::ostream&, const BigInt&); +BOTAN_DLL std::istream& operator>>(std::istream&, BigInt&); + +} + +namespace std { + +inline void swap(Botan::BigInt& a, Botan::BigInt& b) { a.swap(b); } + +} + +#endif diff --git a/src/math/bigint/divide.cpp b/src/math/bigint/divide.cpp new file mode 100644 index 000000000..ba088ced4 --- /dev/null +++ b/src/math/bigint/divide.cpp @@ -0,0 +1,104 @@ +/************************************************* +* Division Algorithm Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/divide.h> +#include <botan/mp_core.h> + +namespace Botan { + +namespace { + +/************************************************* +* Handle signed operands, if necessary * +*************************************************/ +void sign_fixup(const BigInt& x, const BigInt& y, BigInt& q, BigInt& r) + { + if(x.sign() == BigInt::Negative) + { + q.flip_sign(); + if(r.is_nonzero()) { --q; r = y.abs() - r; } + } + if(y.sign() == BigInt::Negative) + q.flip_sign(); + } + +} + +/************************************************* +* Solve x = q * y + r * +*************************************************/ +void divide(const BigInt& x, const BigInt& y_arg, BigInt& q, BigInt& r) + { + if(y_arg.is_zero()) + throw BigInt::DivideByZero(); + + BigInt y = y_arg; + const u32bit y_words = y.sig_words(); + r = x; + + r.set_sign(BigInt::Positive); + y.set_sign(BigInt::Positive); + + s32bit compare = r.cmp(y); + + if(compare < 0) + q = 0; + else if(compare == 0) + { + q = 1; + r = 0; + } + else + { + u32bit shifts = 0; + word y_top = y[y.sig_words()-1]; + while(y_top < MP_WORD_TOP_BIT) { y_top <<= 1; ++shifts; } + y <<= shifts; + r <<= shifts; + + const u32bit n = r.sig_words() - 1, t = y_words - 1; + + q.get_reg().create(n - t + 1); + if(n <= t) + { + while(r > y) { r -= y; ++q; } + r >>= shifts; + sign_fixup(x, y_arg, q, r); + return; + } + + BigInt temp = y << (MP_WORD_BITS * (n-t)); + + while(r >= temp) { r -= temp; ++q[n-t]; } + + for(u32bit j = n; j != t; --j) + { + const word x_j0 = r.word_at(j); + const word x_j1 = r.word_at(j-1); + const word y_t = y.word_at(t); + + if(x_j0 == y_t) + q[j-t-1] = MP_WORD_MAX; + else + q[j-t-1] = bigint_divop(x_j0, x_j1, y_t); + + while(bigint_divcore(q[j-t-1], y_t, y.word_at(t-1), + x_j0, x_j1, r.word_at(j-2))) + --q[j-t-1]; + + r -= (q[j-t-1] * y) << (MP_WORD_BITS * (j-t-1)); + if(r.is_negative()) + { + r += y << (MP_WORD_BITS * (j-t-1)); + --q[j-t-1]; + } + } + r >>= shifts; + } + + sign_fixup(x, y_arg, q, r); + } + +} diff --git a/src/math/bigint/divide.h b/src/math/bigint/divide.h new file mode 100644 index 000000000..d0de3da75 --- /dev/null +++ b/src/math/bigint/divide.h @@ -0,0 +1,17 @@ +/************************************************* +* Division Header File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_DIVISON_ALGORITHM__ +#define BOTAN_DIVISON_ALGORITHM__ + +#include <botan/bigint.h> + +namespace Botan { + +void BOTAN_DLL divide(const BigInt&, const BigInt&, BigInt&, BigInt&); + +} + +#endif diff --git a/src/math/bigint/info.txt b/src/math/bigint/info.txt new file mode 100644 index 000000000..d6c5df763 --- /dev/null +++ b/src/math/bigint/info.txt @@ -0,0 +1,31 @@ +realname "BigInt" + +load_on auto + +define BIGINT + +<requires> +hex +mp_amd64|mp_asm64|mp_ia32|mp_ia32_msvc|mp_generic +monty_generic +mulop_generic +</requires> + +<add> +bigint.h +divide.h +mp_core.h +mp_types.h +big_code.cpp +big_io.cpp +big_ops2.cpp +big_ops3.cpp +big_rand.cpp +bigint.cpp +divide.cpp +mp_asm.cpp +mp_comba.cpp +mp_karat.cpp +mp_misc.cpp +mp_shift.cpp +</add> diff --git a/src/math/bigint/monty_amd64/info.txt b/src/math/bigint/monty_amd64/info.txt new file mode 100644 index 000000000..32308bf41 --- /dev/null +++ b/src/math/bigint/monty_amd64/info.txt @@ -0,0 +1,31 @@ +realname "Montgomery Reduction (x86-64)" + +mp_bits 64 + +load_on request + +<add> +mp_monty.S +</add> + +<requires> +asm_amd64 +</requires> + +<arch> +amd64 +</arch> + +<cc> +gcc +icc +</cc> + +# ELF systems +<os> +linux +freebsd +netbsd +openbsd +solaris +</os> diff --git a/src/math/bigint/monty_amd64/mp_monty.S b/src/math/bigint/monty_amd64/mp_monty.S new file mode 100644 index 000000000..3dd4040bc --- /dev/null +++ b/src/math/bigint/monty_amd64/mp_monty.S @@ -0,0 +1,397 @@ +/************************************************* +* Montgomery Reduction Source File * +* (C) 2008 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_monty.S) + +START_FUNCTION(bigint_monty_redc) + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + + movq %rdi, %r14 # z + movq %rdx, %r12 # x + movl %esi, %ebp # z_size + + xorl %esi, %esi # j.76 + movq %r8, -16(%rsp) # u, u + movl %ecx, %ebx # x_size, x_size + movl %ecx, %r8d # x_size, blocks_of_8 + andl $-8, %r8d #, blocks_of_8 + testl %ecx, %ecx # x_size + je .L3 #, + mov %ecx, %eax # x_size, pretmp.71 + leal 1(%rbx), %r15d #, k.73 + salq $3, %rax #, + xorl %r13d, %r13d # j + movq %rax, -8(%rsp) #, pretmp.21 + .p2align 4,,10 + .p2align 3 +.L11: + mov %r13d, %eax # j, j + movq -16(%rsp), %rdi # u, y + leaq (%r14,%rax,8), %r11 #, z_j + xorl %r9d, %r9d # i + imulq (%r11), %rdi #* z_j, y + xorl %r10d, %r10d # carry + testl %r8d, %r8d # blocks_of_8 + je .L7 #, + .p2align 4,,10 + .p2align 3 +.LOOP_MUL_ADD: + mov %r9d, %ecx # i, i + addl $8, %r9d #, i + salq $3, %rcx #, D.2315 + leaq (%r11,%rcx), %rsi #, tmp130 + leaq (%r12,%rcx), %rcx #, tmp131 + + movq 8*0(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*0(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*0 (%rsi) + + movq 8*1(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*1(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*1 (%rsi) + + movq 8*2(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*2(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*2 (%rsi) + + movq 8*3(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*3(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*3 (%rsi) + + movq 8*4(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*4(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*4 (%rsi) + + movq 8*5(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*5(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*5 (%rsi) + + movq 8*6(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*6(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*6 (%rsi) + + movq 8*7(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*7(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*7 (%rsi) + + cmpl %r9d, %r8d # i, blocks_of_8 + jne .LOOP_MUL_ADD #, + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L8 #, +.L7: + movl %r8d, %esi # blocks_of_8, i + .p2align 4,,10 + .p2align 3 +.L5: + mov %esi, %eax # i, i + movq %rdi, %rcx # y, b + leaq (%r11, %rax,8), %r9 #, D.2325 + incl %esi # i + movq (%r12, %rax,8), %rax #* x, tmp133 + + mulq %rcx # b + addq (%r9), %rax #* D.2325, a + adcq $0,%rdx # + addq %r10, %rax # carry, a + adcq $0,%rdx # + + cmpl %esi, %ebx # i, x_size + movq %rdx, %r10 #, carry + movq %rax, (%r9) # a,* D.2325 + jne .L5 #, +.L8: + movq -8(%rsp), %rdx # pretmp.21, + leaq (%r11,%rdx), %rax #, D.2332 + movq (%rax), %rcx #* D.2332, D.2333 + leaq (%r10,%rcx), %rdx #, z_sum + movq %rdx, (%rax) # z_sum,* D.2332 + cmpq %rdx, %rcx # z_sum, D.2333 + jbe .L9 #, + cmpl %ebp, %r15d # z_size, k.73 + je .L9 #, + movl %r15d, %ecx # k.73, k + jmp .L10 # + .p2align 4,,10 + .p2align 3 +.L31: + incl %ecx # k + cmpl %ecx, %ebp # k, z_size + .p2align 4,,4 + .p2align 3 + je .L9 #, +.L10: + mov %ecx, %edx # k, k + leaq (%r11,%rdx,8), %rdx #, D.2342 + movq (%rdx), %rax #* D.2342, tmp136 + incq %rax # D.2344 + movq %rax, (%rdx) # D.2344,* D.2342 + testq %rax, %rax # D.2344 + je .L31 #, +.L9: + incl %r13d # j + decl %ebp # z_size + cmpl %r13d, %ebx # j, x_size + jne .L11 #, + movl %ebx, %esi # x_size, j.76 +.L3: + leal (%rbx,%rbx), %eax #, tmp137 + mov %eax, %eax + leaq (%r14, %rax,8), %rdi #, D.2349 + cmpq $0, (%rdi) #,* D.2349 + jne .L12 #, + testl %ebx, %ebx # x_size + je .L12 #, + leal -1(%rbx), %ecx #, j + leal (%rsi,%rcx), %edx #, tmp141 + mov %ecx, %eax # j, j + movq (%r14,%rdx,8), %rbp #* z, + cmpq %rbp, (%r12, %rax,8) #,* x + jb .L12 #, + ja .L_EXIT #, + leal -2(%rsi,%rbx), %edx #, ivtmp.45 + jmp .L14 # + .p2align 4,,10 + .p2align 3 +.L15: + mov %edx, %eax # ivtmp.45, ivtmp.45 + decl %ecx # j + movq (%r14, %rax,8), %rsi #* z, D.2360 + mov %ecx, %eax # j, j + movq (%r12, %rax,8), %rax #* x, temp.68 + cmpq %rax, %rsi + ja .L12 #, + decl %edx # ivtmp.45 + cmpq %rax, %rsi + jb .L_EXIT #, +.L14: + testl %ecx, %ecx # j + jne .L15 #, +.L12: + xorl %ecx, %ecx # j + xorl %r10d, %r10d # carry + mov %ebx, %esi # x_size, pretmp.19 + testl %r8d, %r8d # blocks_of_8 + je .L17 #, + .p2align 4,,10 + .p2align 3 +.L22: + mov %ecx, %edx # j, D.2375 + addl $8, %ecx #, j + leaq (%rdx,%rsi), %rax #, tmp146 + leaq (%r12,%rdx,8), %rdx #, tmp150 + leaq (%r14, %rax,8), %rax #, tmp148 + + rorq %r10 # carry + + movq 8*0(%rdx), %r10 + sbbq %r10, 8*0(%rax) + + movq 8*1(%rdx), %r10 + sbbq %r10, 8*1(%rax) + + movq 8*2(%rdx), %r10 + sbbq %r10, 8*2(%rax) + + movq 8*3(%rdx), %r10 + sbbq %r10, 8*3(%rax) + + movq 8*4(%rdx), %r10 + sbbq %r10, 8*4(%rax) + + movq 8*5(%rdx), %r10 + sbbq %r10, 8*5(%rax) + + movq 8*6(%rdx), %r10 + sbbq %r10, 8*6(%rax) + + movq 8*7(%rdx), %r10 + sbbq %r10, 8*7(%rax) + + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %ecx, %r8d # j, blocks_of_8 + jne .L22 #, +.L17: + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L19 #, + leal (%r8,%rbx), %r9d #, ivtmp.33 + movl %r8d, %esi # blocks_of_8, j + .p2align 4,,10 + .p2align 3 +.L20: + mov %r9d, %eax # ivtmp.33, ivtmp.33 + mov %esi, %ecx # j, j + leaq (%r14, %rax,8), %rax #, D.2387 + incl %esi # j + movq (%rax), %rdx #* D.2387, tmp153 + incl %r9d # ivtmp.33 + + rorq %r10 # carry + sbbq (%r12,%rcx,8),%rdx #* x, x + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %esi, %ebx # j, x_size + movq %rdx, (%rax) # x,* D.2387 + jne .L20 #, +.L19: + testq %r10, %r10 # carry + je .L_EXIT #, + decq (%rdi) #* D.2349 +.L_EXIT: + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # +END_FUNCTION(bigint_monty_redc) + + +#if 0 + #define Z_ARR ARG_1 // rdi +#define Z_SIZE ARG_2_32 // esi +// X_ARR is ARG_3 == rdx, moved b/c needed for multiply +#define X_SIZE ARG_4_32 // ecx +#define U ARG_5 // r8 + +/* + We need all arguments for a while (we can reuse U eventually) + So only temp registers are + TEMP_1 %r10 + TEMP_2 %r11 + TEMP_3 = ARG_6 = %r9 + void return, so also + R0 %rax (aka TEMP_9) + is free (but needed for multiply) + + Can push: + %rbx (base pointer, callee saved) + %rpb (frame pointer, callee saved) + %r12-%r15 (callee saved) + + Can push base/frame pointers since this is a leaf function + and does not reference any data. +*/ + + push %r12 + push %r13 + push %r14 + push %r15 + +#define LOOP_CTR_I %r12 +#define LOOP_CTR_J %r13 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define X_ARR TEMP_3 +#define MUL_LO %rax +#define MUL_HI %rdx + + ASSIGN(X_ARR, ARG_3) + + /* + ZEROIZE(CARRY) + + ASSIGN(LOOP_CTR, X_SIZE) + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +ALIGN +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 +*/ + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif diff --git a/src/math/bigint/monty_generic/info.txt b/src/math/bigint/monty_generic/info.txt new file mode 100644 index 000000000..6f5f0e722 --- /dev/null +++ b/src/math/bigint/monty_generic/info.txt @@ -0,0 +1,7 @@ +realname "Montgomery Reduction" + +load_on dep + +<add> +mp_monty.cpp +</add> diff --git a/src/math/bigint/monty_generic/mp_monty.cpp b/src/math/bigint/monty_generic/mp_monty.cpp new file mode 100644 index 000000000..c162bfd4f --- /dev/null +++ b/src/math/bigint/monty_generic/mp_monty.cpp @@ -0,0 +1,76 @@ +/************************************************* +* Montgomery Reduction Source File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#include <botan/mp_core.h> +#include <botan/mp_asm.h> +#include <botan/mp_asmi.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Montgomery Reduction Algorithm * +*************************************************/ +void bigint_monty_redc(word z[], u32bit z_size, + const word x[], u32bit x_size, word u) + { + const u32bit blocks_of_8 = x_size - (x_size % 8); + + for(u32bit i = 0; i != x_size; ++i) + { + word* z_i = z + i; + + const word y = z_i[0] * u; + + word carry = 0; + + for(u32bit j = 0; j != blocks_of_8; j += 8) + carry = word8_madd3(z_i + j, x + j, y, carry); + + for(u32bit j = blocks_of_8; j != x_size; ++j) + z_i[j] = word_madd3(x[j], y, z_i[j], &carry); + + word z_sum = z_i[x_size] + carry; + carry = (z_sum < z_i[x_size]); + z_i[x_size] = z_sum; + + for(u32bit j = x_size + 1; carry && j != z_size - i; ++j) + { + ++z_i[j]; + carry = !z_i[j]; + } + } + + // Check if z[x_size...x_size+1] >= x[0...x_size] using bigint_cmp (inlined) + if(!z[x_size + x_size]) + { + for(u32bit i = x_size; i > 0; --i) + { + if(z[x_size + i - 1] > x[i-1]) + break; + + if(z[x_size + i - 1] < x[i-1]) + return; + } + } + + // If the compare above is true, subtract using bigint_sub2 (inlined) + word carry = 0; + + for(u32bit i = 0; i != blocks_of_8; i += 8) + carry = word8_sub2(z + x_size + i, x + i, carry); + + for(u32bit i = blocks_of_8; i != x_size; ++i) + z[x_size + i] = word_sub(z[x_size + i], x[i], &carry); + + if(carry) + --z[x_size+x_size]; + } + +} + +} diff --git a/src/math/bigint/mp_amd64/info.txt b/src/math/bigint/mp_amd64/info.txt new file mode 100644 index 000000000..84a5bcf53 --- /dev/null +++ b/src/math/bigint/mp_amd64/info.txt @@ -0,0 +1,19 @@ +realname "MPI Core (x86-64)" + +mp_bits 64 + +load_on dep + +<add> +mp_asm.h +mp_asmi.h +</add> + +<arch> +amd64 +</arch> + +<cc> +gcc +icc +</cc> diff --git a/src/math/bigint/mp_amd64/mp_asm.h b/src/math/bigint/mp_amd64/mp_asm.h new file mode 100644 index 000000000..eca7bae6c --- /dev/null +++ b/src/math/bigint/mp_amd64/mp_asm.h @@ -0,0 +1,67 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_H__ +#define BOTAN_MP_ASM_H__ + +#include <botan/mp_types.h> + +#if (BOTAN_MP_WORD_BITS != 64) + #error The mp_amd64 module requires that BOTAN_MP_WORD_BITS == 64 +#endif + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for amd64 Assembly * +*************************************************/ +#define ASM(x) x "\n\t" + +/************************************************* +* Word Multiply * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + asm( + ASM("mulq %[b]") + ASM("addq %[c],%[a]") + ASM("adcq $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c) + : "0"(a), "1"(b), [c]"g"(*c) : "cc"); + + return a; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + asm( + ASM("mulq %[b]") + + ASM("addq %[c],%[a]") + ASM("adcq $0,%[carry]") + + ASM("addq %[d],%[a]") + ASM("adcq $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d) + : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc"); + + return a; + } + +#undef ASM + +} + +} + +#endif diff --git a/src/math/bigint/mp_amd64/mp_asmi.h b/src/math/bigint/mp_amd64/mp_asmi.h new file mode 100644 index 000000000..16632a38d --- /dev/null +++ b/src/math/bigint/mp_amd64/mp_asmi.h @@ -0,0 +1,233 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2007 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for amd64 Assembly * +*************************************************/ +#ifndef ASM + #define ASM(x) x "\n\t" +#endif + +#define ADDSUB2_OP(OPERATION, INDEX) \ + ASM("movq 8*" #INDEX "(%[y]), %[carry]") \ + ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])") \ + +#define ADDSUB3_OP(OPERATION, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]), %[carry]") \ + ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]") \ + ASM("movq %[carry], 8*" #INDEX "(%[z])") \ + +#define LINMUL_OP(WRITE_TO, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]),%%rax") \ + ASM("mulq %[y]") \ + ASM("addq %[carry],%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("movq %%rdx,%[carry]") \ + ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])") + +#define MULADD_OP(IGNORED, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]),%%rax") \ + ASM("mulq %[y]") \ + ASM("addq %[carry],%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("addq 8*" #INDEX "(%[z]),%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("movq %%rdx,%[carry]") \ + ASM("movq %%rax, 8*" #INDEX " (%[z])") + +#define DO_8_TIMES(MACRO, ARG) \ + MACRO(ARG, 0) \ + MACRO(ARG, 1) \ + MACRO(ARG, 2) \ + MACRO(ARG, 3) \ + MACRO(ARG, 4) \ + MACRO(ARG, 5) \ + MACRO(ARG, 6) \ + MACRO(ARG, 7) + +#define ADD_OR_SUBTRACT(CORE_CODE) \ + ASM("rorq %[carry]") \ + CORE_CODE \ + ASM("sbbq %[carry],%[carry]") \ + ASM("negq %[carry]") + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Addition, Two Argument * +*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Subtraction, Three Argument * +*************************************************/ +inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "x") + : [carry]"=r"(carry) + : [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "z") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(MULADD_OP, "") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mulq %[y]") + + ASM("addq %[x],%[w0]") + ASM("adcq %[y],%[w1]") + ASM("adcq $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mulq %[y]") + + ASM("addq %[x],%[w0]") + ASM("adcq %[y],%[w1]") + ASM("adcq $0,%[w2]") + + ASM("addq %[x],%[w0]") + ASM("adcq %[y],%[w1]") + ASM("adcq $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + + +#undef ASM +#undef DO_8_TIMES +#undef ADD_OR_SUBTRACT +#undef ADDSUB2_OP +#undef ADDSUB3_OP +#undef LINMUL_OP +#undef MULADD_OP + +} + +} +#endif diff --git a/src/math/bigint/mp_asm.cpp b/src/math/bigint/mp_asm.cpp new file mode 100644 index 000000000..e5d1fe0d6 --- /dev/null +++ b/src/math/bigint/mp_asm.cpp @@ -0,0 +1,177 @@ +/************************************************* +* Lowest Level MPI Algorithms Source File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#include <botan/mp_asm.h> +#include <botan/mp_asmi.h> +#include <botan/mp_core.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Two Operand Addition, No Carry * +*************************************************/ +word bigint_add2_nc(word x[], u32bit x_size, const word y[], u32bit y_size) + { + word carry = 0; + + const u32bit blocks = y_size - (y_size % 8); + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_add2(x + j, y + j, carry); + + for(u32bit j = blocks; j != y_size; ++j) + x[j] = word_add(x[j], y[j], &carry); + + if(!carry) + return 0; + + for(u32bit j = y_size; j != x_size; ++j) + if(++x[j]) + return 0; + + return 1; + } + +/************************************************* +* Three Operand Addition, No Carry * +*************************************************/ +word bigint_add3_nc(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + if(x_size < y_size) + { return bigint_add3_nc(z, y, y_size, x, x_size); } + + word carry = 0; + + const u32bit blocks = y_size - (y_size % 8); + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_add3(z + j, x + j, y + j, carry); + + for(u32bit j = blocks; j != y_size; ++j) + z[j] = word_add(x[j], y[j], &carry); + + for(u32bit j = y_size; j != x_size; ++j) + { + word x_j = x[j] + carry; + if(carry && x_j) + carry = 0; + z[j] = x_j; + } + + return carry; + } + +/************************************************* +* Two Operand Addition * +*************************************************/ +void bigint_add2(word x[], u32bit x_size, const word y[], u32bit y_size) + { + if(bigint_add2_nc(x, x_size, y, y_size)) + ++x[x_size]; + } + +/************************************************* +* Three Operand Addition * +*************************************************/ +void bigint_add3(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + if(bigint_add3_nc(z, x, x_size, y, y_size)) + ++z[(x_size > y_size ? x_size : y_size)]; + } + +/************************************************* +* Two Operand Subtraction * +*************************************************/ +void bigint_sub2(word x[], u32bit x_size, const word y[], u32bit y_size) + { + word carry = 0; + + const u32bit blocks = y_size - (y_size % 8); + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_sub2(x + j, y + j, carry); + + for(u32bit j = blocks; j != y_size; ++j) + x[j] = word_sub(x[j], y[j], &carry); + + if(!carry) return; + + for(u32bit j = y_size; j != x_size; ++j) + { + --x[j]; + if(x[j] != MP_WORD_MAX) return; + } + } + +/************************************************* +* Three Operand Subtraction * +*************************************************/ +void bigint_sub3(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + word carry = 0; + + const u32bit blocks = y_size - (y_size % 8); + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_sub3(z + j, x + j, y + j, carry); + + for(u32bit j = blocks; j != y_size; ++j) + z[j] = word_sub(x[j], y[j], &carry); + + for(u32bit j = y_size; j != x_size; ++j) + { + word x_j = x[j] - carry; + if(carry && x_j != MP_WORD_MAX) + carry = 0; + z[j] = x_j; + } + } + +/************************************************* +* Two Operand Linear Multiply * +*************************************************/ +void bigint_linmul2(word x[], u32bit x_size, word y) + { + const u32bit blocks = x_size - (x_size % 8); + + word carry = 0; + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_linmul2(x + j, y, carry); + + for(u32bit j = blocks; j != x_size; ++j) + x[j] = word_madd2(x[j], y, &carry); + + x[x_size] = carry; + } + +/************************************************* +* Three Operand Linear Multiply * +*************************************************/ +void bigint_linmul3(word z[], const word x[], u32bit x_size, word y) + { + const u32bit blocks = x_size - (x_size % 8); + + word carry = 0; + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_linmul3(z + j, x + j, y, carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[j] = word_madd2(x[j], y, &carry); + + z[x_size] = carry; + } + +} + +} diff --git a/src/math/bigint/mp_asm64/info.txt b/src/math/bigint/mp_asm64/info.txt new file mode 100644 index 000000000..85a4391b9 --- /dev/null +++ b/src/math/bigint/mp_asm64/info.txt @@ -0,0 +1,26 @@ +realname "MPI Core (Alpha/IA-64/MIPS64/PowerPC-64/SPARC64)" + +mp_bits 64 + +load_on dep + +<add> +mp_asm.h +mp_generic:mp_asmi.h +</add> + +<arch> +alpha +ia64 +mips64 +ppc64 +sparc64 +</arch> + +# The inline asm only works with gcc, but it looks like (at least on +# UltraSPARC), using 64-bit words and the sythensized multiply is a 5 to 25% +# win, so it's probably worth using elsewhere. +<cc> +gcc +sunwspro +</cc> diff --git a/src/math/bigint/mp_asm64/mp_asm.h b/src/math/bigint/mp_asm64/mp_asm.h new file mode 100644 index 000000000..e455b3616 --- /dev/null +++ b/src/math/bigint/mp_asm64/mp_asm.h @@ -0,0 +1,110 @@ +/************************************************* +* MPI Multiply-Add Core Header File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_MP_MADD_H__ +#define BOTAN_MP_MADD_H__ + +#include <botan/mp_types.h> + +namespace Botan { + +#if (BOTAN_MP_WORD_BITS != 64) + #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64 +#endif + +#if defined(BOTAN_TARGET_ARCH_IS_ALPHA) + +#define BOTAN_WORD_MUL(a,b,z1,z0) do { \ + asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b)); \ + z1 = a * b; \ +} while(0); + +#elif defined(BOTAN_TARGET_ARCH_IS_IA64) + +#define BOTAN_WORD_MUL(a,b,z1,z0) do { \ + asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b)); \ + z1 = a * b; \ +} while(0); + +#elif defined(BOTAN_TARGET_ARCH_IS_PPC64) + +#define BOTAN_WORD_MUL(a,b,z1,z0) do { \ + asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc"); \ + z1 = a * b; \ +} while(0); + +#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64) + +#define BOTAN_WORD_MUL(a,b,z1,z0) do { \ + asm("dmultu %2,%3" : "=h" (z0), "=l" (z1) : "r" (a), "r" (b)); \ +} while(0); + +#else + +// Do a 64x64->128 multiply using four 64x64->64 multiplies +// plus some adds and shifts. Last resort for CPUs like UltraSPARC, +// with 64-bit registers/ALU, but no 64x64->128 multiply. +inline void bigint_2word_mul(word a, word b, word* z1, word* z0) + { + const u32bit MP_HWORD_BITS = MP_WORD_BITS / 2; + const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1; + + const word a_hi = (a >> MP_HWORD_BITS); + const word a_lo = (a & MP_HWORD_MASK); + const word b_hi = (b >> MP_HWORD_BITS); + const word b_lo = (b & MP_HWORD_MASK); + + word x0 = a_hi * b_hi; + word x1 = a_lo * b_hi; + word x2 = a_hi * b_lo; + word x3 = a_lo * b_lo; + + x2 += x3 >> (MP_HWORD_BITS); + x2 += x1; + if(x2 < x1) + x0 += ((word)1 << MP_HWORD_BITS); + + *z0 = x0 + (x2 >> MP_HWORD_BITS); + *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK); + } + +#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0) + +#endif + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + word z0 = 0, z1 = 0; + + BOTAN_WORD_MUL(a, b, z1, z0); + + z1 += *c; if(z1 < *c) z0++; + + *c = z0; + return z1; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + word z0 = 0, z1 = 0; + + BOTAN_WORD_MUL(a, b, z1, z0); + + z1 += c; if(z1 < c) z0++; + z1 += *d; if(z1 < *d) z0++; + + *d = z0; + return z1; + } + +} + +#endif diff --git a/src/math/bigint/mp_comba.cpp b/src/math/bigint/mp_comba.cpp new file mode 100644 index 000000000..c7a9c964c --- /dev/null +++ b/src/math/bigint/mp_comba.cpp @@ -0,0 +1,918 @@ +/************************************************* +* Comba Multiplication and Squaring Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/mp_core.h> +#include <botan/mp_asmi.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Comba 4x4 Squaring * +*************************************************/ +void bigint_comba_sqr4(word z[8], const word x[4]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; + z[7] = w1; + } + +/************************************************* +* Comba 4x4 Multiplication * +*************************************************/ +void bigint_comba_mul4(word z[8], const word x[4], const word y[4]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[1]); + word3_muladd(&w2, &w1, &w0, x[1], y[0]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[2]); + word3_muladd(&w2, &w1, &w0, x[1], y[1]); + word3_muladd(&w2, &w1, &w0, x[2], y[0]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[3]); + word3_muladd(&w2, &w1, &w0, x[1], y[2]); + word3_muladd(&w2, &w1, &w0, x[2], y[1]); + word3_muladd(&w2, &w1, &w0, x[3], y[0]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[1], y[3]); + word3_muladd(&w2, &w1, &w0, x[2], y[2]); + word3_muladd(&w2, &w1, &w0, x[3], y[1]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[2], y[3]); + word3_muladd(&w2, &w1, &w0, x[3], y[2]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[3], y[3]); + z[6] = w0; + z[7] = w1; + } + +/************************************************* +* Comba 6x6 Squaring * +*************************************************/ +void bigint_comba_sqr6(word z[12], const word x[6]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); + word3_muladd(&w2, &w1, &w0, x[4], x[4]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[5], x[5]); + z[10] = w0; + z[11] = w1; + } + +/************************************************* +* Comba 6x6 Multiplication * +*************************************************/ +void bigint_comba_mul6(word z[12], const word x[6], const word y[6]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[1]); + word3_muladd(&w2, &w1, &w0, x[1], y[0]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[2]); + word3_muladd(&w2, &w1, &w0, x[1], y[1]); + word3_muladd(&w2, &w1, &w0, x[2], y[0]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[3]); + word3_muladd(&w2, &w1, &w0, x[1], y[2]); + word3_muladd(&w2, &w1, &w0, x[2], y[1]); + word3_muladd(&w2, &w1, &w0, x[3], y[0]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[4]); + word3_muladd(&w2, &w1, &w0, x[1], y[3]); + word3_muladd(&w2, &w1, &w0, x[2], y[2]); + word3_muladd(&w2, &w1, &w0, x[3], y[1]); + word3_muladd(&w2, &w1, &w0, x[4], y[0]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[5]); + word3_muladd(&w2, &w1, &w0, x[1], y[4]); + word3_muladd(&w2, &w1, &w0, x[2], y[3]); + word3_muladd(&w2, &w1, &w0, x[3], y[2]); + word3_muladd(&w2, &w1, &w0, x[4], y[1]); + word3_muladd(&w2, &w1, &w0, x[5], y[0]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[1], y[5]); + word3_muladd(&w2, &w1, &w0, x[2], y[4]); + word3_muladd(&w2, &w1, &w0, x[3], y[3]); + word3_muladd(&w2, &w1, &w0, x[4], y[2]); + word3_muladd(&w2, &w1, &w0, x[5], y[1]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[2], y[5]); + word3_muladd(&w2, &w1, &w0, x[3], y[4]); + word3_muladd(&w2, &w1, &w0, x[4], y[3]); + word3_muladd(&w2, &w1, &w0, x[5], y[2]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[3], y[5]); + word3_muladd(&w2, &w1, &w0, x[4], y[4]); + word3_muladd(&w2, &w1, &w0, x[5], y[3]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[4], y[5]); + word3_muladd(&w2, &w1, &w0, x[5], y[4]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[5], y[5]); + z[10] = w0; + z[11] = w1; + } + +/************************************************* +* Comba 8x8 Squaring * +*************************************************/ +void bigint_comba_sqr8(word z[16], const word x[8]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); + word3_muladd(&w2, &w1, &w0, x[4], x[4]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[3], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[4], x[6]); + word3_muladd(&w2, &w1, &w0, x[5], x[5]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[4], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[5], x[6]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[5], x[7]); + word3_muladd(&w2, &w1, &w0, x[6], x[6]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[6], x[7]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[7], x[7]); + z[14] = w0; + z[15] = w1; + } + +/************************************************* +* Comba 8x8 Multiplication * +*************************************************/ +void bigint_comba_mul8(word z[16], const word x[8], const word y[8]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[1]); + word3_muladd(&w2, &w1, &w0, x[1], y[0]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[2]); + word3_muladd(&w2, &w1, &w0, x[1], y[1]); + word3_muladd(&w2, &w1, &w0, x[2], y[0]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[3]); + word3_muladd(&w2, &w1, &w0, x[1], y[2]); + word3_muladd(&w2, &w1, &w0, x[2], y[1]); + word3_muladd(&w2, &w1, &w0, x[3], y[0]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[4]); + word3_muladd(&w2, &w1, &w0, x[1], y[3]); + word3_muladd(&w2, &w1, &w0, x[2], y[2]); + word3_muladd(&w2, &w1, &w0, x[3], y[1]); + word3_muladd(&w2, &w1, &w0, x[4], y[0]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[5]); + word3_muladd(&w2, &w1, &w0, x[1], y[4]); + word3_muladd(&w2, &w1, &w0, x[2], y[3]); + word3_muladd(&w2, &w1, &w0, x[3], y[2]); + word3_muladd(&w2, &w1, &w0, x[4], y[1]); + word3_muladd(&w2, &w1, &w0, x[5], y[0]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[6]); + word3_muladd(&w2, &w1, &w0, x[1], y[5]); + word3_muladd(&w2, &w1, &w0, x[2], y[4]); + word3_muladd(&w2, &w1, &w0, x[3], y[3]); + word3_muladd(&w2, &w1, &w0, x[4], y[2]); + word3_muladd(&w2, &w1, &w0, x[5], y[1]); + word3_muladd(&w2, &w1, &w0, x[6], y[0]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], y[7]); + word3_muladd(&w2, &w1, &w0, x[1], y[6]); + word3_muladd(&w2, &w1, &w0, x[2], y[5]); + word3_muladd(&w2, &w1, &w0, x[3], y[4]); + word3_muladd(&w2, &w1, &w0, x[4], y[3]); + word3_muladd(&w2, &w1, &w0, x[5], y[2]); + word3_muladd(&w2, &w1, &w0, x[6], y[1]); + word3_muladd(&w2, &w1, &w0, x[7], y[0]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[1], y[7]); + word3_muladd(&w2, &w1, &w0, x[2], y[6]); + word3_muladd(&w2, &w1, &w0, x[3], y[5]); + word3_muladd(&w2, &w1, &w0, x[4], y[4]); + word3_muladd(&w2, &w1, &w0, x[5], y[3]); + word3_muladd(&w2, &w1, &w0, x[6], y[2]); + word3_muladd(&w2, &w1, &w0, x[7], y[1]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[2], y[7]); + word3_muladd(&w2, &w1, &w0, x[3], y[6]); + word3_muladd(&w2, &w1, &w0, x[4], y[5]); + word3_muladd(&w2, &w1, &w0, x[5], y[4]); + word3_muladd(&w2, &w1, &w0, x[6], y[3]); + word3_muladd(&w2, &w1, &w0, x[7], y[2]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[3], y[7]); + word3_muladd(&w2, &w1, &w0, x[4], y[6]); + word3_muladd(&w2, &w1, &w0, x[5], y[5]); + word3_muladd(&w2, &w1, &w0, x[6], y[4]); + word3_muladd(&w2, &w1, &w0, x[7], y[3]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[4], y[7]); + word3_muladd(&w2, &w1, &w0, x[5], y[6]); + word3_muladd(&w2, &w1, &w0, x[6], y[5]); + word3_muladd(&w2, &w1, &w0, x[7], y[4]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[5], y[7]); + word3_muladd(&w2, &w1, &w0, x[6], y[6]); + word3_muladd(&w2, &w1, &w0, x[7], y[5]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[6], y[7]); + word3_muladd(&w2, &w1, &w0, x[7], y[6]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[7], y[7]); + z[14] = w0; + z[15] = w1; + } + +/************************************************* +* Comba 16x16 Squaring * +*************************************************/ +void bigint_comba_sqr16(word z[32], const word x[16]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 1]); + z[ 1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 1], x[ 1]); + z[ 2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 4]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 2], x[ 2]); + z[ 4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 4]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 3]); + z[ 5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 4]); + z[ 7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 4], x[ 4]); + z[ 8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 5], x[ 5]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 6]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 7]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 7], x[ 7]); + z[14] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 8]); + z[15] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 8], x[ 8]); + z[16] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[ 9]); + z[17] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], x[ 9]); + z[18] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[10]); + z[19] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[11]); + word3_muladd(&w2, &w1, &w0, x[10], x[10]); + z[20] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[11]); + z[21] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[12]); + word3_muladd(&w2, &w1, &w0, x[11], x[11]); + z[22] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[12]); + z[23] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[13]); + word3_muladd(&w2, &w1, &w0, x[12], x[12]); + z[24] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[10], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[12], x[13]); + z[25] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[11], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[12], x[14]); + word3_muladd(&w2, &w1, &w0, x[13], x[13]); + z[26] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[12], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[13], x[14]); + z[27] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[13], x[15]); + word3_muladd(&w2, &w1, &w0, x[14], x[14]); + z[28] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[14], x[15]); + z[29] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], x[15]); + z[30] = w0; + z[31] = w1; + } + +/************************************************* +* Comba 16x16 Multiplication * +*************************************************/ +void bigint_comba_mul16(word z[32], const word x[16], const word y[16]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 0]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 0]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 0]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 0]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 0]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 0]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 0]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 0]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 0]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 0]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 0]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 0]); + z[14] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 0]); + z[15] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 1], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 1]); + z[16] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 2], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 2]); + z[17] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 3], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 3]); + z[18] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 4], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[10]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 4]); + z[19] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 5], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[11]); + word3_muladd(&w2, &w1, &w0, x[10], y[10]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 5]); + z[20] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 6], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[12]); + word3_muladd(&w2, &w1, &w0, x[10], y[11]); + word3_muladd(&w2, &w1, &w0, x[11], y[10]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 6]); + z[21] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 7], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[13]); + word3_muladd(&w2, &w1, &w0, x[10], y[12]); + word3_muladd(&w2, &w1, &w0, x[11], y[11]); + word3_muladd(&w2, &w1, &w0, x[12], y[10]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 7]); + z[22] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 8], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[14]); + word3_muladd(&w2, &w1, &w0, x[10], y[13]); + word3_muladd(&w2, &w1, &w0, x[11], y[12]); + word3_muladd(&w2, &w1, &w0, x[12], y[11]); + word3_muladd(&w2, &w1, &w0, x[13], y[10]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 8]); + z[23] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 9], y[15]); + word3_muladd(&w2, &w1, &w0, x[10], y[14]); + word3_muladd(&w2, &w1, &w0, x[11], y[13]); + word3_muladd(&w2, &w1, &w0, x[12], y[12]); + word3_muladd(&w2, &w1, &w0, x[13], y[11]); + word3_muladd(&w2, &w1, &w0, x[14], y[10]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 9]); + z[24] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[10], y[15]); + word3_muladd(&w2, &w1, &w0, x[11], y[14]); + word3_muladd(&w2, &w1, &w0, x[12], y[13]); + word3_muladd(&w2, &w1, &w0, x[13], y[12]); + word3_muladd(&w2, &w1, &w0, x[14], y[11]); + word3_muladd(&w2, &w1, &w0, x[15], y[10]); + z[25] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[11], y[15]); + word3_muladd(&w2, &w1, &w0, x[12], y[14]); + word3_muladd(&w2, &w1, &w0, x[13], y[13]); + word3_muladd(&w2, &w1, &w0, x[14], y[12]); + word3_muladd(&w2, &w1, &w0, x[15], y[11]); + z[26] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[12], y[15]); + word3_muladd(&w2, &w1, &w0, x[13], y[14]); + word3_muladd(&w2, &w1, &w0, x[14], y[13]); + word3_muladd(&w2, &w1, &w0, x[15], y[12]); + z[27] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[13], y[15]); + word3_muladd(&w2, &w1, &w0, x[14], y[14]); + word3_muladd(&w2, &w1, &w0, x[15], y[13]); + z[28] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[14], y[15]); + word3_muladd(&w2, &w1, &w0, x[15], y[14]); + z[29] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], y[15]); + z[30] = w0; + z[31] = w1; + } + +} + +} diff --git a/src/math/bigint/mp_core.h b/src/math/bigint/mp_core.h new file mode 100644 index 000000000..92949cd83 --- /dev/null +++ b/src/math/bigint/mp_core.h @@ -0,0 +1,96 @@ +/************************************************* +* MPI Algorithms Header File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_MP_CORE_H__ +#define BOTAN_MP_CORE_H__ + +#include <botan/mp_types.h> + +namespace Botan { + +/************************************************* +* The size of the word type, in bits * +*************************************************/ +const u32bit MP_WORD_BITS = BOTAN_MP_WORD_BITS; + +extern "C" { + +/************************************************* +* Addition/Subtraction Operations * +*************************************************/ +void bigint_add2(word[], u32bit, const word[], u32bit); +void bigint_add3(word[], const word[], u32bit, const word[], u32bit); + +word bigint_add2_nc(word[], u32bit, const word[], u32bit); +word bigint_add3_nc(word[], const word[], u32bit, const word[], u32bit); + +void bigint_sub2(word[], u32bit, const word[], u32bit); +void bigint_sub3(word[], const word[], u32bit, const word[], u32bit); + +/************************************************* +* Shift Operations * +*************************************************/ +void bigint_shl1(word[], u32bit, u32bit, u32bit); +void bigint_shl2(word[], const word[], u32bit, u32bit, u32bit); +void bigint_shr1(word[], u32bit, u32bit, u32bit); +void bigint_shr2(word[], const word[], u32bit, u32bit, u32bit); + +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +*************************************************/ +void bigint_simple_mul(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size); +void bigint_simple_sqr(word z[], const word x[], u32bit x_size); + +/************************************************* +* Linear Multiply * +*************************************************/ +void bigint_linmul2(word[], u32bit, word); +void bigint_linmul3(word[], const word[], u32bit, word); +void bigint_linmul_add(word[], u32bit, const word[], u32bit, word); + +/************************************************* +* Montgomery Reduction * +*************************************************/ +void bigint_monty_redc(word[], u32bit, const word[], u32bit, word); + +/************************************************* +* Misc Utility Operations * +*************************************************/ +u32bit bigint_divcore(word, word, word, word, word, word); +s32bit bigint_cmp(const word[], u32bit, const word[], u32bit); +word bigint_divop(word, word, word); +word bigint_modop(word, word, word); +void bigint_wordmul(word, word, word*, word*); + +/************************************************* +* Comba Multiplication / Squaring * +*************************************************/ +void bigint_comba_mul4(word[8], const word[4], const word[4]); +void bigint_comba_mul6(word[12], const word[6], const word[6]); +void bigint_comba_mul8(word[16], const word[8], const word[8]); +void bigint_comba_mul16(word[32], const word[16], const word[16]); + +void bigint_comba_sqr4(word[8], const word[4]); +void bigint_comba_sqr6(word[12], const word[6]); +void bigint_comba_sqr8(word[16], const word[8]); +void bigint_comba_sqr8(word[32], const word[16]); +void bigint_comba_sqr16(word[64], const word[32]); + +} + +/************************************************* +* High Level Multiplication/Squaring Interfaces * +*************************************************/ +void bigint_mul(word[], u32bit, word[], + const word[], u32bit, u32bit, + const word[], u32bit, u32bit); + +void bigint_sqr(word[], u32bit, word[], + const word[], u32bit, u32bit); + +} + +#endif diff --git a/src/math/bigint/mp_generic/info.txt b/src/math/bigint/mp_generic/info.txt new file mode 100644 index 000000000..8bf75fec3 --- /dev/null +++ b/src/math/bigint/mp_generic/info.txt @@ -0,0 +1,8 @@ +realname "MPI Core (C++)" + +load_on dep + +<add> +mp_asm.h +mp_asmi.h +</add> diff --git a/src/math/bigint/mp_generic/mp_asm.h b/src/math/bigint/mp_generic/mp_asm.h new file mode 100644 index 000000000..e62a57110 --- /dev/null +++ b/src/math/bigint/mp_generic/mp_asm.h @@ -0,0 +1,52 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_H__ +#define BOTAN_MP_ASM_H__ + +#include <botan/mp_types.h> + +#if (BOTAN_MP_WORD_BITS == 8) + typedef Botan::u16bit dword; +#elif (BOTAN_MP_WORD_BITS == 16) + typedef Botan::u32bit dword; +#elif (BOTAN_MP_WORD_BITS == 32) + typedef Botan::u64bit dword; +#elif (BOTAN_MP_WORD_BITS == 64) + #error BOTAN_MP_WORD_BITS can be 64 only with assembly support +#else + #error BOTAN_MP_WORD_BITS must be 8, 16, 32, or 64 +#endif + +namespace Botan { + +extern "C" { + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + dword z = (dword)a * b + *c; + *c = (word)(z >> BOTAN_MP_WORD_BITS); + return (word)z; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + dword z = (dword)a * b + c + *d; + *d = (word)(z >> BOTAN_MP_WORD_BITS); + return (word)z; + } + +} + +} + +#endif diff --git a/src/math/bigint/mp_generic/mp_asmi.h b/src/math/bigint/mp_generic/mp_asmi.h new file mode 100644 index 000000000..d15295154 --- /dev/null +++ b/src/math/bigint/mp_generic/mp_asmi.h @@ -0,0 +1,189 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + word z = x + y; + word c1 = (z < x); + z += *carry; + *carry = c1 | (z < *carry); + return z; + } + +/************************************************* +* Eight Word Block Addition, Two Argument * +*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + x[0] = word_add(x[0], y[0], &carry); + x[1] = word_add(x[1], y[1], &carry); + x[2] = word_add(x[2], y[2], &carry); + x[3] = word_add(x[3], y[3], &carry); + x[4] = word_add(x[4], y[4], &carry); + x[5] = word_add(x[5], y[5], &carry); + x[6] = word_add(x[6], y[6], &carry); + x[7] = word_add(x[7], y[7], &carry); + return carry; + } + +/************************************************* +* Eight Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], + const word y[8], word carry) + { + z[0] = word_add(x[0], y[0], &carry); + z[1] = word_add(x[1], y[1], &carry); + z[2] = word_add(x[2], y[2], &carry); + z[3] = word_add(x[3], y[3], &carry); + z[4] = word_add(x[4], y[4], &carry); + z[5] = word_add(x[5], y[5], &carry); + z[6] = word_add(x[6], y[6], &carry); + z[7] = word_add(x[7], y[7], &carry); + return carry; + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + word t0 = x - y; + word c1 = (t0 > x); + word z = t0 - *carry; + *carry = c1 | (z > t0); + return z; + } + +/************************************************* +* Eight Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[4], const word y[4], word carry) + { + x[0] = word_sub(x[0], y[0], &carry); + x[1] = word_sub(x[1], y[1], &carry); + x[2] = word_sub(x[2], y[2], &carry); + x[3] = word_sub(x[3], y[3], &carry); + x[4] = word_sub(x[4], y[4], &carry); + x[5] = word_sub(x[5], y[5], &carry); + x[6] = word_sub(x[6], y[6], &carry); + x[7] = word_sub(x[7], y[7], &carry); + return carry; + } + +/************************************************* +* Eight Word Block Subtraction, Three Argument * +*************************************************/ +inline word word8_sub3(word z[8], const word x[8], + const word y[8], word carry) + { + z[0] = word_sub(x[0], y[0], &carry); + z[1] = word_sub(x[1], y[1], &carry); + z[2] = word_sub(x[2], y[2], &carry); + z[3] = word_sub(x[3], y[3], &carry); + z[4] = word_sub(x[4], y[4], &carry); + z[5] = word_sub(x[5], y[5], &carry); + z[6] = word_sub(x[6], y[6], &carry); + z[7] = word_sub(x[7], y[7], &carry); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[4], word y, word carry) + { + x[0] = word_madd2(x[0], y, &carry); + x[1] = word_madd2(x[1], y, &carry); + x[2] = word_madd2(x[2], y, &carry); + x[3] = word_madd2(x[3], y, &carry); + x[4] = word_madd2(x[4], y, &carry); + x[5] = word_madd2(x[5], y, &carry); + x[6] = word_madd2(x[6], y, &carry); + x[7] = word_madd2(x[7], y, &carry); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + z[0] = word_madd2(x[0], y, &carry); + z[1] = word_madd2(x[1], y, &carry); + z[2] = word_madd2(x[2], y, &carry); + z[3] = word_madd2(x[3], y, &carry); + z[4] = word_madd2(x[4], y, &carry); + z[5] = word_madd2(x[5], y, &carry); + z[6] = word_madd2(x[6], y, &carry); + z[7] = word_madd2(x[7], y, &carry); + return carry; + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + z[0] = word_madd3(x[0], y, z[0], &carry); + z[1] = word_madd3(x[1], y, z[1], &carry); + z[2] = word_madd3(x[2], y, z[2], &carry); + z[3] = word_madd3(x[3], y, z[3], &carry); + z[4] = word_madd3(x[4], y, z[4], &carry); + z[5] = word_madd3(x[5], y, z[5], &carry); + z[6] = word_madd3(x[6], y, z[6], &carry); + z[7] = word_madd3(x[7], y, z[7], &carry); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b) + { + word carry = *w0; + *w0 = word_madd2(a, b, &carry); + *w1 += carry; + *w2 += (*w1 < carry) ? 1 : 0; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b) + { + word carry = 0; + a = word_madd2(a, b, &carry); + b = carry; + + word top = (b >> (BOTAN_MP_WORD_BITS-1)); + b <<= 1; + b |= (a >> (BOTAN_MP_WORD_BITS-1)); + a <<= 1; + + carry = 0; + *w0 = word_add(*w0, a, &carry); + *w1 = word_add(*w1, b, &carry); + *w2 = word_add(*w2, top, &carry); + } + +} + +} + +#endif diff --git a/src/math/bigint/mp_ia32/info.txt b/src/math/bigint/mp_ia32/info.txt new file mode 100644 index 000000000..51f98fda8 --- /dev/null +++ b/src/math/bigint/mp_ia32/info.txt @@ -0,0 +1,19 @@ +realname "MPI Core (IA-32)" + +mp_bits 32 + +load_on asm_ok + +<add> +mp_asm.h +mp_asmi.h +</add> + +<arch> +ia32 +</arch> + +<cc> +gcc +icc +</cc> diff --git a/src/math/bigint/mp_ia32/mp_asm.h b/src/math/bigint/mp_ia32/mp_asm.h new file mode 100644 index 000000000..b45140321 --- /dev/null +++ b/src/math/bigint/mp_ia32/mp_asm.h @@ -0,0 +1,65 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_H__ +#define BOTAN_MP_ASM_H__ + +#include <botan/mp_types.h> + +#if (BOTAN_MP_WORD_BITS != 32) + #error The mp_ia32 module requires that BOTAN_MP_WORD_BITS == 32 +#endif + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for x86 Assembly * +*************************************************/ +#define ASM(x) x "\n\t" + +/************************************************* +* Word Multiply * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + asm( + ASM("mull %[b]") + ASM("addl %[c],%[a]") + ASM("adcl $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c) + : "0"(a), "1"(b), [c]"g"(*c) : "cc"); + + return a; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + asm( + ASM("mull %[b]") + + ASM("addl %[c],%[a]") + ASM("adcl $0,%[carry]") + + ASM("addl %[d],%[a]") + ASM("adcl $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d) + : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc"); + + return a; + } + +} + +} + +#endif diff --git a/src/math/bigint/mp_ia32/mp_asmi.h b/src/math/bigint/mp_ia32/mp_asmi.h new file mode 100644 index 000000000..9de0c11e3 --- /dev/null +++ b/src/math/bigint/mp_ia32/mp_asmi.h @@ -0,0 +1,225 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2007 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for x86 Assembly * +*************************************************/ +#ifndef ASM + #define ASM(x) x "\n\t" +#endif + +#define ADDSUB2_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[y]), %[carry]") \ + ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \ + +#define ADDSUB3_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]), %[carry]") \ + ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \ + ASM("movl %[carry], 4*" #INDEX "(%[z])") \ + +#define LINMUL_OP(WRITE_TO, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("movl %%edx,%[carry]") \ + ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])") + +#define MULADD_OP(IGNORED, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("addl 4*" #INDEX "(%[z]),%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("movl %%edx,%[carry]") \ + ASM("movl %%eax, 4*" #INDEX " (%[z])") + +#define DO_8_TIMES(MACRO, ARG) \ + MACRO(ARG, 0) \ + MACRO(ARG, 1) \ + MACRO(ARG, 2) \ + MACRO(ARG, 3) \ + MACRO(ARG, 4) \ + MACRO(ARG, 5) \ + MACRO(ARG, 6) \ + MACRO(ARG, 7) + +#define ADD_OR_SUBTRACT(CORE_CODE) \ + ASM("rorl %[carry]") \ + CORE_CODE \ + ASM("sbbl %[carry],%[carry]") \ + ASM("negl %[carry]") + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Addition, Two Argument * +*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Subtraction, Three Argument * +*************************************************/ +inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "x") + : [carry]"=r"(carry) + : [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "z") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(MULADD_OP, "") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mull %[y]") + + ASM("addl %[x],%[w0]") + ASM("adcl %[y],%[w1]") + ASM("adcl $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mull %[y]") + + ASM("addl %[x],%[w0]") + ASM("adcl %[y],%[w1]") + ASM("adcl $0,%[w2]") + + ASM("addl %[x],%[w0]") + ASM("adcl %[y],%[w1]") + ASM("adcl $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + +} + +} + +#endif diff --git a/src/math/bigint/mp_ia32_msvc/info.txt b/src/math/bigint/mp_ia32_msvc/info.txt new file mode 100644 index 000000000..9c7ac9b43 --- /dev/null +++ b/src/math/bigint/mp_ia32_msvc/info.txt @@ -0,0 +1,18 @@ +realname "x86 MPI Assembler Core (MSVC)" + +mp_bits 32 + +load_on dep + +<add> +mp_generic:mp_asm.h +mp_asmi.h +</add> + +<arch> +ia32 +</arch> + +<cc> +msvc +</cc> diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h new file mode 100644 index 000000000..5eaa46eb4 --- /dev/null +++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h @@ -0,0 +1,547 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2006 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include "mp_asm.h" + +namespace Botan { + +extern "C" { + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + word z = x + y; + word c1 = (z < x); + z += *carry; + *carry = c1 | (z < *carry); + return z; + } + +/************************************************* +* Four Word Block Addition, Two Argument * +*************************************************/ +inline word word4_addcarry(word x[4], word carry) + { + __asm { + mov edx,[x] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + adc [edx],0 + mov eax,[esi+4] + adc [edx+4],0 + mov eax,[esi+8] + adc [edx+8],0 + mov eax,[esi+12] + adc [edx+12],0 + sbb eax,eax + neg eax + } + } + +/************************************************* +* Four Word Block Addition, Two Argument * +*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + __asm { + mov edx,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[esi] + adc [edx],eax + mov eax,[esi+4] + adc [edx+4],eax + mov eax,[esi+8] + adc [edx+8],eax + mov eax,[esi+12] + adc [edx+12],eax + mov eax,[esi+16] + adc [edx+16],eax + mov eax,[esi+20] + adc [edx+20],eax + mov eax,[esi+24] + adc [edx+24],eax + mov eax,[esi+28] + adc [edx+28],eax + sbb eax,eax + neg eax + } + } + +/************************************************* +* Four Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) + { + __asm { + mov edi,[x] + mov esi,[y] + mov ebx,[z] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + adc eax,[esi] + mov [ebx],eax + + mov eax,[edi+4] + adc eax,[esi+4] + mov [ebx+4],eax + + mov eax,[edi+8] + adc eax,[esi+8] + mov [ebx+8],eax + + mov eax,[edi+12] + adc eax,[esi+12] + mov [ebx+12],eax + + mov eax,[edi+16] + adc eax,[esi+16] + mov [ebx+16],eax + + mov eax,[edi+20] + adc eax,[esi+20] + mov [ebx+20],eax + + mov eax,[edi+24] + adc eax,[esi+24] + mov [ebx+24],eax + + mov eax,[edi+28] + adc eax,[esi+28] + mov [ebx+28],eax + + sbb eax,eax + neg eax + } + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + word t0 = x - y; + word c1 = (t0 > x); + word z = t0 - *carry; + *carry = c1 | (z > t0); + return z; + } + +/************************************************* +* Four Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + _asm { + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + sbb eax,[esi] + mov [edi],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [edi+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [edi+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [edi+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [edi+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [edi+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [edi+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [edi+28],eax + sbb eax,eax + neg eax + } + } + +/************************************************* +* Four Word Block Subtraction, Three Argument * +*************************************************/ +__forceinline word word8_sub3(word z[8], const word x[8], + const word y[8], word carry) + { + __asm { + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov ebx,[z] + mov eax,[edi] + sbb eax,[esi] + mov [ebx],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [ebx+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [ebx+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [ebx+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [ebx+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [ebx+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [ebx+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [ebx+28],eax + sbb eax,eax + neg eax + } + } + +/************************************************* +* Four Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[8], word y, word carry) + { + __asm + { + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [esi+28],eax //load a + + mov eax,edx //store carry + } + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +__forceinline word word8_muladd(word z[8], const word x[8], + word y, word carry) + { + __asm + { + mov esi,[x] + mov ebx,[y] + mov edi,[z] + mov eax,[esi] //load a + mul ebx //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + add eax,[edi] //sum lo z + adc edx,0 //sum hi z + mov ecx,edx //carry for next block = hi z + mov [edi],eax //save lo z + + mov eax,[esi+4] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+4] + adc edx,0 + mov ecx,edx + mov [edi+4],eax + + mov eax,[esi+8] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+8] + adc edx,0 + mov ecx,edx + mov [edi+8],eax + + mov eax,[esi+12] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+12] + adc edx,0 + mov ecx,edx + mov [edi+12],eax + + mov eax,[esi+16] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+16] + adc edx,0 + mov ecx,edx + mov [edi+16],eax + + mov eax,[esi+20] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+20] + adc edx,0 + mov ecx,edx + mov [edi+20],eax + + mov eax,[esi+24] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+24] + adc edx,0 + mov ecx,edx + mov [edi+24],eax + + mov eax,[esi+28] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+28] + adc edx,0 + mov [edi+28],eax + mov eax,edx + } + } + +__forceinline word word8_linmul3(word z[4], const word x[4], word y, word carry) + { + __asm + { +#if 0 + //it's slower!!! + mov edx,[z] + mov eax,[x] + movd mm7,[y] + + movd mm0,[eax] + movd mm1,[eax+4] + movd mm2,[eax+8] + pmuludq mm0,mm7 + pmuludq mm1,mm7 + pmuludq mm2,mm7 + + movd mm6,[carry] + paddq mm0,mm6 + movd [edx],mm0 + + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+4],mm1 + + movd mm3,[eax+12] + psrlq mm1,32 + paddq mm2,mm1 + movd [edx+8],mm2 + + pmuludq mm3,mm7 + movd mm4,[eax+16] + psrlq mm2,32 + paddq mm3,mm2 + movd [edx+12],mm3 + + pmuludq mm4,mm7 + movd mm5,[eax+20] + psrlq mm3,32 + paddq mm4,mm3 + movd [edx+16],mm4 + + pmuludq mm5,mm7 + movd mm0,[eax+24] + psrlq mm4,32 + paddq mm5,mm4 + movd [edx+20],mm5 + + pmuludq mm0,mm7 + movd mm1,[eax+28] + psrlq mm5,32 + paddq mm0,mm5 + movd [edx+24],mm0 + + pmuludq mm1,mm7 + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+28],mm1 + + psrlq mm1,32 + movd eax,mm1 + emms +#else + mov edi,[z] + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [edi+28],eax //load a + mov eax,edx //store carry +#endif + } + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + z[0] = word_madd3(x[0], y, z[0], &carry); + z[1] = word_madd3(x[1], y, z[1], &carry); + z[2] = word_madd3(x[2], y, z[2], &carry); + z[3] = word_madd3(x[3], y, z[3], &carry); + z[4] = word_madd3(x[4], y, z[4], &carry); + z[5] = word_madd3(x[5], y, z[5], &carry); + z[6] = word_madd3(x[6], y, z[6], &carry); + z[7] = word_madd3(x[7], y, z[7], &carry); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b) + { + dword z = (dword)a * b + (*w0); + *w0 = (word)z; //lo + + word t1 = (word)(z >> BOTAN_MP_WORD_BITS); //hi + *w1 += t1; //w1+=lo + *w2 += (*w1 < t1) ? 1 : 0; //w2+=carry + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b) + { + dword z = (dword)a * b; + word t0 = (word)z; + word t1 = (word)(z >> BOTAN_MP_WORD_BITS); + + *w0 += t0; + *w1 += t1 + ((*w0 < t0) ? 1 : 0); + *w2 += (*w1 < t1) ? 1 : 0; + + *w0 += t0; + *w1 += t1 + ((*w0 < t0) ? 1 : 0); + *w2 += (*w1 < t1) ? 1 : 0; + } + +} + +} + +#endif diff --git a/src/math/bigint/mp_karat.cpp b/src/math/bigint/mp_karat.cpp new file mode 100644 index 000000000..15b0551fd --- /dev/null +++ b/src/math/bigint/mp_karat.cpp @@ -0,0 +1,334 @@ +/************************************************* +* Karatsuba Multiplication/Squaring Source File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/mp_core.h> +#include <botan/mem_ops.h> +#include <botan/mp_asmi.h> + +namespace Botan { + +namespace { + +/************************************************* +* Karatsuba Multiplication Operation * +*************************************************/ +void karatsuba_mul(word z[], const word x[], const word y[], u32bit N, + word workspace[]) + { + if(N == 6) + bigint_comba_mul6(z, x, y); + else if(N == 8) + bigint_comba_mul8(z, x, y); + else if(N == 16) + bigint_comba_mul16(z, x, y); + else if(N < BOTAN_KARAT_MUL_THRESHOLD || N % 2) + bigint_simple_mul(z, x, N, y, N); + else + { + const u32bit N2 = N / 2; + + const word* x0 = x; + const word* x1 = x + N2; + const word* y0 = y; + const word* y1 = y + N2; + word* z0 = z; + word* z1 = z + N; + + const s32bit cmp0 = bigint_cmp(x0, N2, x1, N2); + const s32bit cmp1 = bigint_cmp(y1, N2, y0, N2); + + clear_mem(workspace, 2*N); + + if(cmp0 && cmp1) + { + if(cmp0 > 0) + bigint_sub3(z0, x0, N2, x1, N2); + else + bigint_sub3(z0, x1, N2, x0, N2); + + if(cmp1 > 0) + bigint_sub3(z1, y1, N2, y0, N2); + else + bigint_sub3(z1, y0, N2, y1, N2); + + karatsuba_mul(workspace, z0, z1, N2, workspace+N); + } + + karatsuba_mul(z0, x0, y0, N2, workspace+N); + karatsuba_mul(z1, x1, y1, N2, workspace+N); + + const u32bit blocks_of_8 = N - (N % 8); + + word carry = 0; + + for(u32bit j = 0; j != blocks_of_8; j += 8) + carry = word8_add3(workspace + N + j, z0 + j, z1 + j, carry); + + for(u32bit j = blocks_of_8; j != N; ++j) + workspace[N + j] = word_add(z0[j], z1[j], &carry); + + word carry2 = 0; + + for(u32bit j = 0; j != blocks_of_8; j += 8) + carry2 = word8_add2(z + N2 + j, workspace + N + j, carry2); + + for(u32bit j = blocks_of_8; j != N; ++j) + z[N2 + j] = word_add(z[N2 + j], workspace[N + j], &carry2); + + z[N + N2] = word_add(z[N + N2], carry2, &carry); + + if(carry) + for(u32bit j = 1; j != N2; ++j) + if(++z[N + N2 + j]) + break; + + if((cmp0 == cmp1) || (cmp0 == 0) || (cmp1 == 0)) + bigint_add2(z + N2, 2*N-N2, workspace, N); + else + bigint_sub2(z + N2, 2*N-N2, workspace, N); + } + } + +/************************************************* +* Karatsuba Squaring Operation * +*************************************************/ +void karatsuba_sqr(word z[], const word x[], u32bit N, word workspace[]) + { + if(N == 6) + bigint_comba_sqr6(z, x); + else if(N == 8) + bigint_comba_sqr8(z, x); + else if(N == 16) + bigint_comba_sqr16(z, x); + else if(N < BOTAN_KARAT_SQR_THRESHOLD || N % 2) + bigint_simple_sqr(z, x, N); + else + { + const u32bit N2 = N / 2; + + const word* x0 = x; + const word* x1 = x + N2; + word* z0 = z; + word* z1 = z + N; + + const s32bit cmp = bigint_cmp(x0, N2, x1, N2); + + clear_mem(workspace, 2*N); + + if(cmp) + { + if(cmp > 0) + bigint_sub3(z0, x0, N2, x1, N2); + else + bigint_sub3(z0, x1, N2, x0, N2); + + karatsuba_sqr(workspace, z0, N2, workspace+N); + } + + karatsuba_sqr(z0, x0, N2, workspace+N); + karatsuba_sqr(z1, x1, N2, workspace+N); + + const u32bit blocks_of_8 = N - (N % 8); + + word carry = 0; + + for(u32bit j = 0; j != blocks_of_8; j += 8) + carry = word8_add3(workspace + N + j, z0 + j, z1 + j, carry); + + for(u32bit j = blocks_of_8; j != N; ++j) + workspace[N + j] = word_add(z0[j], z1[j], &carry); + + word carry2 = 0; + + for(u32bit j = 0; j != blocks_of_8; j += 8) + carry2 = word8_add2(z + N2 + j, workspace + N + j, carry2); + + for(u32bit j = blocks_of_8; j != N; ++j) + z[N2 + j] = word_add(z[N2 + j], workspace[N + j], &carry2); + + z[N + N2] = word_add(z[N + N2], carry2, &carry); + + if(carry) + for(u32bit j = 1; j != N2; ++j) + if(++z[N + N2 + j]) + break; + + if(cmp == 0) + bigint_add2(z + N2, 2*N-N2, workspace, N); + else + bigint_sub2(z + N2, 2*N-N2, workspace, N); + } + } + +/************************************************* +* Pick a good size for the Karatsuba multiply * +*************************************************/ +u32bit karatsuba_size(u32bit z_size, + u32bit x_size, u32bit x_sw, + u32bit y_size, u32bit y_sw) + { + if(x_sw > x_size || x_sw > y_size || y_sw > x_size || y_sw > y_size) + return 0; + + if(((x_size == x_sw) && (x_size % 2)) || + ((y_size == y_sw) && (y_size % 2))) + return 0; + + const u32bit start = (x_sw > y_sw) ? x_sw : y_sw; + const u32bit end = (x_size < y_size) ? x_size : y_size; + + if(start == end) + { + if(start % 2) + return 0; + return start; + } + + for(u32bit j = start; j <= end; ++j) + { + if(j % 2) + continue; + + if(2*j > z_size) + return 0; + + if(x_sw <= j && j <= x_size && y_sw <= j && j <= y_size) + { + if(j % 4 == 2 && + (j+2) <= x_size && (j+2) <= y_size && 2*(j+2) <= z_size) + return j+2; + return j; + } + } + + return 0; + } + +/************************************************* +* Pick a good size for the Karatsuba squaring * +*************************************************/ +u32bit karatsuba_size(u32bit z_size, u32bit x_size, u32bit x_sw) + { + if(x_sw == x_size) + { + if(x_sw % 2) + return 0; + return x_sw; + } + + for(u32bit j = x_sw; j <= x_size; ++j) + { + if(j % 2) + continue; + + if(2*j > z_size) + return 0; + + if(j % 4 == 2 && (j+2) <= x_size && 2*(j+2) <= z_size) + return j+2; + return j; + } + + return 0; + } + +} + +/************************************************* +* Multiplication Algorithm Dispatcher * +*************************************************/ +void bigint_mul(word z[], u32bit z_size, word workspace[], + const word x[], u32bit x_size, u32bit x_sw, + const word y[], u32bit y_size, u32bit y_sw) + { + if(x_sw == 1) + { + bigint_linmul3(z, y, y_sw, x[0]); + } + else if(y_sw == 1) + { + bigint_linmul3(z, x, x_sw, y[0]); + } + else if(x_sw <= 4 && x_size >= 4 && + y_sw <= 4 && y_size >= 4 && z_size >= 8) + { + bigint_comba_mul4(z, x, y); + } + else if(x_sw <= 6 && x_size >= 6 && + y_sw <= 6 && y_size >= 6 && z_size >= 12) + { + bigint_comba_mul6(z, x, y); + } + else if(x_sw <= 8 && x_size >= 8 && + y_sw <= 8 && y_size >= 8 && z_size >= 16) + { + bigint_comba_mul8(z, x, y); + } + else if(x_sw <= 16 && x_size >= 16 && + y_sw <= 16 && y_size >= 16 && z_size >= 32) + { + bigint_comba_mul16(z, x, y); + } + else if(x_sw < BOTAN_KARAT_MUL_THRESHOLD || y_sw < BOTAN_KARAT_MUL_THRESHOLD) + bigint_simple_mul(z, x, x_sw, y, y_sw); + else + { + const u32bit N = karatsuba_size(z_size, x_size, x_sw, y_size, y_sw); + + if(N) + { + clear_mem(workspace, 2*N); + karatsuba_mul(z, x, y, N, workspace); + } + else + bigint_simple_mul(z, x, x_sw, y, y_sw); + } + } + +/************************************************* +* Squaring Algorithm Dispatcher * +*************************************************/ +void bigint_sqr(word z[], u32bit z_size, word workspace[], + const word x[], u32bit x_size, u32bit x_sw) + { + if(x_sw == 1) + { + bigint_linmul3(z, x, x_sw, x[0]); + } + else if(x_sw <= 4 && x_size >= 4 && z_size >= 8) + { + bigint_comba_sqr4(z, x); + } + else if(x_sw <= 6 && x_size >= 6 && z_size >= 12) + { + bigint_comba_sqr6(z, x); + } + else if(x_sw <= 8 && x_size >= 8 && z_size >= 16) + { + bigint_comba_sqr8(z, x); + } + else if(x_sw <= 16 && x_size >= 16 && z_size >= 32) + { + bigint_comba_sqr16(z, x); + } + else if(x_size < BOTAN_KARAT_SQR_THRESHOLD) + { + bigint_simple_sqr(z, x, x_sw); + } + else + { + const u32bit N = karatsuba_size(z_size, x_size, x_sw); + + if(N) + { + clear_mem(workspace, 2*N); + karatsuba_sqr(z, x, N, workspace); + } + else + bigint_simple_sqr(z, x, x_sw); + } + } + +} diff --git a/src/math/bigint/mp_misc.cpp b/src/math/bigint/mp_misc.cpp new file mode 100644 index 000000000..db9c8cda0 --- /dev/null +++ b/src/math/bigint/mp_misc.cpp @@ -0,0 +1,92 @@ +/************************************************* +* MP Misc Functions Source File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/mp_core.h> +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Core Division Operation * +*************************************************/ +u32bit bigint_divcore(word q, word y1, word y2, + word x1, word x2, word x3) + { + word y0 = 0; + y2 = word_madd2(q, y2, &y0); + y1 = word_madd2(q, y1, &y0); + + if(y0 > x1) return 1; + if(y0 < x1) return 0; + if(y1 > x2) return 1; + if(y1 < x2) return 0; + if(y2 > x3) return 1; + if(y2 < x3) return 0; + return 0; + } + +/************************************************* +* Compare two MP integers * +*************************************************/ +s32bit bigint_cmp(const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + if(x_size < y_size) { return (-bigint_cmp(y, y_size, x, x_size)); } + + while(x_size > y_size) + { + if(x[x_size-1]) + return 1; + x_size--; + } + for(u32bit j = x_size; j > 0; --j) + { + if(x[j-1] > y[j-1]) return 1; + if(x[j-1] < y[j-1]) return -1; + } + return 0; + } + +/************************************************* +* Do a 2-word/1-word Division * +*************************************************/ +word bigint_divop(word n1, word n0, word d) + { + word high = n1 % d, quotient = 0; + + for(u32bit j = 0; j != MP_WORD_BITS; ++j) + { + word high_top_bit = (high & MP_WORD_TOP_BIT); + + high <<= 1; + high |= (n0 >> (MP_WORD_BITS-1-j)) & 1; + quotient <<= 1; + + if(high_top_bit || high >= d) + { + high -= d; + quotient |= 1; + } + } + + return quotient; + } + +/************************************************* +* Do a 2-word/1-word Modulo * +*************************************************/ +word bigint_modop(word n1, word n0, word d) + { + word z = bigint_divop(n1, n0, d); + word dummy = 0; + z = word_madd2(z, d, &dummy); + return (n0-z); + } + +} + +} diff --git a/src/math/bigint/mp_shift.cpp b/src/math/bigint/mp_shift.cpp new file mode 100644 index 000000000..033774e46 --- /dev/null +++ b/src/math/bigint/mp_shift.cpp @@ -0,0 +1,136 @@ +/************************************************* +* MP Shift Algorithms Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/mp_core.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Single Operand Left Shift * +*************************************************/ +void bigint_shl1(word x[], u32bit x_size, u32bit word_shift, u32bit bit_shift) + { + if(word_shift) + { + for(u32bit j = 1; j != x_size + 1; ++j) + x[(x_size - j) + word_shift] = x[x_size - j]; + clear_mem(x, word_shift); + } + + if(bit_shift) + { + word carry = 0; + for(u32bit j = word_shift; j != x_size + word_shift + 1; ++j) + { + word temp = x[j]; + x[j] = (temp << bit_shift) | carry; + carry = (temp >> (MP_WORD_BITS - bit_shift)); + } + } + } + +/************************************************* +* Single Operand Right Shift * +*************************************************/ +void bigint_shr1(word x[], u32bit x_size, u32bit word_shift, u32bit bit_shift) + { + if(x_size < word_shift) + { + clear_mem(x, x_size); + return; + } + + if(word_shift) + { + copy_mem(x, x + word_shift, x_size - word_shift); + clear_mem(x + x_size - word_shift, word_shift); + } + + if(bit_shift) + { + word carry = 0; + + u32bit top = x_size - word_shift; + + while(top >= 4) + { + word w = x[top-1]; + x[top-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = x[top-2]; + x[top-2] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = x[top-3]; + x[top-3] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + w = x[top-4]; + x[top-4] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + top -= 4; + } + + while(top) + { + word w = x[top-1]; + x[top-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + + top--; + } + } + } + +/************************************************* +* Two Operand Left Shift * +*************************************************/ +void bigint_shl2(word y[], const word x[], u32bit x_size, + u32bit word_shift, u32bit bit_shift) + { + for(u32bit j = 0; j != x_size; ++j) + y[j + word_shift] = x[j]; + if(bit_shift) + { + word carry = 0; + for(u32bit j = word_shift; j != x_size + word_shift + 1; ++j) + { + word w = y[j]; + y[j] = (w << bit_shift) | carry; + carry = (w >> (MP_WORD_BITS - bit_shift)); + } + } + } + +/************************************************* +* Two Operand Right Shift * +*************************************************/ +void bigint_shr2(word y[], const word x[], u32bit x_size, + u32bit word_shift, u32bit bit_shift) + { + if(x_size < word_shift) return; + + for(u32bit j = 0; j != x_size - word_shift; ++j) + y[j] = x[j + word_shift]; + if(bit_shift) + { + word carry = 0; + for(u32bit j = x_size - word_shift; j > 0; --j) + { + word w = y[j-1]; + y[j-1] = (w >> bit_shift) | carry; + carry = (w << (MP_WORD_BITS - bit_shift)); + } + } + } + +} + +} diff --git a/src/math/bigint/mp_types.h b/src/math/bigint/mp_types.h new file mode 100644 index 000000000..81b6d7395 --- /dev/null +++ b/src/math/bigint/mp_types.h @@ -0,0 +1,31 @@ +/************************************************* +* Low Level MPI Types Header File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_MPI_TYPES_H__ +#define BOTAN_MPI_TYPES_H__ + +#include <botan/types.h> + +namespace Botan { + +#if (BOTAN_MP_WORD_BITS == 8) + typedef byte word; +#elif (BOTAN_MP_WORD_BITS == 16) + typedef u16bit word; +#elif (BOTAN_MP_WORD_BITS == 32) + typedef u32bit word; +#elif (BOTAN_MP_WORD_BITS == 64) + typedef u64bit word; +#else + #error BOTAN_MP_WORD_BITS must be 8, 16, 32, or 64 +#endif + +const word MP_WORD_MASK = ~static_cast<word>(0); +const word MP_WORD_TOP_BIT = static_cast<word>(1) << (8*sizeof(word) - 1); +const word MP_WORD_MAX = MP_WORD_MASK; + +} + +#endif diff --git a/src/math/bigint/mulop_amd64/info.txt b/src/math/bigint/mulop_amd64/info.txt new file mode 100644 index 000000000..670780d9c --- /dev/null +++ b/src/math/bigint/mulop_amd64/info.txt @@ -0,0 +1,31 @@ +realname "BigInt Multiply-Add (x86-64)" + +mp_bits 64 + +load_on dep + +<add> +mp_mulop_amd64.S +</add> + +<requires> +asm_amd64 +</requires> + +<arch> +amd64 +</arch> + +<cc> +gcc +icc +</cc> + +# ELF systems +<os> +linux +freebsd +netbsd +openbsd +solaris +</os> diff --git a/src/math/bigint/mulop_amd64/mp_mulop.cpp b/src/math/bigint/mulop_amd64/mp_mulop.cpp new file mode 100644 index 000000000..d1aa51489 --- /dev/null +++ b/src/math/bigint/mulop_amd64/mp_mulop.cpp @@ -0,0 +1,94 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/mp_asm.h> +#include <botan/mp_asmi.h> +#include <botan/mp_core.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Simple O(N^2) Multiplication * +*************************************************/ +void bigint_simple_mul(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + const u32bit blocks = x_size - (x_size % 8); + + clear_mem(z, x_size + y_size); + + for(u32bit i = 0; i != y_size; ++i) + { + word carry = 0; + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_madd3(z + i + j, x + j, y[i], carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +inline word word_sqr(word x, + +/************************************************* +* Simple O(N^2) Squaring + +This is exactly the same algorithm as bigint_simple_mul, +however because C/C++ compilers suck at alias analysis it +is good to have the version where the compiler knows +that x == y +*************************************************/ +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + const word x_i = x[i]; + + word carry = z[2*i]; + z[2*i] = word_madd2(x_i, x_i, z[2*i], &carry); + + for(u32bit j = i; j != x_size; ++j) + { + // z[i+j] = z[i+j] + 2 * x[j] * x_i + carry; + + /* + load z[i+j] into register + load x[j] into %hi + mulq %[x_i] -> x[i] * x[j] -> %lo:%hi + shlq %lo, $1 + + // put carry bit (cf) from %lo into %temp + xorl %temp + adcq $0, %temp + + // high bit of lo now in cf + shl %hi, $1 + // add in lowest bid from %lo + orl %temp, %hi + + addq %[c], %[lo] + adcq $0, %[hi] + addq %[z_ij], %[lo] + adcq $0, %[hi] + + */ + + } + + z[x_size+i] = carry; + } + } + +} + +} diff --git a/src/math/bigint/mulop_amd64/mp_mulop_amd64.S b/src/math/bigint/mulop_amd64/mp_mulop_amd64.S new file mode 100644 index 000000000..e5bba23fb --- /dev/null +++ b/src/math/bigint/mulop_amd64/mp_mulop_amd64.S @@ -0,0 +1,128 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_mulop.S) + +#if 0 +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + const u32bit blocks = x_size - (x_size % 8); + + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + word carry = 0; + + /* + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_madd3(z + i + j, x + j, x[i], carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + */ + + + for(u32bit j = 0; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + for(u32bit j = 0; j != x_size; ++j) + { + dword z = (dword)a * b + c + *d; + *d = (word)(z >> BOTAN_MP_WORD_BITS); + return (word)z; + } + + + + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + } + + + + z[x_size+i] = carry; + } + } + +#endif + +START_FUNCTION(bigint_simple_sqr) + +#define Z_ARR ARG_1 +#define X_ARR ARG_2 +//#define X_SIZE ARG_3_32 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define LOOP_I TEMP_3 +#define LOOP_J TEMP_4 +#define X_SIZE TEMP_5 +#define MUL_LO %rax +// arg 3, xsize +#define MUL_HI %rdx + +// need arg3 == rdx for multiply + ASSIGN(X_SIZE, ARG3_32) + + ZEROIZE(CARRY) + + ZEROIZE(LOOP_I) + +.LOOP_ZEROIZE_Z: + + cmp LOOP_I, X_SIZE + + + + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 + +.L_MULADD_DONE: + RETURN_VALUE_IS(CARRY) +END_FUNCTION(bigint_simple_square) diff --git a/src/math/bigint/mulop_generic/info.txt b/src/math/bigint/mulop_generic/info.txt new file mode 100644 index 000000000..28ebe41eb --- /dev/null +++ b/src/math/bigint/mulop_generic/info.txt @@ -0,0 +1,7 @@ +realname "BigInt Multiply-Add" + +load_on dep + +<add> +mp_mulop.cpp +</add> diff --git a/src/math/bigint/mulop_generic/mp_mulop.cpp b/src/math/bigint/mulop_generic/mp_mulop.cpp new file mode 100644 index 000000000..3ab28d306 --- /dev/null +++ b/src/math/bigint/mulop_generic/mp_mulop.cpp @@ -0,0 +1,72 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/mp_asm.h> +#include <botan/mp_asmi.h> +#include <botan/mp_core.h> +#include <botan/mem_ops.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Simple O(N^2) Multiplication * +*************************************************/ +void bigint_simple_mul(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + const u32bit x_size_8 = x_size - (x_size % 8); + + clear_mem(z, x_size + y_size); + + for(u32bit i = 0; i != y_size; ++i) + { + const word y_i = y[i]; + + word carry = 0; + + for(u32bit j = 0; j != x_size_8; j += 8) + carry = word8_madd3(z + i + j, x + j, y_i, carry); + + for(u32bit j = x_size_8; j != x_size; ++j) + z[i+j] = word_madd3(x[j], y_i, z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +/************************************************* +* Simple O(N^2) Squaring + +This is exactly the same algorithm as bigint_simple_mul, +however because C/C++ compilers suck at alias analysis it +is good to have the version where the compiler knows +that x == y +*************************************************/ +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + const u32bit x_size_8 = x_size - (x_size % 8); + + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + const word x_i = x[i]; + word carry = 0; + + for(u32bit j = 0; j != x_size_8; j += 8) + carry = word8_madd3(z + i + j, x + j, x_i, carry); + + for(u32bit j = x_size_8; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x_i, z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +} + +} diff --git a/src/math/bigint/mulop_ia32/info.txt b/src/math/bigint/mulop_ia32/info.txt new file mode 100644 index 000000000..1c89e95c1 --- /dev/null +++ b/src/math/bigint/mulop_ia32/info.txt @@ -0,0 +1,33 @@ +realname "BigInt Multiply-Add (IA-32)" + +mp_bits 32 + +# Out of date, still implements bigint_mul_add_words + +load_on request + +<add> +mp_mulop.S +</add> + +<requires> +asm_ia32 +</requires> + +<arch> +ia32 +</arch> + +<cc> +gcc +icc +</cc> + +# ELF systems +<os> +linux +freebsd +netbsd +openbsd +solaris +</os> diff --git a/src/math/bigint/mulop_ia32/mp_mulop.S b/src/math/bigint/mulop_ia32/mp_mulop.S new file mode 100644 index 000000000..a5f0d3b27 --- /dev/null +++ b/src/math/bigint/mulop_ia32/mp_mulop.S @@ -0,0 +1,62 @@ +/************************************************* +* Multiply/Add Algorithm Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_muladd.S) + +START_FUNCTION(bigint_mul_add_words) + SPILL_REGS() +#define PUSHED 4 + +#define LOOP_CTR ESI + ASSIGN(LOOP_CTR, ARG(3)) /* x_size */ + ZEROIZE(EDI) + + ASSIGN(ECX, ARG(1)) /* z[] */ + ASSIGN(EBX, ARG(2)) /* x[] */ + ASSIGN(EBP, ARG(4)) /* y */ + +#define MULADD_OP(N) \ + ASSIGN(EAX, ARRAY4(EBX, N)) ; \ + MUL(EBP) ; \ + ADD_W_CARRY(EAX, EDX, EDI) ; \ + ASSIGN(EDI, EDX) ; \ + ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ; + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP) + +START_LOOP(.MULADD8) + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(EBX, 32) + ADD_IMM(ECX, 32) +LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8) + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + +START_LOOP(.MULADD1) + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(EBX, 4) + ADD_IMM(ECX, 4) +LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1) + +.MUL_ADD_DONE: + + ASSIGN(EAX, EDI) +#undef PUSHED + RESTORE_REGS() +END_FUNCTION(bigint_mul_add_words) |