aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlloyd <[email protected]>2010-03-16 00:21:30 +0000
committerlloyd <[email protected]>2010-03-16 00:21:30 +0000
commit4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch)
tree3f7d40f046f6c3fb12f3a23be909916a099344a3
parentb96139b84113b583a453c820127c4765c6f0f66f (diff)
Add a special handler for the case of doing a subtraction as in:
x -= y; where abs(x) < abs(y). This change alone increases ECDSA performance by 5 to 15%
-rw-r--r--src/math/bigint/big_ops2.cpp6
-rw-r--r--src/math/bigint/mp_amd64/mp_asmi.h15
-rw-r--r--src/math/bigint/mp_asm.cpp20
-rw-r--r--src/math/bigint/mp_core.h9
-rw-r--r--src/math/bigint/mp_generic/mp_asmi.h22
-rw-r--r--src/math/bigint/mp_ia32/mp_asmi.h15
-rw-r--r--src/math/bigint/mp_ia32_msvc/mp_asmi.h774
7 files changed, 470 insertions, 391 deletions
diff --git a/src/math/bigint/big_ops2.cpp b/src/math/bigint/big_ops2.cpp
index 1137fe4b2..cc50c26e5 100644
--- a/src/math/bigint/big_ops2.cpp
+++ b/src/math/bigint/big_ops2.cpp
@@ -62,11 +62,7 @@ BigInt& BigInt::operator-=(const BigInt& y)
if(relative_size < 0)
{
if(sign() == y.sign())
- {
- SecureVector<word> z(reg_size - 1);
- bigint_sub3(z, y.data(), reg_size - 1, data(), x_sw);
- copy_mem(get_reg().begin(), z.begin(), z.size());
- }
+ bigint_sub2_rev(get_reg(), y.data(), y_sw);
else
bigint_add2(get_reg(), reg_size - 1, y.data(), y_sw);
diff --git a/src/math/bigint/mp_amd64/mp_asmi.h b/src/math/bigint/mp_amd64/mp_asmi.h
index d8f681d77..adf7774ef 100644
--- a/src/math/bigint/mp_amd64/mp_asmi.h
+++ b/src/math/bigint/mp_amd64/mp_asmi.h
@@ -1,6 +1,6 @@
/*
* Lowest Level MPI Algorithms
-* (C) 1999-2007 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
* 2006 Luca Piccarreta
*
* Distributed under the terms of the Botan license
@@ -131,6 +131,19 @@ inline word word8_sub2(word x[8], const word y[8], word carry)
}
/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*
* Eight Word Block Subtraction, Three Argument
*/
inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
diff --git a/src/math/bigint/mp_asm.cpp b/src/math/bigint/mp_asm.cpp
index 9827bff06..69a5e6469 100644
--- a/src/math/bigint/mp_asm.cpp
+++ b/src/math/bigint/mp_asm.cpp
@@ -9,6 +9,7 @@
#include <botan/internal/mp_asm.h>
#include <botan/internal/mp_asmi.h>
#include <botan/internal/mp_core.h>
+#include <botan/exceptn.h>
#include <botan/mem_ops.h>
namespace Botan {
@@ -114,6 +115,25 @@ void bigint_sub2(word x[], u32bit x_size, const word y[], u32bit y_size)
}
/*
+* Two Operand Subtraction x = y - x
+*/
+void bigint_sub2_rev(word x[], const word y[], u32bit y_size)
+ {
+ word carry = 0;
+
+ const u32bit blocks = y_size - (y_size % 8);
+
+ for(u32bit j = 0; j != blocks; j += 8)
+ carry = word8_sub2_rev(x + j, y + j, carry);
+
+ for(u32bit j = blocks; j != y_size; ++j)
+ x[j] = word_sub(y[j], x[j], &carry);
+
+ if(carry)
+ throw Internal_Error("bigint_sub2_rev: x >= y");
+ }
+
+/*
* Three Operand Subtraction
*/
void bigint_sub3(word z[], const word x[], u32bit x_size,
diff --git a/src/math/bigint/mp_core.h b/src/math/bigint/mp_core.h
index 7d601d820..bbe6b1310 100644
--- a/src/math/bigint/mp_core.h
+++ b/src/math/bigint/mp_core.h
@@ -35,7 +35,14 @@ word bigint_add3_nc(word z[],
const word x[], u32bit x_size,
const word y[], u32bit y_size);
-void bigint_sub2(word x[], u32bit x_size, const word y[], u32bit y_size);
+void bigint_sub2(word x[], u32bit x_size,
+ const word y[], u32bit y_size);
+
+/**
+* x = y - x; assumes y >= x
+*/
+void bigint_sub2_rev(word x[], const word y[], u32bit y_size);
+
void bigint_sub3(word z[],
const word x[], u32bit x_size,
const word y[], u32bit y_size);
diff --git a/src/math/bigint/mp_generic/mp_asmi.h b/src/math/bigint/mp_generic/mp_asmi.h
index 9913c6ba9..8225f372d 100644
--- a/src/math/bigint/mp_generic/mp_asmi.h
+++ b/src/math/bigint/mp_generic/mp_asmi.h
@@ -1,6 +1,6 @@
/*
* Lowest Level MPI Algorithms
-* (C) 1999-2008 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
* 2006 Luca Piccarreta
*
* Distributed under the terms of the Botan license
@@ -75,7 +75,7 @@ inline word word_sub(word x, word y, word* carry)
/*
* Eight Word Block Subtraction, Two Argument
*/
-inline word word8_sub2(word x[4], const word y[4], word carry)
+inline word word8_sub2(word x[8], const word y[8], word carry)
{
x[0] = word_sub(x[0], y[0], &carry);
x[1] = word_sub(x[1], y[1], &carry);
@@ -89,6 +89,22 @@ inline word word8_sub2(word x[4], const word y[4], word carry)
}
/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+ x[0] = word_sub(y[0], x[0], &carry);
+ x[1] = word_sub(y[1], x[1], &carry);
+ x[2] = word_sub(y[2], x[2], &carry);
+ x[3] = word_sub(y[3], x[3], &carry);
+ x[4] = word_sub(y[4], x[4], &carry);
+ x[5] = word_sub(y[5], x[5], &carry);
+ x[6] = word_sub(y[6], x[6], &carry);
+ x[7] = word_sub(y[7], x[7], &carry);
+ return carry;
+ }
+
+/*
* Eight Word Block Subtraction, Three Argument
*/
inline word word8_sub3(word z[8], const word x[8],
@@ -108,7 +124,7 @@ inline word word8_sub3(word z[8], const word x[8],
/*
* Eight Word Block Linear Multiplication
*/
-inline word word8_linmul2(word x[4], word y, word carry)
+inline word word8_linmul2(word x[8], word y, word carry)
{
x[0] = word_madd2(x[0], y, &carry);
x[1] = word_madd2(x[1], y, &carry);
diff --git a/src/math/bigint/mp_ia32/mp_asmi.h b/src/math/bigint/mp_ia32/mp_asmi.h
index 0b8708e53..c7b679e80 100644
--- a/src/math/bigint/mp_ia32/mp_asmi.h
+++ b/src/math/bigint/mp_ia32/mp_asmi.h
@@ -1,6 +1,6 @@
/*
* Lowest Level MPI Algorithms
-* (C) 1999-2007 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
* 2006 Luca Piccarreta
*
* Distributed under the terms of the Botan license
@@ -131,6 +131,19 @@ inline word word8_sub2(word x[8], const word y[8], word carry)
}
/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*
* Eight Word Block Subtraction, Three Argument
*/
inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
index 4c3027fde..aee457d65 100644
--- a/src/math/bigint/mp_ia32_msvc/mp_asmi.h
+++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
@@ -1,6 +1,6 @@
/*
* Lowest Level MPI Algorithms
-* (C) 1999-2006 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
* 2006 Luca Piccarreta
*
* Distributed under the terms of the Botan license
@@ -33,28 +33,28 @@ inline word word_add(word x, word y, word* carry)
inline word word8_add2(word x[8], const word y[8], word carry)
{
__asm {
- mov edx,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[esi]
- adc [edx],eax
- mov eax,[esi+4]
- adc [edx+4],eax
- mov eax,[esi+8]
- adc [edx+8],eax
- mov eax,[esi+12]
- adc [edx+12],eax
- mov eax,[esi+16]
- adc [edx+16],eax
- mov eax,[esi+20]
- adc [edx+20],eax
- mov eax,[esi+24]
- adc [edx+24],eax
- mov eax,[esi+28]
- adc [edx+28],eax
- sbb eax,eax
- neg eax
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
}
}
@@ -64,46 +64,46 @@ inline word word8_add2(word x[8], const word y[8], word carry)
inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
{
__asm {
- mov edi,[x]
- mov esi,[y]
- mov ebx,[z]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
- adc eax,[esi]
- mov [ebx],eax
-
- mov eax,[edi+4]
- adc eax,[esi+4]
- mov [ebx+4],eax
-
- mov eax,[edi+8]
- adc eax,[esi+8]
- mov [ebx+8],eax
-
- mov eax,[edi+12]
- adc eax,[esi+12]
- mov [ebx+12],eax
-
- mov eax,[edi+16]
- adc eax,[esi+16]
- mov [ebx+16],eax
-
- mov eax,[edi+20]
- adc eax,[esi+20]
- mov [ebx+20],eax
-
- mov eax,[edi+24]
- adc eax,[esi+24]
- mov [ebx+24],eax
-
- mov eax,[edi+28]
- adc eax,[esi+28]
- mov [ebx+28],eax
-
- sbb eax,eax
- neg eax
- }
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
}
/*
@@ -123,40 +123,57 @@ inline word word_sub(word x, word y, word* carry)
*/
inline word word8_sub2(word x[8], const word y[8], word carry)
{
- _asm {
- mov edi,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
- sbb eax,[esi]
- mov [edi],eax
- mov eax,[edi+4]
- sbb eax,[esi+4]
- mov [edi+4],eax
- mov eax,[edi+8]
- sbb eax,[esi+8]
- mov [edi+8],eax
- mov eax,[edi+12]
- sbb eax,[esi+12]
- mov [edi+12],eax
- mov eax,[edi+16]
- sbb eax,[esi+16]
- mov [edi+16],eax
- mov eax,[edi+20]
- sbb eax,[esi+20]
- mov [edi+20],eax
- mov eax,[edi+24]
- sbb eax,[esi+24]
- mov [edi+24],eax
- mov eax,[edi+28]
- sbb eax,[esi+28]
- mov [edi+28],eax
- sbb eax,eax
- neg eax
- }
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+ x[0] = word_sub(y[0], x[0], &carry);
+ x[1] = word_sub(y[1], x[1], &carry);
+ x[2] = word_sub(y[2], x[2], &carry);
+ x[3] = word_sub(y[3], x[3], &carry);
+ x[4] = word_sub(y[4], x[4], &carry);
+ x[5] = word_sub(y[5], x[5], &carry);
+ x[6] = word_sub(y[6], x[6], &carry);
+ x[7] = word_sub(y[7], x[7], &carry);
+ return carry;
}
+
/*
* Eight Word Block Subtraction, Three Argument
*/
@@ -164,38 +181,38 @@ inline word word8_sub3(word z[8], const word x[8],
const word y[8], word carry)
{
__asm {
- mov edi,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov ebx,[z]
- mov eax,[edi]
- sbb eax,[esi]
- mov [ebx],eax
- mov eax,[edi+4]
- sbb eax,[esi+4]
- mov [ebx+4],eax
- mov eax,[edi+8]
- sbb eax,[esi+8]
- mov [ebx+8],eax
- mov eax,[edi+12]
- sbb eax,[esi+12]
- mov [ebx+12],eax
- mov eax,[edi+16]
- sbb eax,[esi+16]
- mov [ebx+16],eax
- mov eax,[edi+20]
- sbb eax,[esi+20]
- mov [ebx+20],eax
- mov eax,[edi+24]
- sbb eax,[esi+24]
- mov [ebx+24],eax
- mov eax,[edi+28]
- sbb eax,[esi+28]
- mov [ebx+28],eax
- sbb eax,eax
- neg eax
- }
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
}
/*
@@ -203,65 +220,64 @@ inline word word8_sub3(word z[8], const word x[8],
*/
inline word word8_linmul2(word x[8], word y, word carry)
{
- __asm
- {
- mov esi,[x]
- mov eax,[esi] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi],eax //load a
-
- mov eax,[esi+4] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+4],eax //load a
-
- mov eax,[esi+8] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+8],eax //load a
-
- mov eax,[esi+12] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+12],eax //load a
-
- mov eax,[esi+16] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+16],eax //load a
-
- mov eax,[esi+20] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+20],eax //load a
-
- mov eax,[esi+24] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+24],eax //load a
-
- mov eax,[esi+28] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov [esi+28],eax //load a
-
- mov eax,edx //store carry
+ __asm {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+28],eax //load a
+
+ mov eax,edx //store carry
}
}
@@ -271,207 +287,205 @@ inline word word8_linmul2(word x[8], word y, word carry)
inline word word8_muladd(word z[8], const word x[8],
word y, word carry)
{
- __asm
- {
- mov esi,[x]
- mov ebx,[y]
- mov edi,[z]
- mov eax,[esi] //load a
- mul ebx //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- add eax,[edi] //sum lo z
- adc edx,0 //sum hi z
- mov ecx,edx //carry for next block = hi z
- mov [edi],eax //save lo z
-
- mov eax,[esi+4]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+4]
- adc edx,0
- mov ecx,edx
- mov [edi+4],eax
-
- mov eax,[esi+8]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+8]
- adc edx,0
- mov ecx,edx
- mov [edi+8],eax
-
- mov eax,[esi+12]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+12]
- adc edx,0
- mov ecx,edx
- mov [edi+12],eax
-
- mov eax,[esi+16]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+16]
- adc edx,0
- mov ecx,edx
- mov [edi+16],eax
-
- mov eax,[esi+20]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+20]
- adc edx,0
- mov ecx,edx
- mov [edi+20],eax
-
- mov eax,[esi+24]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+24]
- adc edx,0
- mov ecx,edx
- mov [edi+24],eax
-
- mov eax,[esi+28]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+28]
- adc edx,0
- mov [edi+28],eax
- mov eax,edx
- }
+ __asm {
+ mov esi,[x]
+ mov ebx,[y]
+ mov edi,[z]
+ mov eax,[esi] //load a
+ mul ebx //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ add eax,[edi] //sum lo z
+ adc edx,0 //sum hi z
+ mov ecx,edx //carry for next block = hi z
+ mov [edi],eax //save lo z
+
+ mov eax,[esi+4]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+4]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+4],eax
+
+ mov eax,[esi+8]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+8]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+8],eax
+
+ mov eax,[esi+12]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+12]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+12],eax
+
+ mov eax,[esi+16]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+16]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+16],eax
+
+ mov eax,[esi+20]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+20]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+20],eax
+
+ mov eax,[esi+24]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+24]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+24],eax
+
+ mov eax,[esi+28]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+28]
+ adc edx,0
+ mov [edi+28],eax
+ mov eax,edx
+ }
}
inline word word8_linmul3(word z[4], const word x[4], word y, word carry)
{
- __asm
- {
+ __asm {
#if 0
- //it's slower!!!
- mov edx,[z]
- mov eax,[x]
- movd mm7,[y]
-
- movd mm0,[eax]
- movd mm1,[eax+4]
- movd mm2,[eax+8]
- pmuludq mm0,mm7
- pmuludq mm1,mm7
- pmuludq mm2,mm7
-
- movd mm6,[carry]
- paddq mm0,mm6
- movd [edx],mm0
-
- psrlq mm0,32
- paddq mm1,mm0
- movd [edx+4],mm1
-
- movd mm3,[eax+12]
- psrlq mm1,32
- paddq mm2,mm1
- movd [edx+8],mm2
-
- pmuludq mm3,mm7
- movd mm4,[eax+16]
- psrlq mm2,32
- paddq mm3,mm2
- movd [edx+12],mm3
-
- pmuludq mm4,mm7
- movd mm5,[eax+20]
- psrlq mm3,32
- paddq mm4,mm3
- movd [edx+16],mm4
-
- pmuludq mm5,mm7
- movd mm0,[eax+24]
- psrlq mm4,32
- paddq mm5,mm4
- movd [edx+20],mm5
-
- pmuludq mm0,mm7
- movd mm1,[eax+28]
- psrlq mm5,32
- paddq mm0,mm5
- movd [edx+24],mm0
-
- pmuludq mm1,mm7
- psrlq mm0,32
- paddq mm1,mm0
- movd [edx+28],mm1
-
- psrlq mm1,32
- movd eax,mm1
- emms
+ //it's slower!!!
+ mov edx,[z]
+ mov eax,[x]
+ movd mm7,[y]
+
+ movd mm0,[eax]
+ movd mm1,[eax+4]
+ movd mm2,[eax+8]
+ pmuludq mm0,mm7
+ pmuludq mm1,mm7
+ pmuludq mm2,mm7
+
+ movd mm6,[carry]
+ paddq mm0,mm6
+ movd [edx],mm0
+
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+4],mm1
+
+ movd mm3,[eax+12]
+ psrlq mm1,32
+ paddq mm2,mm1
+ movd [edx+8],mm2
+
+ pmuludq mm3,mm7
+ movd mm4,[eax+16]
+ psrlq mm2,32
+ paddq mm3,mm2
+ movd [edx+12],mm3
+
+ pmuludq mm4,mm7
+ movd mm5,[eax+20]
+ psrlq mm3,32
+ paddq mm4,mm3
+ movd [edx+16],mm4
+
+ pmuludq mm5,mm7
+ movd mm0,[eax+24]
+ psrlq mm4,32
+ paddq mm5,mm4
+ movd [edx+20],mm5
+
+ pmuludq mm0,mm7
+ movd mm1,[eax+28]
+ psrlq mm5,32
+ paddq mm0,mm5
+ movd [edx+24],mm0
+
+ pmuludq mm1,mm7
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+28],mm1
+ psrlq mm1,32
+
+ movd eax,mm1
+ emms
#else
- mov edi,[z]
- mov esi,[x]
- mov eax,[esi] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi],eax //load a
-
- mov eax,[esi+4] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+4],eax //load a
-
- mov eax,[esi+8] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+8],eax //load a
-
- mov eax,[esi+12] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+12],eax //load a
-
- mov eax,[esi+16] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+16],eax //load a
-
- mov eax,[esi+20] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+20],eax //load a
-
- mov eax,[esi+24] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+24],eax //load a
-
- mov eax,[esi+28] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov [edi+28],eax //load a
- mov eax,edx //store carry
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [edi+28],eax //load a
+ mov eax,edx //store carry
#endif
- }
+ }
}
/*