diff options
author | lloyd <[email protected]> | 2010-03-16 00:21:30 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2010-03-16 00:21:30 +0000 |
commit | 4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch) | |
tree | 3f7d40f046f6c3fb12f3a23be909916a099344a3 | |
parent | b96139b84113b583a453c820127c4765c6f0f66f (diff) |
Add a special handler for the case of doing a subtraction as in:
x -= y;
where abs(x) < abs(y).
This change alone increases ECDSA performance by 5 to 15%
-rw-r--r-- | src/math/bigint/big_ops2.cpp | 6 | ||||
-rw-r--r-- | src/math/bigint/mp_amd64/mp_asmi.h | 15 | ||||
-rw-r--r-- | src/math/bigint/mp_asm.cpp | 20 | ||||
-rw-r--r-- | src/math/bigint/mp_core.h | 9 | ||||
-rw-r--r-- | src/math/bigint/mp_generic/mp_asmi.h | 22 | ||||
-rw-r--r-- | src/math/bigint/mp_ia32/mp_asmi.h | 15 | ||||
-rw-r--r-- | src/math/bigint/mp_ia32_msvc/mp_asmi.h | 774 |
7 files changed, 470 insertions, 391 deletions
diff --git a/src/math/bigint/big_ops2.cpp b/src/math/bigint/big_ops2.cpp index 1137fe4b2..cc50c26e5 100644 --- a/src/math/bigint/big_ops2.cpp +++ b/src/math/bigint/big_ops2.cpp @@ -62,11 +62,7 @@ BigInt& BigInt::operator-=(const BigInt& y) if(relative_size < 0) { if(sign() == y.sign()) - { - SecureVector<word> z(reg_size - 1); - bigint_sub3(z, y.data(), reg_size - 1, data(), x_sw); - copy_mem(get_reg().begin(), z.begin(), z.size()); - } + bigint_sub2_rev(get_reg(), y.data(), y_sw); else bigint_add2(get_reg(), reg_size - 1, y.data(), y_sw); diff --git a/src/math/bigint/mp_amd64/mp_asmi.h b/src/math/bigint/mp_amd64/mp_asmi.h index d8f681d77..adf7774ef 100644 --- a/src/math/bigint/mp_amd64/mp_asmi.h +++ b/src/math/bigint/mp_amd64/mp_asmi.h @@ -1,6 +1,6 @@ /* * Lowest Level MPI Algorithms -* (C) 1999-2007 Jack Lloyd +* (C) 1999-2010 Jack Lloyd * 2006 Luca Piccarreta * * Distributed under the terms of the Botan license @@ -131,6 +131,19 @@ inline word word8_sub2(word x[8], const word y[8], word carry) } /* +* Eight Word Block Subtraction, Two Argument +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq")) + : [carry]"=r"(carry) + : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry) + : "cc", "memory"); + return carry; + } + +/* * Eight Word Block Subtraction, Three Argument */ inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) diff --git a/src/math/bigint/mp_asm.cpp b/src/math/bigint/mp_asm.cpp index 9827bff06..69a5e6469 100644 --- a/src/math/bigint/mp_asm.cpp +++ b/src/math/bigint/mp_asm.cpp @@ -9,6 +9,7 @@ #include <botan/internal/mp_asm.h> #include <botan/internal/mp_asmi.h> #include <botan/internal/mp_core.h> +#include <botan/exceptn.h> #include <botan/mem_ops.h> namespace Botan { @@ -114,6 +115,25 @@ void bigint_sub2(word x[], u32bit x_size, const word y[], u32bit y_size) } /* +* Two Operand Subtraction x = y - x +*/ +void bigint_sub2_rev(word x[], const word y[], u32bit y_size) + { + word carry = 0; + + const u32bit blocks = y_size - (y_size % 8); + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_sub2_rev(x + j, y + j, carry); + + for(u32bit j = blocks; j != y_size; ++j) + x[j] = word_sub(y[j], x[j], &carry); + + if(carry) + throw Internal_Error("bigint_sub2_rev: x >= y"); + } + +/* * Three Operand Subtraction */ void bigint_sub3(word z[], const word x[], u32bit x_size, diff --git a/src/math/bigint/mp_core.h b/src/math/bigint/mp_core.h index 7d601d820..bbe6b1310 100644 --- a/src/math/bigint/mp_core.h +++ b/src/math/bigint/mp_core.h @@ -35,7 +35,14 @@ word bigint_add3_nc(word z[], const word x[], u32bit x_size, const word y[], u32bit y_size); -void bigint_sub2(word x[], u32bit x_size, const word y[], u32bit y_size); +void bigint_sub2(word x[], u32bit x_size, + const word y[], u32bit y_size); + +/** +* x = y - x; assumes y >= x +*/ +void bigint_sub2_rev(word x[], const word y[], u32bit y_size); + void bigint_sub3(word z[], const word x[], u32bit x_size, const word y[], u32bit y_size); diff --git a/src/math/bigint/mp_generic/mp_asmi.h b/src/math/bigint/mp_generic/mp_asmi.h index 9913c6ba9..8225f372d 100644 --- a/src/math/bigint/mp_generic/mp_asmi.h +++ b/src/math/bigint/mp_generic/mp_asmi.h @@ -1,6 +1,6 @@ /* * Lowest Level MPI Algorithms -* (C) 1999-2008 Jack Lloyd +* (C) 1999-2010 Jack Lloyd * 2006 Luca Piccarreta * * Distributed under the terms of the Botan license @@ -75,7 +75,7 @@ inline word word_sub(word x, word y, word* carry) /* * Eight Word Block Subtraction, Two Argument */ -inline word word8_sub2(word x[4], const word y[4], word carry) +inline word word8_sub2(word x[8], const word y[8], word carry) { x[0] = word_sub(x[0], y[0], &carry); x[1] = word_sub(x[1], y[1], &carry); @@ -89,6 +89,22 @@ inline word word8_sub2(word x[4], const word y[4], word carry) } /* +* Eight Word Block Subtraction, Two Argument +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + x[0] = word_sub(y[0], x[0], &carry); + x[1] = word_sub(y[1], x[1], &carry); + x[2] = word_sub(y[2], x[2], &carry); + x[3] = word_sub(y[3], x[3], &carry); + x[4] = word_sub(y[4], x[4], &carry); + x[5] = word_sub(y[5], x[5], &carry); + x[6] = word_sub(y[6], x[6], &carry); + x[7] = word_sub(y[7], x[7], &carry); + return carry; + } + +/* * Eight Word Block Subtraction, Three Argument */ inline word word8_sub3(word z[8], const word x[8], @@ -108,7 +124,7 @@ inline word word8_sub3(word z[8], const word x[8], /* * Eight Word Block Linear Multiplication */ -inline word word8_linmul2(word x[4], word y, word carry) +inline word word8_linmul2(word x[8], word y, word carry) { x[0] = word_madd2(x[0], y, &carry); x[1] = word_madd2(x[1], y, &carry); diff --git a/src/math/bigint/mp_ia32/mp_asmi.h b/src/math/bigint/mp_ia32/mp_asmi.h index 0b8708e53..c7b679e80 100644 --- a/src/math/bigint/mp_ia32/mp_asmi.h +++ b/src/math/bigint/mp_ia32/mp_asmi.h @@ -1,6 +1,6 @@ /* * Lowest Level MPI Algorithms -* (C) 1999-2007 Jack Lloyd +* (C) 1999-2010 Jack Lloyd * 2006 Luca Piccarreta * * Distributed under the terms of the Botan license @@ -131,6 +131,19 @@ inline word word8_sub2(word x[8], const word y[8], word carry) } /* +* Eight Word Block Subtraction, Two Argument +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl")) + : [carry]"=r"(carry) + : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry) + : "cc", "memory"); + return carry; + } + +/* * Eight Word Block Subtraction, Three Argument */ inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h index 4c3027fde..aee457d65 100644 --- a/src/math/bigint/mp_ia32_msvc/mp_asmi.h +++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h @@ -1,6 +1,6 @@ /* * Lowest Level MPI Algorithms -* (C) 1999-2006 Jack Lloyd +* (C) 1999-2010 Jack Lloyd * 2006 Luca Piccarreta * * Distributed under the terms of the Botan license @@ -33,28 +33,28 @@ inline word word_add(word x, word y, word* carry) inline word word8_add2(word x[8], const word y[8], word carry) { __asm { - mov edx,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[esi] - adc [edx],eax - mov eax,[esi+4] - adc [edx+4],eax - mov eax,[esi+8] - adc [edx+8],eax - mov eax,[esi+12] - adc [edx+12],eax - mov eax,[esi+16] - adc [edx+16],eax - mov eax,[esi+20] - adc [edx+20],eax - mov eax,[esi+24] - adc [edx+24],eax - mov eax,[esi+28] - adc [edx+28],eax - sbb eax,eax - neg eax + mov edx,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[esi] + adc [edx],eax + mov eax,[esi+4] + adc [edx+4],eax + mov eax,[esi+8] + adc [edx+8],eax + mov eax,[esi+12] + adc [edx+12],eax + mov eax,[esi+16] + adc [edx+16],eax + mov eax,[esi+20] + adc [edx+20],eax + mov eax,[esi+24] + adc [edx+24],eax + mov eax,[esi+28] + adc [edx+28],eax + sbb eax,eax + neg eax } } @@ -64,46 +64,46 @@ inline word word8_add2(word x[8], const word y[8], word carry) inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) { __asm { - mov edi,[x] - mov esi,[y] - mov ebx,[z] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[edi] - adc eax,[esi] - mov [ebx],eax - - mov eax,[edi+4] - adc eax,[esi+4] - mov [ebx+4],eax - - mov eax,[edi+8] - adc eax,[esi+8] - mov [ebx+8],eax - - mov eax,[edi+12] - adc eax,[esi+12] - mov [ebx+12],eax - - mov eax,[edi+16] - adc eax,[esi+16] - mov [ebx+16],eax - - mov eax,[edi+20] - adc eax,[esi+20] - mov [ebx+20],eax - - mov eax,[edi+24] - adc eax,[esi+24] - mov [ebx+24],eax - - mov eax,[edi+28] - adc eax,[esi+28] - mov [ebx+28],eax - - sbb eax,eax - neg eax - } + mov edi,[x] + mov esi,[y] + mov ebx,[z] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + adc eax,[esi] + mov [ebx],eax + + mov eax,[edi+4] + adc eax,[esi+4] + mov [ebx+4],eax + + mov eax,[edi+8] + adc eax,[esi+8] + mov [ebx+8],eax + + mov eax,[edi+12] + adc eax,[esi+12] + mov [ebx+12],eax + + mov eax,[edi+16] + adc eax,[esi+16] + mov [ebx+16],eax + + mov eax,[edi+20] + adc eax,[esi+20] + mov [ebx+20],eax + + mov eax,[edi+24] + adc eax,[esi+24] + mov [ebx+24],eax + + mov eax,[edi+28] + adc eax,[esi+28] + mov [ebx+28],eax + + sbb eax,eax + neg eax + } } /* @@ -123,40 +123,57 @@ inline word word_sub(word x, word y, word* carry) */ inline word word8_sub2(word x[8], const word y[8], word carry) { - _asm { - mov edi,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[edi] - sbb eax,[esi] - mov [edi],eax - mov eax,[edi+4] - sbb eax,[esi+4] - mov [edi+4],eax - mov eax,[edi+8] - sbb eax,[esi+8] - mov [edi+8],eax - mov eax,[edi+12] - sbb eax,[esi+12] - mov [edi+12],eax - mov eax,[edi+16] - sbb eax,[esi+16] - mov [edi+16],eax - mov eax,[edi+20] - sbb eax,[esi+20] - mov [edi+20],eax - mov eax,[edi+24] - sbb eax,[esi+24] - mov [edi+24],eax - mov eax,[edi+28] - sbb eax,[esi+28] - mov [edi+28],eax - sbb eax,eax - neg eax - } + __asm { + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + sbb eax,[esi] + mov [edi],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [edi+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [edi+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [edi+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [edi+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [edi+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [edi+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [edi+28],eax + sbb eax,eax + neg eax + } + } + +/* +* Eight Word Block Subtraction, Two Argument +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + x[0] = word_sub(y[0], x[0], &carry); + x[1] = word_sub(y[1], x[1], &carry); + x[2] = word_sub(y[2], x[2], &carry); + x[3] = word_sub(y[3], x[3], &carry); + x[4] = word_sub(y[4], x[4], &carry); + x[5] = word_sub(y[5], x[5], &carry); + x[6] = word_sub(y[6], x[6], &carry); + x[7] = word_sub(y[7], x[7], &carry); + return carry; } + /* * Eight Word Block Subtraction, Three Argument */ @@ -164,38 +181,38 @@ inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) { __asm { - mov edi,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov ebx,[z] - mov eax,[edi] - sbb eax,[esi] - mov [ebx],eax - mov eax,[edi+4] - sbb eax,[esi+4] - mov [ebx+4],eax - mov eax,[edi+8] - sbb eax,[esi+8] - mov [ebx+8],eax - mov eax,[edi+12] - sbb eax,[esi+12] - mov [ebx+12],eax - mov eax,[edi+16] - sbb eax,[esi+16] - mov [ebx+16],eax - mov eax,[edi+20] - sbb eax,[esi+20] - mov [ebx+20],eax - mov eax,[edi+24] - sbb eax,[esi+24] - mov [ebx+24],eax - mov eax,[edi+28] - sbb eax,[esi+28] - mov [ebx+28],eax - sbb eax,eax - neg eax - } + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov ebx,[z] + mov eax,[edi] + sbb eax,[esi] + mov [ebx],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [ebx+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [ebx+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [ebx+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [ebx+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [ebx+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [ebx+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [ebx+28],eax + sbb eax,eax + neg eax + } } /* @@ -203,65 +220,64 @@ inline word word8_sub3(word z[8], const word x[8], */ inline word word8_linmul2(word x[8], word y, word carry) { - __asm - { - mov esi,[x] - mov eax,[esi] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi],eax //load a - - mov eax,[esi+4] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+4],eax //load a - - mov eax,[esi+8] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+8],eax //load a - - mov eax,[esi+12] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+12],eax //load a - - mov eax,[esi+16] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+16],eax //load a - - mov eax,[esi+20] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+20],eax //load a - - mov eax,[esi+24] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+24],eax //load a - - mov eax,[esi+28] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov [esi+28],eax //load a - - mov eax,edx //store carry + __asm { + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [esi+28],eax //load a + + mov eax,edx //store carry } } @@ -271,207 +287,205 @@ inline word word8_linmul2(word x[8], word y, word carry) inline word word8_muladd(word z[8], const word x[8], word y, word carry) { - __asm - { - mov esi,[x] - mov ebx,[y] - mov edi,[z] - mov eax,[esi] //load a - mul ebx //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - add eax,[edi] //sum lo z - adc edx,0 //sum hi z - mov ecx,edx //carry for next block = hi z - mov [edi],eax //save lo z - - mov eax,[esi+4] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+4] - adc edx,0 - mov ecx,edx - mov [edi+4],eax - - mov eax,[esi+8] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+8] - adc edx,0 - mov ecx,edx - mov [edi+8],eax - - mov eax,[esi+12] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+12] - adc edx,0 - mov ecx,edx - mov [edi+12],eax - - mov eax,[esi+16] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+16] - adc edx,0 - mov ecx,edx - mov [edi+16],eax - - mov eax,[esi+20] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+20] - adc edx,0 - mov ecx,edx - mov [edi+20],eax - - mov eax,[esi+24] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+24] - adc edx,0 - mov ecx,edx - mov [edi+24],eax - - mov eax,[esi+28] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+28] - adc edx,0 - mov [edi+28],eax - mov eax,edx - } + __asm { + mov esi,[x] + mov ebx,[y] + mov edi,[z] + mov eax,[esi] //load a + mul ebx //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + add eax,[edi] //sum lo z + adc edx,0 //sum hi z + mov ecx,edx //carry for next block = hi z + mov [edi],eax //save lo z + + mov eax,[esi+4] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+4] + adc edx,0 + mov ecx,edx + mov [edi+4],eax + + mov eax,[esi+8] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+8] + adc edx,0 + mov ecx,edx + mov [edi+8],eax + + mov eax,[esi+12] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+12] + adc edx,0 + mov ecx,edx + mov [edi+12],eax + + mov eax,[esi+16] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+16] + adc edx,0 + mov ecx,edx + mov [edi+16],eax + + mov eax,[esi+20] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+20] + adc edx,0 + mov ecx,edx + mov [edi+20],eax + + mov eax,[esi+24] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+24] + adc edx,0 + mov ecx,edx + mov [edi+24],eax + + mov eax,[esi+28] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+28] + adc edx,0 + mov [edi+28],eax + mov eax,edx + } } inline word word8_linmul3(word z[4], const word x[4], word y, word carry) { - __asm - { + __asm { #if 0 - //it's slower!!! - mov edx,[z] - mov eax,[x] - movd mm7,[y] - - movd mm0,[eax] - movd mm1,[eax+4] - movd mm2,[eax+8] - pmuludq mm0,mm7 - pmuludq mm1,mm7 - pmuludq mm2,mm7 - - movd mm6,[carry] - paddq mm0,mm6 - movd [edx],mm0 - - psrlq mm0,32 - paddq mm1,mm0 - movd [edx+4],mm1 - - movd mm3,[eax+12] - psrlq mm1,32 - paddq mm2,mm1 - movd [edx+8],mm2 - - pmuludq mm3,mm7 - movd mm4,[eax+16] - psrlq mm2,32 - paddq mm3,mm2 - movd [edx+12],mm3 - - pmuludq mm4,mm7 - movd mm5,[eax+20] - psrlq mm3,32 - paddq mm4,mm3 - movd [edx+16],mm4 - - pmuludq mm5,mm7 - movd mm0,[eax+24] - psrlq mm4,32 - paddq mm5,mm4 - movd [edx+20],mm5 - - pmuludq mm0,mm7 - movd mm1,[eax+28] - psrlq mm5,32 - paddq mm0,mm5 - movd [edx+24],mm0 - - pmuludq mm1,mm7 - psrlq mm0,32 - paddq mm1,mm0 - movd [edx+28],mm1 - - psrlq mm1,32 - movd eax,mm1 - emms + //it's slower!!! + mov edx,[z] + mov eax,[x] + movd mm7,[y] + + movd mm0,[eax] + movd mm1,[eax+4] + movd mm2,[eax+8] + pmuludq mm0,mm7 + pmuludq mm1,mm7 + pmuludq mm2,mm7 + + movd mm6,[carry] + paddq mm0,mm6 + movd [edx],mm0 + + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+4],mm1 + + movd mm3,[eax+12] + psrlq mm1,32 + paddq mm2,mm1 + movd [edx+8],mm2 + + pmuludq mm3,mm7 + movd mm4,[eax+16] + psrlq mm2,32 + paddq mm3,mm2 + movd [edx+12],mm3 + + pmuludq mm4,mm7 + movd mm5,[eax+20] + psrlq mm3,32 + paddq mm4,mm3 + movd [edx+16],mm4 + + pmuludq mm5,mm7 + movd mm0,[eax+24] + psrlq mm4,32 + paddq mm5,mm4 + movd [edx+20],mm5 + + pmuludq mm0,mm7 + movd mm1,[eax+28] + psrlq mm5,32 + paddq mm0,mm5 + movd [edx+24],mm0 + + pmuludq mm1,mm7 + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+28],mm1 + psrlq mm1,32 + + movd eax,mm1 + emms #else - mov edi,[z] - mov esi,[x] - mov eax,[esi] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi],eax //load a - - mov eax,[esi+4] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+4],eax //load a - - mov eax,[esi+8] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+8],eax //load a - - mov eax,[esi+12] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+12],eax //load a - - mov eax,[esi+16] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+16],eax //load a - - mov eax,[esi+20] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+20],eax //load a - - mov eax,[esi+24] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+24],eax //load a - - mov eax,[esi+28] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov [edi+28],eax //load a - mov eax,edx //store carry + mov edi,[z] + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [edi+28],eax //load a + mov eax,edx //store carry #endif - } + } } /* |