aboutsummaryrefslogtreecommitdiffstats
path: root/src/math/bigint/mp_ia32_msvc
diff options
context:
space:
mode:
authorlloyd <[email protected]>2010-03-16 00:21:30 +0000
committerlloyd <[email protected]>2010-03-16 00:21:30 +0000
commit4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch)
tree3f7d40f046f6c3fb12f3a23be909916a099344a3 /src/math/bigint/mp_ia32_msvc
parentb96139b84113b583a453c820127c4765c6f0f66f (diff)
Add a special handler for the case of doing a subtraction as in:
x -= y; where abs(x) < abs(y). This change alone increases ECDSA performance by 5 to 15%
Diffstat (limited to 'src/math/bigint/mp_ia32_msvc')
-rw-r--r--src/math/bigint/mp_ia32_msvc/mp_asmi.h774
1 files changed, 394 insertions, 380 deletions
diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
index 4c3027fde..aee457d65 100644
--- a/src/math/bigint/mp_ia32_msvc/mp_asmi.h
+++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
@@ -1,6 +1,6 @@
/*
* Lowest Level MPI Algorithms
-* (C) 1999-2006 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
* 2006 Luca Piccarreta
*
* Distributed under the terms of the Botan license
@@ -33,28 +33,28 @@ inline word word_add(word x, word y, word* carry)
inline word word8_add2(word x[8], const word y[8], word carry)
{
__asm {
- mov edx,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[esi]
- adc [edx],eax
- mov eax,[esi+4]
- adc [edx+4],eax
- mov eax,[esi+8]
- adc [edx+8],eax
- mov eax,[esi+12]
- adc [edx+12],eax
- mov eax,[esi+16]
- adc [edx+16],eax
- mov eax,[esi+20]
- adc [edx+20],eax
- mov eax,[esi+24]
- adc [edx+24],eax
- mov eax,[esi+28]
- adc [edx+28],eax
- sbb eax,eax
- neg eax
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
}
}
@@ -64,46 +64,46 @@ inline word word8_add2(word x[8], const word y[8], word carry)
inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
{
__asm {
- mov edi,[x]
- mov esi,[y]
- mov ebx,[z]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
- adc eax,[esi]
- mov [ebx],eax
-
- mov eax,[edi+4]
- adc eax,[esi+4]
- mov [ebx+4],eax
-
- mov eax,[edi+8]
- adc eax,[esi+8]
- mov [ebx+8],eax
-
- mov eax,[edi+12]
- adc eax,[esi+12]
- mov [ebx+12],eax
-
- mov eax,[edi+16]
- adc eax,[esi+16]
- mov [ebx+16],eax
-
- mov eax,[edi+20]
- adc eax,[esi+20]
- mov [ebx+20],eax
-
- mov eax,[edi+24]
- adc eax,[esi+24]
- mov [ebx+24],eax
-
- mov eax,[edi+28]
- adc eax,[esi+28]
- mov [ebx+28],eax
-
- sbb eax,eax
- neg eax
- }
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
}
/*
@@ -123,40 +123,57 @@ inline word word_sub(word x, word y, word* carry)
*/
inline word word8_sub2(word x[8], const word y[8], word carry)
{
- _asm {
- mov edi,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
- sbb eax,[esi]
- mov [edi],eax
- mov eax,[edi+4]
- sbb eax,[esi+4]
- mov [edi+4],eax
- mov eax,[edi+8]
- sbb eax,[esi+8]
- mov [edi+8],eax
- mov eax,[edi+12]
- sbb eax,[esi+12]
- mov [edi+12],eax
- mov eax,[edi+16]
- sbb eax,[esi+16]
- mov [edi+16],eax
- mov eax,[edi+20]
- sbb eax,[esi+20]
- mov [edi+20],eax
- mov eax,[edi+24]
- sbb eax,[esi+24]
- mov [edi+24],eax
- mov eax,[edi+28]
- sbb eax,[esi+28]
- mov [edi+28],eax
- sbb eax,eax
- neg eax
- }
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+ x[0] = word_sub(y[0], x[0], &carry);
+ x[1] = word_sub(y[1], x[1], &carry);
+ x[2] = word_sub(y[2], x[2], &carry);
+ x[3] = word_sub(y[3], x[3], &carry);
+ x[4] = word_sub(y[4], x[4], &carry);
+ x[5] = word_sub(y[5], x[5], &carry);
+ x[6] = word_sub(y[6], x[6], &carry);
+ x[7] = word_sub(y[7], x[7], &carry);
+ return carry;
}
+
/*
* Eight Word Block Subtraction, Three Argument
*/
@@ -164,38 +181,38 @@ inline word word8_sub3(word z[8], const word x[8],
const word y[8], word carry)
{
__asm {
- mov edi,[x]
- mov esi,[y]
- xor eax,eax
- sub eax,[carry] //force CF=1 iff *carry==1
- mov ebx,[z]
- mov eax,[edi]
- sbb eax,[esi]
- mov [ebx],eax
- mov eax,[edi+4]
- sbb eax,[esi+4]
- mov [ebx+4],eax
- mov eax,[edi+8]
- sbb eax,[esi+8]
- mov [ebx+8],eax
- mov eax,[edi+12]
- sbb eax,[esi+12]
- mov [ebx+12],eax
- mov eax,[edi+16]
- sbb eax,[esi+16]
- mov [ebx+16],eax
- mov eax,[edi+20]
- sbb eax,[esi+20]
- mov [ebx+20],eax
- mov eax,[edi+24]
- sbb eax,[esi+24]
- mov [ebx+24],eax
- mov eax,[edi+28]
- sbb eax,[esi+28]
- mov [ebx+28],eax
- sbb eax,eax
- neg eax
- }
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
}
/*
@@ -203,65 +220,64 @@ inline word word8_sub3(word z[8], const word x[8],
*/
inline word word8_linmul2(word x[8], word y, word carry)
{
- __asm
- {
- mov esi,[x]
- mov eax,[esi] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi],eax //load a
-
- mov eax,[esi+4] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+4],eax //load a
-
- mov eax,[esi+8] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+8],eax //load a
-
- mov eax,[esi+12] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+12],eax //load a
-
- mov eax,[esi+16] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+16],eax //load a
-
- mov eax,[esi+20] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+20],eax //load a
-
- mov eax,[esi+24] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [esi+24],eax //load a
-
- mov eax,[esi+28] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov [esi+28],eax //load a
-
- mov eax,edx //store carry
+ __asm {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+28],eax //load a
+
+ mov eax,edx //store carry
}
}
@@ -271,207 +287,205 @@ inline word word8_linmul2(word x[8], word y, word carry)
inline word word8_muladd(word z[8], const word x[8],
word y, word carry)
{
- __asm
- {
- mov esi,[x]
- mov ebx,[y]
- mov edi,[z]
- mov eax,[esi] //load a
- mul ebx //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- add eax,[edi] //sum lo z
- adc edx,0 //sum hi z
- mov ecx,edx //carry for next block = hi z
- mov [edi],eax //save lo z
-
- mov eax,[esi+4]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+4]
- adc edx,0
- mov ecx,edx
- mov [edi+4],eax
-
- mov eax,[esi+8]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+8]
- adc edx,0
- mov ecx,edx
- mov [edi+8],eax
-
- mov eax,[esi+12]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+12]
- adc edx,0
- mov ecx,edx
- mov [edi+12],eax
-
- mov eax,[esi+16]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+16]
- adc edx,0
- mov ecx,edx
- mov [edi+16],eax
-
- mov eax,[esi+20]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+20]
- adc edx,0
- mov ecx,edx
- mov [edi+20],eax
-
- mov eax,[esi+24]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+24]
- adc edx,0
- mov ecx,edx
- mov [edi+24],eax
-
- mov eax,[esi+28]
- mul ebx
- add eax,ecx
- adc edx,0
- add eax,[edi+28]
- adc edx,0
- mov [edi+28],eax
- mov eax,edx
- }
+ __asm {
+ mov esi,[x]
+ mov ebx,[y]
+ mov edi,[z]
+ mov eax,[esi] //load a
+ mul ebx //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ add eax,[edi] //sum lo z
+ adc edx,0 //sum hi z
+ mov ecx,edx //carry for next block = hi z
+ mov [edi],eax //save lo z
+
+ mov eax,[esi+4]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+4]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+4],eax
+
+ mov eax,[esi+8]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+8]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+8],eax
+
+ mov eax,[esi+12]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+12]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+12],eax
+
+ mov eax,[esi+16]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+16]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+16],eax
+
+ mov eax,[esi+20]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+20]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+20],eax
+
+ mov eax,[esi+24]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+24]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+24],eax
+
+ mov eax,[esi+28]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+28]
+ adc edx,0
+ mov [edi+28],eax
+ mov eax,edx
+ }
}
inline word word8_linmul3(word z[4], const word x[4], word y, word carry)
{
- __asm
- {
+ __asm {
#if 0
- //it's slower!!!
- mov edx,[z]
- mov eax,[x]
- movd mm7,[y]
-
- movd mm0,[eax]
- movd mm1,[eax+4]
- movd mm2,[eax+8]
- pmuludq mm0,mm7
- pmuludq mm1,mm7
- pmuludq mm2,mm7
-
- movd mm6,[carry]
- paddq mm0,mm6
- movd [edx],mm0
-
- psrlq mm0,32
- paddq mm1,mm0
- movd [edx+4],mm1
-
- movd mm3,[eax+12]
- psrlq mm1,32
- paddq mm2,mm1
- movd [edx+8],mm2
-
- pmuludq mm3,mm7
- movd mm4,[eax+16]
- psrlq mm2,32
- paddq mm3,mm2
- movd [edx+12],mm3
-
- pmuludq mm4,mm7
- movd mm5,[eax+20]
- psrlq mm3,32
- paddq mm4,mm3
- movd [edx+16],mm4
-
- pmuludq mm5,mm7
- movd mm0,[eax+24]
- psrlq mm4,32
- paddq mm5,mm4
- movd [edx+20],mm5
-
- pmuludq mm0,mm7
- movd mm1,[eax+28]
- psrlq mm5,32
- paddq mm0,mm5
- movd [edx+24],mm0
-
- pmuludq mm1,mm7
- psrlq mm0,32
- paddq mm1,mm0
- movd [edx+28],mm1
-
- psrlq mm1,32
- movd eax,mm1
- emms
+ //it's slower!!!
+ mov edx,[z]
+ mov eax,[x]
+ movd mm7,[y]
+
+ movd mm0,[eax]
+ movd mm1,[eax+4]
+ movd mm2,[eax+8]
+ pmuludq mm0,mm7
+ pmuludq mm1,mm7
+ pmuludq mm2,mm7
+
+ movd mm6,[carry]
+ paddq mm0,mm6
+ movd [edx],mm0
+
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+4],mm1
+
+ movd mm3,[eax+12]
+ psrlq mm1,32
+ paddq mm2,mm1
+ movd [edx+8],mm2
+
+ pmuludq mm3,mm7
+ movd mm4,[eax+16]
+ psrlq mm2,32
+ paddq mm3,mm2
+ movd [edx+12],mm3
+
+ pmuludq mm4,mm7
+ movd mm5,[eax+20]
+ psrlq mm3,32
+ paddq mm4,mm3
+ movd [edx+16],mm4
+
+ pmuludq mm5,mm7
+ movd mm0,[eax+24]
+ psrlq mm4,32
+ paddq mm5,mm4
+ movd [edx+20],mm5
+
+ pmuludq mm0,mm7
+ movd mm1,[eax+28]
+ psrlq mm5,32
+ paddq mm0,mm5
+ movd [edx+24],mm0
+
+ pmuludq mm1,mm7
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+28],mm1
+ psrlq mm1,32
+
+ movd eax,mm1
+ emms
#else
- mov edi,[z]
- mov esi,[x]
- mov eax,[esi] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,[carry] //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi],eax //load a
-
- mov eax,[esi+4] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+4],eax //load a
-
- mov eax,[esi+8] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+8],eax //load a
-
- mov eax,[esi+12] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+12],eax //load a
-
- mov eax,[esi+16] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+16],eax //load a
-
- mov eax,[esi+20] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+20],eax //load a
-
- mov eax,[esi+24] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov ecx,edx //store carry
- mov [edi+24],eax //load a
-
- mov eax,[esi+28] //load a
- mul [y] //edx(hi):eax(lo)=a*b
- add eax,ecx //sum lo carry
- adc edx,0 //sum hi carry
- mov [edi+28],eax //load a
- mov eax,edx //store carry
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [edi+28],eax //load a
+ mov eax,edx //store carry
#endif
- }
+ }
}
/*