Add a special handler for subtractions of the form:

x -= y; where abs(x) < abs(y). This change alone increases ECDSA performance by 5 to 15%.
author: lloyd <[email protected]> 2010-03-16 00:21:30 +0000
committer: lloyd <[email protected]> 2010-03-16 00:21:30 +0000
commit: 4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch)
tree: 3f7d40f046f6c3fb12f3a23be909916a099344a3 /src/math/bigint/mp_ia32_msvc
parent: b96139b84113b583a453c820127c4765c6f0f66f (diff)
1 files changed, 394 insertions, 380 deletions
diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
index 4c3027fde..aee457d65 100644
--- a/src/math/bigint/mp_ia32_msvc/mp_asmi.h
+++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h
@@ -1,6 +1,6 @@
 /*
 * Lowest Level MPI Algorithms
-* (C) 1999-2006 Jack Lloyd
+* (C) 1999-2010 Jack Lloyd
 *     2006 Luca Piccarreta
 *
 * Distributed under the terms of the Botan license
@@ -33,28 +33,28 @@ inline word word_add(word x, word y, word* carry)
 inline word word8_add2(word x[8], const word y[8], word carry)
    {
    __asm {
-       mov edx,[x]
-       mov esi,[y]
-       xor eax,eax
-       sub eax,[carry] //force CF=1 iff *carry==1
-       mov eax,[esi]
-       adc [edx],eax
-       mov eax,[esi+4]
-       adc [edx+4],eax
-       mov eax,[esi+8]
-       adc [edx+8],eax
-       mov eax,[esi+12]
-       adc [edx+12],eax
-       mov eax,[esi+16]
-       adc [edx+16],eax
-       mov eax,[esi+20]
-       adc [edx+20],eax
-       mov eax,[esi+24]
-       adc [edx+24],eax
-       mov eax,[esi+28]
-       adc [edx+28],eax
-       sbb eax,eax
-       neg eax
+      mov edx,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry != 0
+      mov eax,[esi]
+      adc [edx],eax
+      mov eax,[esi+4]
+      adc [edx+4],eax
+      mov eax,[esi+8]
+      adc [edx+8],eax
+      mov eax,[esi+12]
+      adc [edx+12],eax
+      mov eax,[esi+16]
+      adc [edx+16],eax
+      mov eax,[esi+20]
+      adc [edx+20],eax
+      mov eax,[esi+24]
+      adc [edx+24],eax
+      mov eax,[esi+28]
+      adc [edx+28],eax
+      sbb eax,eax
+      neg eax
       }
    }
 
@@ -64,46 +64,46 @@ inline word word8_add2(word x[8], const word y[8], word carry)
 inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
    {
     __asm {
-       mov edi,[x]
-       mov esi,[y]
-       mov ebx,[z]
-       xor eax,eax
-       sub eax,[carry] //force CF=1 iff *carry==1
-       mov eax,[edi]
-       adc eax,[esi]
-       mov [ebx],eax
-
-       mov eax,[edi+4]
-       adc eax,[esi+4]
-       mov [ebx+4],eax
-
-       mov eax,[edi+8]
-       adc eax,[esi+8]
-       mov [ebx+8],eax
-
-       mov eax,[edi+12]
-       adc eax,[esi+12]
-       mov [ebx+12],eax
-
-       mov eax,[edi+16]
-       adc eax,[esi+16]
-       mov [ebx+16],eax
-
-       mov eax,[edi+20]
-       adc eax,[esi+20]
-       mov [ebx+20],eax
-
-       mov eax,[edi+24]
-       adc eax,[esi+24]
-       mov [ebx+24],eax
-
-       mov eax,[edi+28]
-       adc eax,[esi+28]
-       mov [ebx+28],eax
-
-       sbb eax,eax
-       neg eax
-       }
+      mov edi,[x]
+      mov esi,[y]
+      mov ebx,[z]
+      xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry != 0
+      mov eax,[edi]
+      adc eax,[esi]
+      mov [ebx],eax
+
+      mov eax,[edi+4]
+      adc eax,[esi+4]
+      mov [ebx+4],eax
+
+      mov eax,[edi+8]
+      adc eax,[esi+8]
+      mov [ebx+8],eax
+
+      mov eax,[edi+12]
+      adc eax,[esi+12]
+      mov [ebx+12],eax
+
+      mov eax,[edi+16]
+      adc eax,[esi+16]
+      mov [ebx+16],eax
+
+      mov eax,[edi+20]
+      adc eax,[esi+20]
+      mov [ebx+20],eax
+
+      mov eax,[edi+24]
+      adc eax,[esi+24]
+      mov [ebx+24],eax
+
+      mov eax,[edi+28]
+      adc eax,[esi+28]
+      mov [ebx+28],eax
+
+      sbb eax,eax
+      neg eax
+      }
    }
 
 /*
@@ -123,40 +123,57 @@ inline word word_sub(word x, word y, word* carry)
 */
 inline word word8_sub2(word x[8], const word y[8], word carry)
    {
-    _asm {
-       mov edi,[x]
-       mov esi,[y]
-       xor eax,eax
-       sub eax,[carry] //force CF=1 iff *carry==1
-       mov eax,[edi]
-       sbb eax,[esi]
-       mov [edi],eax
-       mov eax,[edi+4]
-       sbb eax,[esi+4]
-       mov [edi+4],eax
-       mov eax,[edi+8]
-       sbb eax,[esi+8]
-       mov [edi+8],eax
-       mov eax,[edi+12]
-       sbb eax,[esi+12]
-       mov [edi+12],eax
-       mov eax,[edi+16]
-       sbb eax,[esi+16]
-       mov [edi+16],eax
-       mov eax,[edi+20]
-       sbb eax,[esi+20]
-       mov [edi+20],eax
-       mov eax,[edi+24]
-       sbb eax,[esi+24]
-       mov [edi+24],eax
-       mov eax,[edi+28]
-       sbb eax,[esi+28]
-       mov [edi+28],eax
-       sbb eax,eax
-       neg eax
-    }
+    __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry != 0
+      mov eax,[edi]
+      sbb eax,[esi]
+      mov [edi],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [edi+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [edi+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [edi+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [edi+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [edi+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [edi+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [edi+28],eax
+      sbb eax,eax
+      neg eax
+      }
+   }
+
+/*
+* Eight Word Block Subtraction, Two Argument, operands reversed (x = y - x)
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+   {
+   x[0] = word_sub(y[0], x[0], &carry);
+   x[1] = word_sub(y[1], x[1], &carry);
+   x[2] = word_sub(y[2], x[2], &carry);
+   x[3] = word_sub(y[3], x[3], &carry);
+   x[4] = word_sub(y[4], x[4], &carry);
+   x[5] = word_sub(y[5], x[5], &carry);
+   x[6] = word_sub(y[6], x[6], &carry);
+   x[7] = word_sub(y[7], x[7], &carry);
+   return carry;
    }
 
+
 /*
 * Eight Word Block Subtraction, Three Argument
 */
@@ -164,38 +181,38 @@ inline word word8_sub3(word z[8], const word x[8],
                        const word y[8], word carry)
    {
     __asm {
-       mov edi,[x]
-       mov esi,[y]
-       xor eax,eax
-       sub eax,[carry] //force CF=1 iff *carry==1
-       mov ebx,[z]
-       mov eax,[edi]
-       sbb eax,[esi]
-       mov [ebx],eax
-       mov eax,[edi+4]
-       sbb eax,[esi+4]
-       mov [ebx+4],eax
-       mov eax,[edi+8]
-       sbb eax,[esi+8]
-       mov [ebx+8],eax
-       mov eax,[edi+12]
-       sbb eax,[esi+12]
-       mov [ebx+12],eax
-       mov eax,[edi+16]
-       sbb eax,[esi+16]
-       mov [ebx+16],eax
-       mov eax,[edi+20]
-       sbb eax,[esi+20]
-       mov [ebx+20],eax
-       mov eax,[edi+24]
-       sbb eax,[esi+24]
-       mov [ebx+24],eax
-       mov eax,[edi+28]
-       sbb eax,[esi+28]
-       mov [ebx+28],eax
-       sbb eax,eax
-       neg eax
-       }
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry != 0
+      mov ebx,[z]
+      mov eax,[edi]
+      sbb eax,[esi]
+      mov [ebx],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [ebx+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [ebx+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [ebx+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [ebx+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [ebx+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [ebx+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [ebx+28],eax
+      sbb eax,eax
+      neg eax
+      }
    }
 
 /*
@@ -203,65 +220,64 @@ inline word word8_sub3(word z[8], const word x[8],
 */
 inline word word8_linmul2(word x[8], word y, word carry)
    {
-   __asm
-      {
-       mov esi,[x]
-       mov eax,[esi]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,[carry]      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi],eax        //load a
-
-       mov eax,[esi+4]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+4],eax        //load a
-
-       mov eax,[esi+8]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+8],eax        //load a
-
-       mov eax,[esi+12]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+12],eax        //load a
-
-       mov eax,[esi+16]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+16],eax        //load a
-
-       mov eax,[esi+20]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+20],eax        //load a
-
-       mov eax,[esi+24]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [esi+24],eax        //load a
-
-       mov eax,[esi+28]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov [esi+28],eax        //load a
-
-       mov eax,edx      //store carry
+   __asm {
+      mov esi,[x]
+      mov eax,[esi]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,[carry]      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi],eax        //load a
+
+      mov eax,[esi+4]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+4],eax        //load a
+
+      mov eax,[esi+8]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+8],eax        //load a
+
+      mov eax,[esi+12]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+12],eax        //load a
+
+      mov eax,[esi+16]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+16],eax        //load a
+
+      mov eax,[esi+20]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+20],eax        //load a
+
+      mov eax,[esi+24]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [esi+24],eax        //load a
+
+      mov eax,[esi+28]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov [esi+28],eax        //load a
+
+      mov eax,edx      //store carry
       }
    }
 
@@ -271,207 +287,205 @@ inline word word8_linmul2(word x[8], word y, word carry)
 inline word word8_muladd(word z[8], const word x[8],
                          word y, word carry)
    {
-   __asm
-      {
-       mov esi,[x]
-       mov ebx,[y]
-       mov edi,[z]
-       mov eax,[esi]     //load a
-       mul ebx           //edx(hi):eax(lo)=a*b
-       add eax,[carry]   //sum lo carry
-       adc edx,0         //sum hi carry
-       add eax,[edi]     //sum lo z
-       adc edx,0         //sum hi z
-       mov ecx,edx       //carry for next block = hi z
-       mov [edi],eax     //save lo z
-
-       mov eax,[esi+4]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+4]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+4],eax
-
-       mov eax,[esi+8]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+8]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+8],eax
-
-       mov eax,[esi+12]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+12]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+12],eax
-
-       mov eax,[esi+16]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+16]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+16],eax
-
-       mov eax,[esi+20]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+20]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+20],eax
-
-       mov eax,[esi+24]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+24]
-       adc edx,0
-       mov ecx,edx
-       mov [edi+24],eax
-
-       mov eax,[esi+28]
-       mul ebx
-       add eax,ecx
-       adc edx,0
-       add eax,[edi+28]
-       adc edx,0
-       mov [edi+28],eax
-       mov eax,edx
-   }
+   __asm {
+      mov esi,[x]
+      mov ebx,[y]
+      mov edi,[z]
+      mov eax,[esi]     //load a
+      mul ebx           //edx(hi):eax(lo)=a*b
+      add eax,[carry]   //sum lo carry
+      adc edx,0         //sum hi carry
+      add eax,[edi]     //sum lo z
+      adc edx,0         //sum hi z
+      mov ecx,edx       //carry for next block = hi z
+      mov [edi],eax     //save lo z
+
+      mov eax,[esi+4]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+4]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+4],eax
+
+      mov eax,[esi+8]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+8]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+8],eax
+
+      mov eax,[esi+12]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+12]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+12],eax
+
+      mov eax,[esi+16]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+16]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+16],eax
+
+      mov eax,[esi+20]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+20]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+20],eax
+
+      mov eax,[esi+24]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+24]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+24],eax
+
+      mov eax,[esi+28]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+28]
+      adc edx,0
+      mov [edi+28],eax
+      mov eax,edx
+      }
    }
 
 inline word word8_linmul3(word z[4], const word x[4], word y, word carry)
    {
-   __asm
-   {
+   __asm {
 #if 0
-        //it's slower!!!
-       mov edx,[z]
-       mov eax,[x]
-        movd mm7,[y]
-
-        movd mm0,[eax]
-        movd mm1,[eax+4]
-        movd mm2,[eax+8]
-        pmuludq mm0,mm7
-        pmuludq mm1,mm7
-        pmuludq mm2,mm7
-
-        movd mm6,[carry]
-        paddq mm0,mm6
-        movd [edx],mm0
-
-        psrlq mm0,32
-        paddq mm1,mm0
-        movd [edx+4],mm1
-
-        movd mm3,[eax+12]
-        psrlq mm1,32
-        paddq mm2,mm1
-        movd [edx+8],mm2
-
-        pmuludq mm3,mm7
-        movd mm4,[eax+16]
-        psrlq mm2,32
-        paddq mm3,mm2
-        movd [edx+12],mm3
-
-        pmuludq mm4,mm7
-        movd mm5,[eax+20]
-        psrlq mm3,32
-        paddq mm4,mm3
-        movd [edx+16],mm4
-
-        pmuludq mm5,mm7
-        movd mm0,[eax+24]
-        psrlq mm4,32
-        paddq mm5,mm4
-        movd [edx+20],mm5
-
-        pmuludq mm0,mm7
-        movd mm1,[eax+28]
-        psrlq mm5,32
-        paddq mm0,mm5
-        movd [edx+24],mm0
-
-        pmuludq mm1,mm7
-        psrlq mm0,32
-        paddq mm1,mm0
-        movd [edx+28],mm1
-
-        psrlq mm1,32
-        movd eax,mm1
-        emms
+      //it's slower!!!
+      mov edx,[z]
+      mov eax,[x]
+      movd mm7,[y]
+
+      movd mm0,[eax]
+      movd mm1,[eax+4]
+      movd mm2,[eax+8]
+      pmuludq mm0,mm7
+      pmuludq mm1,mm7
+      pmuludq mm2,mm7
+
+      movd mm6,[carry]
+      paddq mm0,mm6
+      movd [edx],mm0
+
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+4],mm1
+
+      movd mm3,[eax+12]
+      psrlq mm1,32
+      paddq mm2,mm1
+      movd [edx+8],mm2
+
+      pmuludq mm3,mm7
+      movd mm4,[eax+16]
+      psrlq mm2,32
+      paddq mm3,mm2
+      movd [edx+12],mm3
+
+      pmuludq mm4,mm7
+      movd mm5,[eax+20]
+      psrlq mm3,32
+      paddq mm4,mm3
+      movd [edx+16],mm4
+
+      pmuludq mm5,mm7
+      movd mm0,[eax+24]
+      psrlq mm4,32
+      paddq mm5,mm4
+      movd [edx+20],mm5
+
+      pmuludq mm0,mm7
+      movd mm1,[eax+28]
+      psrlq mm5,32
+      paddq mm0,mm5
+      movd [edx+24],mm0
+
+      pmuludq mm1,mm7
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+28],mm1
+      psrlq mm1,32
+
+      movd eax,mm1
+      emms
 #else
-       mov edi,[z]
-       mov esi,[x]
-       mov eax,[esi]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,[carry]    //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi],eax        //load a
-
-       mov eax,[esi+4]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+4],eax        //load a
-
-       mov eax,[esi+8]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+8],eax        //load a
-
-       mov eax,[esi+12]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+12],eax        //load a
-
-       mov eax,[esi+16]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+16],eax        //load a
-
-       mov eax,[esi+20]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+20],eax        //load a
-
-       mov eax,[esi+24]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov ecx,edx      //store carry
-       mov [edi+24],eax        //load a
-
-       mov eax,[esi+28]        //load a
-       mul [y]           //edx(hi):eax(lo)=a*b
-       add eax,ecx      //sum lo carry
-       adc edx,0          //sum hi carry
-       mov [edi+28],eax        //load a
-       mov eax,edx      //store carry
+      mov edi,[z]
+      mov esi,[x]
+      mov eax,[esi]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,[carry]    //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi],eax        //load a
+
+      mov eax,[esi+4]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+4],eax        //load a
+
+      mov eax,[esi+8]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+8],eax        //load a
+
+      mov eax,[esi+12]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+12],eax        //load a
+
+      mov eax,[esi+16]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+16],eax        //load a
+
+      mov eax,[esi+20]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+20],eax        //load a
+
+      mov eax,[esi+24]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov ecx,edx      //store carry
+      mov [edi+24],eax        //load a
+
+      mov eax,[esi+28]        //load a
+      mul [y]           //edx(hi):eax(lo)=a*b
+      add eax,ecx      //sum lo carry
+      adc edx,0          //sum hi carry
+      mov [edi+28],eax        //load a
+      mov eax,edx      //store carry
 #endif
-   }
+      }
    }
 
 /*
author	lloyd <[email protected]>	2010-03-16 00:21:30 +0000
committer	lloyd <[email protected]>	2010-03-16 00:21:30 +0000
commit	4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch)
tree	3f7d40f046f6c3fb12f3a23be909916a099344a3 /src/math/bigint/mp_ia32_msvc
parent	b96139b84113b583a453c820127c4765c6f0f66f (diff)