Initial checkin 1.5.6

author: lloyd <[email protected]> 2006-05-18 18:33:19 +0000
committer: lloyd <[email protected]> 2006-05-18 18:33:19 +0000
commit: a2c99d3270eb73ef2db5704fc54356c6b75096f8 (patch)
tree: ad3d6c4fcc8dd0f403f8105598943616246fe172 /modules/mp_ia32_msvc/mp_asmi.h
1 files changed, 486 insertions, 0 deletions
diff --git a/modules/mp_ia32_msvc/mp_asmi.h b/modules/mp_ia32_msvc/mp_asmi.h
new file mode 100644
index 000000000..ad1451010
--- /dev/null
+++ b/modules/mp_ia32_msvc/mp_asmi.h
@@ -0,0 +1,486 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File        *
+* (C) 1999-2006 The Botan Project                *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Word Addition                                  *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+   {  // returns x + y + *carry (wrapping); *carry becomes the carry out, 0 or 1
+   const word carry_in = *carry;
+   const word partial = x + y;
+   const word sum = partial + carry_in;
+   *carry = (partial < x) | (sum < carry_in);
+   return sum;
+   }
+
+/*************************************************
+* Four Word Block Carry Propagation              *
+*************************************************/
+inline word word4_addcarry(word x[4], word carry)
+   {
+   // Adds the incoming carry to x[0..3] in place. The carry out (0 or 1) is
+   // left in EAX, which serves as the return value; there is no C-level
+   // return. ESI is never loaded here, so nothing may be read through it.
+   __asm {
+       mov edx,[x]                // edx = x, the words updated in place
+       xor eax,eax
+       sub eax,[carry]            // 0 - carry borrows, so CF=1 iff carry != 0
+       adc dword ptr [edx],0      // size spelled out: no register implies it
+       adc dword ptr [edx+4],0    // each adc adds CF and produces the next CF
+       adc dword ptr [edx+8],0
+       adc dword ptr [edx+12],0
+       sbb eax,eax                // eax = 0 - CF (0 or all ones)
+       neg eax                    // eax = final CF as 0 or 1
+   }
+   }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument        *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+   {  // x[0..7] += y[0..7] + carry, in place; the carry out is left in EAX
+      __asm {
+       mov edx,[x]          // edx = x, the words updated in place
+       mov esi,[y]          // esi = y, the words being added
+       xor eax,eax
+       sub eax,[carry]      // 0 - carry borrows, so CF=1 iff carry != 0
+       mov eax,[esi]        // mov leaves CF alone, so the chain survives loads
+       adc [edx],eax        // x[0] += y[0] + CF (offsets step by 4 bytes)
+       mov eax,[esi+4]
+       adc [edx+4],eax      // x[1]
+       mov eax,[esi+8]
+       adc [edx+8],eax      // x[2]
+       mov eax,[esi+12]
+       adc [edx+12],eax     // x[3]
+       mov eax,[esi+16]
+       adc [edx+16],eax     // x[4]
+       mov eax,[esi+20]
+       adc [edx+20],eax     // x[5]
+       mov eax,[esi+24]
+       adc [edx+24],eax     // x[6]
+       mov eax,[esi+28]
+       adc [edx+28],eax     // x[7]
+       sbb eax,eax          // eax = 0 - CF (0 or all ones)
+       neg eax              // eax = final CF as 0 or 1; no C-level return
+   }
+   }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument      *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+   {  // z[0..7] = x[0..7] + y[0..7] + carry; the carry out is left in EAX
+       __asm {
+       mov edi,[x]          // edi = x, first addend
+       mov esi,[y]          // esi = y, second addend
+       mov ebx,[z]          // ebx = z, destination
+       xor eax,eax
+       sub eax,[carry]      // 0 - carry borrows, so CF=1 iff carry != 0
+       mov eax,[edi]        // mov leaves CF alone, so the chain survives
+       adc eax,[esi]        // x[0] + y[0] + CF
+       mov [ebx],eax        // store z[0]
+
+       mov eax,[edi+4]
+       adc eax,[esi+4]
+       mov [ebx+4],eax      // store z[1]
+
+       mov eax,[edi+8]
+       adc eax,[esi+8]
+       mov [ebx+8],eax      // store z[2]
+
+       mov eax,[edi+12]
+       adc eax,[esi+12]
+       mov [ebx+12],eax     // store z[3]
+
+       mov eax,[edi+16]
+       adc eax,[esi+16]
+       mov [ebx+16],eax     // store z[4]
+
+       mov eax,[edi+20]
+       adc eax,[esi+20]
+       mov [ebx+20],eax     // store z[5]
+
+       mov eax,[edi+24]
+       adc eax,[esi+24]
+       mov [ebx+24],eax     // store z[6]
+
+       mov eax,[edi+28]
+       adc eax,[esi+28]
+       mov [ebx+28],eax     // store z[7]
+
+       sbb eax,eax          // eax = 0 - CF (0 or all ones)
+       neg eax              // eax = final CF as 0 or 1; no C-level return
+       }
+   }
+
+/*************************************************
+* Word Subtraction                               *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+   {  // returns x - y - *carry (wrapping); *carry becomes the borrow out, 0 or 1
+   const word diff = x - y;
+   const word result = diff - *carry;
+   const word borrow = (diff > x) | (result > diff);
+   *carry = borrow;
+   return result;
+   }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument     *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+   {  // x[0..7] -= y[0..7] + carry, in place; the borrow out is left in EAX
+    _asm {
+       mov edi,[x]          // edi = x, the words updated in place
+       mov esi,[y]          // esi = y, the words being subtracted
+       xor eax,eax
+       sub eax,[carry]      // 0 - carry borrows, so CF=1 iff carry != 0
+       mov eax,[edi]        // mov leaves CF alone, so the chain survives
+       sbb eax,[esi]        // x[0] - y[0] - CF
+       mov [edi],eax        // store x[0]
+       mov eax,[edi+4]
+       sbb eax,[esi+4]
+       mov [edi+4],eax      // store x[1]
+       mov eax,[edi+8]
+       sbb eax,[esi+8]
+       mov [edi+8],eax      // store x[2]
+       mov eax,[edi+12]
+       sbb eax,[esi+12]
+       mov [edi+12],eax     // store x[3]
+       mov eax,[edi+16]
+       sbb eax,[esi+16]
+       mov [edi+16],eax     // store x[4]
+       mov eax,[edi+20]
+       sbb eax,[esi+20]
+       mov [edi+20],eax     // store x[5]
+       mov eax,[edi+24]
+       sbb eax,[esi+24]
+       mov [edi+24],eax     // store x[6]
+       mov eax,[edi+28]
+       sbb eax,[esi+28]
+       mov [edi+28],eax     // store x[7]
+       sbb eax,eax          // eax = 0 - CF (0 or all ones)
+       neg eax              // eax = final borrow as 0 or 1; no C-level return
+    }
+   }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument   *
+*************************************************/
+__forceinline word word8_sub3(word z[8], const word x[8],
+                              const word y[8], word carry)
+   {  // z[0..7] = x[0..7] - y[0..7] - carry; the borrow out is left in EAX
+   __asm
+      {
+      mov edi,[x]           // edi = x, the minuend words
+       mov esi,[y]          // esi = y, the subtrahend words
+       xor eax,eax
+       sub eax,[carry]      // 0 - carry borrows, so CF=1 iff carry != 0
+       mov ebx,[z]          // ebx = z, destination; mov leaves CF alone
+       mov eax,[edi]
+       sbb eax,[esi]        // x[0] - y[0] - CF
+       mov [ebx],eax        // store z[0]
+       mov eax,[edi+4]
+       sbb eax,[esi+4]
+       mov [ebx+4],eax      // store z[1]
+       mov eax,[edi+8]
+       sbb eax,[esi+8]
+       mov [ebx+8],eax      // store z[2]
+       mov eax,[edi+12]
+       sbb eax,[esi+12]
+       mov [ebx+12],eax     // store z[3]
+       mov eax,[edi+16]
+       sbb eax,[esi+16]
+       mov [ebx+16],eax     // store z[4]
+       mov eax,[edi+20]
+       sbb eax,[esi+20]
+       mov [ebx+20],eax     // store z[5]
+       mov eax,[edi+24]
+       sbb eax,[esi+24]
+       mov [ebx+24],eax     // store z[6]
+       mov eax,[edi+28]
+       sbb eax,[esi+28]
+       mov [ebx+28],eax     // store z[7]
+       sbb eax,eax          // eax = 0 - CF (0 or all ones)
+       neg eax              // eax = final borrow as 0 or 1; no C-level return
+       }
+   }
+
+/*************************************************
+* Four Word Block Linear Multiplication          *
+*************************************************/
+inline word word4_linmul2(word x[4], word y, word carry)
+{  // x[i] = low word of (x[i]*y + carry-in), i = 0..3; last high word left in EAX
+   __asm
+   {
+       mov esi,[x]          // esi = x, multiplied in place
+       mov eax,[esi]        // load x[0]
+       mul [y]              // edx(hi):eax(lo) = x[0]*y
+       add eax,[carry]      // add the incoming carry to the low word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [esi],eax        // store x[0]
+
+       mov eax,[esi+4]      // load x[1]
+       mul [y]              // edx(hi):eax(lo) = x[1]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [esi+4],eax      // store x[1]
+
+       mov eax,[esi+8]      // load x[2]
+       mul [y]              // edx(hi):eax(lo) = x[2]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [esi+8],eax      // store x[2]
+
+       mov eax,[esi+12]     // load x[3]
+       mul [y]              // edx(hi):eax(lo) = x[3]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov [esi+12],eax     // store x[3]
+       mov eax,edx          // eax = outgoing carry; no C-level return
+   }
+   }
+
+/*************************************************
+* Eight Word Block Linear Multiply-Add           *
+*************************************************/
+
+__forceinline word word8_muladd(word z[8], const word x[8], word y, word carry)
+   {  // z[i] = low word of (z[i] + x[i]*y + carry-in), i = 0..7; last high word in EAX
+   __asm
+   {
+       mov esi,[x]       // esi = x, the multiplicand words
+       mov ebx,[y]       // ebx = y, the single-word multiplier
+       mov edi,[z]       // edi = z, the accumulator updated in place
+       mov eax,[esi]     // load x[0]
+       mul ebx           // edx(hi):eax(lo) = x[0]*y
+       add eax,[carry]   // add the incoming carry to the low word
+       adc edx,0         // fold the resulting carry into the high word
+       add eax,[edi]     // add the old z[0] to the low word
+       adc edx,0         // fold that carry into the high word as well
+       mov ecx,edx       // ecx = carry into the next word
+       mov [edi],eax     // store z[0]
+
+       mov eax,[esi+4]   // word 1: same seven steps, carry taken from ecx
+       mul ebx           
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+4]  
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+4],eax  
+
+       mov eax,[esi+8]   // word 2
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+8]  
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+8],eax  
+
+       mov eax,[esi+12]  // word 3
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+12] 
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+12],eax 
+
+       mov eax,[esi+16]  // word 4
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+16] 
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+16],eax 
+
+       mov eax,[esi+20]  // word 5
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+20] 
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+20],eax 
+
+       mov eax,[esi+24]  // word 6
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+24] 
+       adc edx,0        
+       mov ecx,edx      
+       mov [edi+24],eax 
+
+       mov eax,[esi+28]  // word 7
+       mul ebx          
+       add eax,ecx      
+       adc edx,0        
+       add eax,[edi+28] 
+       adc edx,0        
+       mov [edi+28],eax  // store z[7]
+       mov eax,edx       // eax = outgoing carry; no C-level return
+   }
+   }
+
+__forceinline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+   {  // z[i] = low word of (x[i]*y + carry-in), i = 0..7; last high word left in EAX
+   __asm  // eight words are read and written (offsets 0..28), hence the [8] bounds
+   {
+#if 0
+        // disabled MMX (pmuludq) variant, kept for reference; noted as slower
+       mov edx,[z]
+       mov eax,[x]
+        movd mm7,[y]
+
+        movd mm0,[eax]
+        movd mm1,[eax+4]
+        movd mm2,[eax+8]
+        pmuludq mm0,mm7
+        pmuludq mm1,mm7
+        pmuludq mm2,mm7
+
+        movd mm6,[carry]
+        paddq mm0,mm6
+        movd [edx],mm0
+
+        psrlq mm0,32
+        paddq mm1,mm0
+        movd [edx+4],mm1
+
+        movd mm3,[eax+12]
+        psrlq mm1,32
+        paddq mm2,mm1
+        movd [edx+8],mm2
+
+        pmuludq mm3,mm7
+        movd mm4,[eax+16]
+        psrlq mm2,32
+        paddq mm3,mm2
+        movd [edx+12],mm3
+
+        pmuludq mm4,mm7
+        movd mm5,[eax+20]
+        psrlq mm3,32
+        paddq mm4,mm3
+        movd [edx+16],mm4
+
+        pmuludq mm5,mm7
+        movd mm0,[eax+24]
+        psrlq mm4,32
+        paddq mm5,mm4
+        movd [edx+20],mm5
+
+        pmuludq mm0,mm7
+        movd mm1,[eax+28]
+        psrlq mm5,32
+        paddq mm0,mm5
+        movd [edx+24],mm0
+
+        pmuludq mm1,mm7
+        psrlq mm0,32
+        paddq mm1,mm0
+        movd [edx+28],mm1
+
+        psrlq mm1,32
+        movd eax,mm1
+        emms
+#else
+       mov edi,[z]          // edi = z, destination words
+       mov esi,[x]          // esi = x, source words
+       mov eax,[esi]        // load x[0]
+       mul [y]              // edx(hi):eax(lo) = x[0]*y
+       add eax,[carry]      // add the incoming carry to the low word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi],eax        // store z[0]
+
+       mov eax,[esi+4]      // load x[1]
+       mul [y]              // edx(hi):eax(lo) = x[1]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+4],eax      // store z[1]
+
+       mov eax,[esi+8]      // load x[2]
+       mul [y]              // edx(hi):eax(lo) = x[2]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+8],eax      // store z[2]
+
+       mov eax,[esi+12]     // load x[3]
+       mul [y]              // edx(hi):eax(lo) = x[3]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+12],eax     // store z[3]
+
+       mov eax,[esi+16]     // load x[4]
+       mul [y]              // edx(hi):eax(lo) = x[4]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+16],eax     // store z[4]
+
+       mov eax,[esi+20]     // load x[5]
+       mul [y]              // edx(hi):eax(lo) = x[5]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+20],eax     // store z[5]
+
+       mov eax,[esi+24]     // load x[6]
+       mul [y]              // edx(hi):eax(lo) = x[6]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov ecx,edx          // ecx = carry into the next word
+       mov [edi+24],eax     // store z[6]
+
+       mov eax,[esi+28]     // load x[7]
+       mul [y]              // edx(hi):eax(lo) = x[7]*y
+       add eax,ecx          // add the carry from the previous word
+       adc edx,0            // fold the resulting carry into the high word
+       mov [edi+28],eax     // store z[7]
+       mov eax,edx          // eax = outgoing carry; no C-level return
+#endif
+   }
+   }
+
+/*************************************************
+* Eight Word Block Multiply-Add                  *
+*************************************************/
+inline void word8_madd3(word z[], word x, const word y[], word* carry)
+   {
+   // Walks the eight words in ascending order. Each step hands x, y[i], the
+   // current z[i] and the running *carry to word_madd, passing z + i and
+   // carry as its output locations. word_madd is defined outside this file,
+   // so its arithmetic is not restated here.
+   for(int i = 0; i != 8; ++i)
+      {
+      word_madd(x, y[i], z[i], *carry, z + i, carry);
+      }
+   }
+
+}
+
+}
+
+#endif
author	lloyd <[email protected]>	2006-05-18 18:33:19 +0000
committer	lloyd <[email protected]>	2006-05-18 18:33:19 +0000
commit	a2c99d3270eb73ef2db5704fc54356c6b75096f8 (patch)
tree	ad3d6c4fcc8dd0f403f8105598943616246fe172 /modules/mp_ia32_msvc/mp_asmi.h