aboutsummaryrefslogtreecommitdiffstats
path: root/modules/mp_ia32_msvc/mp_asmi.h
diff options
context:
space:
mode:
authorlloyd <[email protected]>2006-05-18 18:33:19 +0000
committerlloyd <[email protected]>2006-05-18 18:33:19 +0000
commita2c99d3270eb73ef2db5704fc54356c6b75096f8 (patch)
treead3d6c4fcc8dd0f403f8105598943616246fe172 /modules/mp_ia32_msvc/mp_asmi.h
Initial checkin1.5.6
Diffstat (limited to 'modules/mp_ia32_msvc/mp_asmi.h')
-rw-r--r--modules/mp_ia32_msvc/mp_asmi.h486
1 files changed, 486 insertions, 0 deletions
diff --git a/modules/mp_ia32_msvc/mp_asmi.h b/modules/mp_ia32_msvc/mp_asmi.h
new file mode 100644
index 000000000..ad1451010
--- /dev/null
+++ b/modules/mp_ia32_msvc/mp_asmi.h
@@ -0,0 +1,486 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2006 The Botan Project *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ word z = x + y;
+ word c1 = (z < x);
+ z += *carry;
+ *carry = c1 | (z < *carry);
+ return z;
+ }
+
+/*************************************************
+* Four Word Block Addition, Two Argument *
+*************************************************/
+inline word word4_addcarry(word x[4], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ adc [edx],0
+ mov eax,[esi+4]
+ adc [edx+4],0
+ mov eax,[esi+8]
+ adc [edx+8],0
+ mov eax,[esi+12]
+ adc [edx+12],0
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+ /*************************************************
+* Four Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Four Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ word t0 = x - y;
+ word c1 = (t0 > x);
+ word z = t0 - *carry;
+ *carry = c1 | (z > t0);
+ return z;
+ }
+
+/*************************************************
+* Four Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ _asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Four Word Block Subtraction, Three Argument *
+*************************************************/
+__forceinline word word8_sub3(word z[8], const word x[8],
+ const word y[8], word carry)
+ {
+ __asm
+ {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Four Word Block Linear Multiplication *
+*************************************************/
+inline word word4_linmul2(word x[4], word y, word carry)
+{
+ __asm
+ {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+12],eax //load a
+ mov eax,edx //store carry
+ }
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+
+__forceinline word word8_muladd(word z[8], const word x[8], word y, word carry)
+ {
+ __asm
+ {
+ mov esi,[x]
+ mov ebx,[y]
+ mov edi,[z]
+ mov eax,[esi] //load a
+ mul ebx //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ add eax,[edi] //sum lo z
+ adc edx,0 //sum hi z
+ mov ecx,edx //carry for next block = hi z
+ mov [edi],eax //save lo z
+
+ mov eax,[esi+4]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+4]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+4],eax
+
+ mov eax,[esi+8]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+8]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+8],eax
+
+ mov eax,[esi+12]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+12]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+12],eax
+
+ mov eax,[esi+16]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+16]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+16],eax
+
+ mov eax,[esi+20]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+20]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+20],eax
+
+ mov eax,[esi+24]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+24]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+24],eax
+
+ mov eax,[esi+28]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+28]
+ adc edx,0
+ mov [edi+28],eax
+ mov eax,edx
+ }
+ }
+
+__forceinline word word8_linmul3(word z[4], const word x[4], word y, word carry)
+ {
+ __asm
+ {
+#if 0
+ //it's slower!!!
+ mov edx,[z]
+ mov eax,[x]
+ movd mm7,[y]
+
+ movd mm0,[eax]
+ movd mm1,[eax+4]
+ movd mm2,[eax+8]
+ pmuludq mm0,mm7
+ pmuludq mm1,mm7
+ pmuludq mm2,mm7
+
+ movd mm6,[carry]
+ paddq mm0,mm6
+ movd [edx],mm0
+
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+4],mm1
+
+ movd mm3,[eax+12]
+ psrlq mm1,32
+ paddq mm2,mm1
+ movd [edx+8],mm2
+
+ pmuludq mm3,mm7
+ movd mm4,[eax+16]
+ psrlq mm2,32
+ paddq mm3,mm2
+ movd [edx+12],mm3
+
+ pmuludq mm4,mm7
+ movd mm5,[eax+20]
+ psrlq mm3,32
+ paddq mm4,mm3
+ movd [edx+16],mm4
+
+ pmuludq mm5,mm7
+ movd mm0,[eax+24]
+ psrlq mm4,32
+ paddq mm5,mm4
+ movd [edx+20],mm5
+
+ pmuludq mm0,mm7
+ movd mm1,[eax+28]
+ psrlq mm5,32
+ paddq mm0,mm5
+ movd [edx+24],mm0
+
+ pmuludq mm1,mm7
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+28],mm1
+
+ psrlq mm1,32
+ movd eax,mm1
+ emms
+#else
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [edi+28],eax //load a
+ mov eax,edx //store carry
+#endif
+ }
+ }
+
+/*************************************************
+* Eight Word Block Multiply-Add *
+*************************************************/
+inline void word8_madd3(word z[], word x, const word y[], word* carry)
+ {
+ word_madd(x, y[0], z[0], *carry, z + 0, carry);
+ word_madd(x, y[1], z[1], *carry, z + 1, carry);
+ word_madd(x, y[2], z[2], *carry, z + 2, carry);
+ word_madd(x, y[3], z[3], *carry, z + 3, carry);
+ word_madd(x, y[4], z[4], *carry, z + 4, carry);
+ word_madd(x, y[5], z[5], *carry, z + 5, carry);
+ word_madd(x, y[6], z[6], *carry, z + 6, carry);
+ word_madd(x, y[7], z[7], *carry, z + 7, carry);
+ }
+
+}
+
+}
+
+#endif