aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author lloyd <[email protected]> 2006-05-28 04:49:23 +0000
committer lloyd <[email protected]> 2006-05-28 04:49:23 +0000
commit ac52e529a9548b9e37bf73eba7c049237495b700 (patch)
tree 18061b1333536648efd9950ea2153cd770ad8077
parent 62b8a5a6e802db6a8fbaadd7bf4d7a74c93dcc47 (diff)
The two-argument linear multiply function in the Visual C++ assembly
was assuming 4-word blocks rather than 8. Fixed, but not yet tested.
-rw-r--r--modules/mp_ia32_msvc/mp_asmi.h102
1 files changed, 69 insertions, 33 deletions
diff --git a/modules/mp_ia32_msvc/mp_asmi.h b/modules/mp_ia32_msvc/mp_asmi.h
index d0f7d7bb6..1658bd899 100644
--- a/modules/mp_ia32_msvc/mp_asmi.h
+++ b/modules/mp_ia32_msvc/mp_asmi.h
@@ -45,8 +45,8 @@ inline word word4_addcarry(word x[4], word carry)
}
}
- /*************************************************
-* Four Word Block Addition, Two Argument *
+/*************************************************
+* Eight Word Block Addition, Two Argument *
*************************************************/
inline word word8_add2(word x[8], const word y[8], word carry)
{
@@ -55,29 +55,38 @@ inline word word8_add2(word x[8], const word y[8], word carry)
mov esi,[y]
xor eax,eax
sub eax,[carry] //force CF=1 iff *carry==1
+
mov eax,[esi]
adc [edx],eax
+
mov eax,[esi+4]
adc [edx+4],eax
+
mov eax,[esi+8]
adc [edx+8],eax
+
mov eax,[esi+12]
adc [edx+12],eax
+
mov eax,[esi+16]
adc [edx+16],eax
+
mov eax,[esi+20]
adc [edx+20],eax
+
mov eax,[esi+24]
adc [edx+24],eax
+
mov eax,[esi+28]
adc [edx+28],eax
+
sbb eax,eax
neg eax
}
}
/*************************************************
-* Four Word Block Addition, Three Argument *
+* Eight Word Block Addition, Three Argument *
*************************************************/
inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
{
@@ -87,35 +96,36 @@ inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
mov ebx,[z]
xor eax,eax
sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
+
+ mov eax,[edi]
adc eax,[esi]
mov [ebx],eax
- mov eax,[edi+4]
+ mov eax,[edi+4]
adc eax,[esi+4]
mov [ebx+4],eax
- mov eax,[edi+8]
+ mov eax,[edi+8]
adc eax,[esi+8]
mov [ebx+8],eax
- mov eax,[edi+12]
+ mov eax,[edi+12]
adc eax,[esi+12]
mov [ebx+12],eax
- mov eax,[edi+16]
+ mov eax,[edi+16]
adc eax,[esi+16]
mov [ebx+16],eax
- mov eax,[edi+20]
+ mov eax,[edi+20]
adc eax,[esi+20]
mov [ebx+20],eax
- mov eax,[edi+24]
+ mov eax,[edi+24]
adc eax,[esi+24]
mov [ebx+24],eax
- mov eax,[edi+28]
+ mov eax,[edi+28]
adc eax,[esi+28]
mov [ebx+28],eax
@@ -137,7 +147,7 @@ inline word word_sub(word x, word y, word* carry)
}
/*************************************************
-* Four Word Block Subtraction, Two Argument *
+* Eight Word Block Subtraction, Two Argument *
*************************************************/
inline word word8_sub2(word x[8], const word y[8], word carry)
{
@@ -146,28 +156,28 @@ inline word word8_sub2(word x[8], const word y[8], word carry)
mov esi,[y]
xor eax,eax
sub eax,[carry] //force CF=1 iff *carry==1
- mov eax,[edi]
+ mov eax,[edi]
sbb eax,[esi]
mov [edi],eax
- mov eax,[edi+4]
+ mov eax,[edi+4]
sbb eax,[esi+4]
mov [edi+4],eax
- mov eax,[edi+8]
+ mov eax,[edi+8]
sbb eax,[esi+8]
mov [edi+8],eax
- mov eax,[edi+12]
+ mov eax,[edi+12]
sbb eax,[esi+12]
mov [edi+12],eax
mov eax,[edi+16]
sbb eax,[esi+16]
mov [edi+16],eax
- mov eax,[edi+20]
+ mov eax,[edi+20]
sbb eax,[esi+20]
mov [edi+20],eax
- mov eax,[edi+24]
+ mov eax,[edi+24]
sbb eax,[esi+24]
mov [edi+24],eax
- mov eax,[edi+28]
+ mov eax,[edi+28]
sbb eax,[esi+28]
mov [edi+28],eax
sbb eax,eax
@@ -176,7 +186,7 @@ inline word word8_sub2(word x[8], const word y[8], word carry)
}
/*************************************************
-* Four Word Block Subtraction, Three Argument *
+* Eight Word Block Subtraction, Three Argument *
*************************************************/
__forceinline word word8_sub3(word z[8], const word x[8],
const word y[8], word carry)
@@ -194,22 +204,22 @@ __forceinline word word8_sub3(word z[8], const word x[8],
mov eax,[edi+4]
sbb eax,[esi+4]
mov [ebx+4],eax
- mov eax,[edi+8]
+ mov eax,[edi+8]
sbb eax,[esi+8]
mov [ebx+8],eax
- mov eax,[edi+12]
+ mov eax,[edi+12]
sbb eax,[esi+12]
mov [ebx+12],eax
- mov eax,[edi+16]
+ mov eax,[edi+16]
sbb eax,[esi+16]
mov [ebx+16],eax
- mov eax,[edi+20]
+ mov eax,[edi+20]
sbb eax,[esi+20]
mov [ebx+20],eax
- mov eax,[edi+24]
+ mov eax,[edi+24]
sbb eax,[esi+24]
mov [ebx+24],eax
- mov eax,[edi+28]
+ mov eax,[edi+28]
sbb eax,[esi+28]
mov [ebx+28],eax
sbb eax,eax
@@ -218,40 +228,66 @@ __forceinline word word8_sub3(word z[8], const word x[8],
}
/*************************************************
-* Four Word Block Linear Multiplication *
+* Eight Word Block Linear Multiplication *
+* *
+* For i = 0..7, in place: x[i] = low word of *
+* (x[i] * y + carry); carry = high word of that *
+* sum. The carry left after x[7] is returned in *
+* eax. ecx holds the running carry between *
+* steps, so every step except the last must *
+* copy edx into ecx before the next mul *
+* overwrites edx. *
*************************************************/
-inline word word4_linmul2(word x[4], word y, word carry)
+inline word word8_linmul2(word x[8], word y, word carry)
{
__asm
{
mov esi,[x]
+
mov eax,[esi] //load a
mul [y] //edx(hi):eax(lo)=a*b
add eax,[carry] //sum lo carry
adc edx,0 //sum hi carry
mov ecx,edx //store carry
- mov [esi],eax //load a
+ mov [esi],eax //store a
mov eax,[esi+4] //load a
mul [y] //edx(hi):eax(lo)=a*b
add eax,ecx //sum lo carry
adc edx,0 //sum hi carry
mov ecx,edx //store carry
- mov [esi+4],eax //load a
+ mov [esi+4],eax //store a
mov eax,[esi+8] //load a
mul [y] //edx(hi):eax(lo)=a*b
add eax,ecx //sum lo carry
adc edx,0 //sum hi carry
mov ecx,edx //store carry
- mov [esi+8],eax //load a
+ mov [esi+8],eax //store a
mov eax,[esi+12] //load a
mul [y] //edx(hi):eax(lo)=a*b
add eax,ecx //sum lo carry
adc edx,0 //sum hi carry
- mov [esi+12],eax //load a
- mov eax,edx //store carry
+ mov ecx,edx //store carry (no longer the last word)
+ mov [esi+12],eax //store a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+16],eax //store a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+20],eax //store a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+24],eax //store a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+28],eax //store a
+
+ mov eax,edx //final carry is the return value (left in eax)
}
}