diff options
author | lloyd <[email protected]> | 2010-03-16 00:21:30 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2010-03-16 00:21:30 +0000 |
commit | 4be0c32f1715ca97f6cc85bc1905899f5cd01cef (patch) | |
tree | 3f7d40f046f6c3fb12f3a23be909916a099344a3 /src/math/bigint/mp_ia32_msvc | |
parent | b96139b84113b583a453c820127c4765c6f0f66f (diff) |
Add a special handler for the case of doing a subtraction as in:
x -= y;
where abs(x) < abs(y).
This change alone increases ECDSA performance by 5 to 15%
Diffstat (limited to 'src/math/bigint/mp_ia32_msvc')
-rw-r--r-- | src/math/bigint/mp_ia32_msvc/mp_asmi.h | 774 |
1 files changed, 394 insertions, 380 deletions
diff --git a/src/math/bigint/mp_ia32_msvc/mp_asmi.h b/src/math/bigint/mp_ia32_msvc/mp_asmi.h index 4c3027fde..aee457d65 100644 --- a/src/math/bigint/mp_ia32_msvc/mp_asmi.h +++ b/src/math/bigint/mp_ia32_msvc/mp_asmi.h @@ -1,6 +1,6 @@ /* * Lowest Level MPI Algorithms -* (C) 1999-2006 Jack Lloyd +* (C) 1999-2010 Jack Lloyd * 2006 Luca Piccarreta * * Distributed under the terms of the Botan license @@ -33,28 +33,28 @@ inline word word_add(word x, word y, word* carry) inline word word8_add2(word x[8], const word y[8], word carry) { __asm { - mov edx,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[esi] - adc [edx],eax - mov eax,[esi+4] - adc [edx+4],eax - mov eax,[esi+8] - adc [edx+8],eax - mov eax,[esi+12] - adc [edx+12],eax - mov eax,[esi+16] - adc [edx+16],eax - mov eax,[esi+20] - adc [edx+20],eax - mov eax,[esi+24] - adc [edx+24],eax - mov eax,[esi+28] - adc [edx+28],eax - sbb eax,eax - neg eax + mov edx,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[esi] + adc [edx],eax + mov eax,[esi+4] + adc [edx+4],eax + mov eax,[esi+8] + adc [edx+8],eax + mov eax,[esi+12] + adc [edx+12],eax + mov eax,[esi+16] + adc [edx+16],eax + mov eax,[esi+20] + adc [edx+20],eax + mov eax,[esi+24] + adc [edx+24],eax + mov eax,[esi+28] + adc [edx+28],eax + sbb eax,eax + neg eax } } @@ -64,46 +64,46 @@ inline word word8_add2(word x[8], const word y[8], word carry) inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) { __asm { - mov edi,[x] - mov esi,[y] - mov ebx,[z] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[edi] - adc eax,[esi] - mov [ebx],eax - - mov eax,[edi+4] - adc eax,[esi+4] - mov [ebx+4],eax - - mov eax,[edi+8] - adc eax,[esi+8] - mov [ebx+8],eax - - mov eax,[edi+12] - adc eax,[esi+12] - mov [ebx+12],eax - - mov eax,[edi+16] - adc eax,[esi+16] - mov [ebx+16],eax - - mov eax,[edi+20] - adc eax,[esi+20] - mov [ebx+20],eax - - mov eax,[edi+24] - adc eax,[esi+24] - mov [ebx+24],eax - - mov eax,[edi+28] - adc eax,[esi+28] - mov [ebx+28],eax - - sbb eax,eax - neg eax - } + mov edi,[x] + mov esi,[y] + mov ebx,[z] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + adc eax,[esi] + mov [ebx],eax + + mov eax,[edi+4] + adc eax,[esi+4] + mov [ebx+4],eax + + mov eax,[edi+8] + adc eax,[esi+8] + mov [ebx+8],eax + + mov eax,[edi+12] + adc eax,[esi+12] + mov [ebx+12],eax + + mov eax,[edi+16] + adc eax,[esi+16] + mov [ebx+16],eax + + mov eax,[edi+20] + adc eax,[esi+20] + mov [ebx+20],eax + + mov eax,[edi+24] + adc eax,[esi+24] + mov [ebx+24],eax + + mov eax,[edi+28] + adc eax,[esi+28] + mov [ebx+28],eax + + sbb eax,eax + neg eax + } } /* @@ -123,40 +123,57 @@ inline word word_sub(word x, word y, word* carry) */ inline word word8_sub2(word x[8], const word y[8], word carry) { - _asm { - mov edi,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov eax,[edi] - sbb eax,[esi] - mov [edi],eax - mov eax,[edi+4] - sbb eax,[esi+4] - mov [edi+4],eax - mov eax,[edi+8] - sbb eax,[esi+8] - mov [edi+8],eax - mov eax,[edi+12] - sbb eax,[esi+12] - mov [edi+12],eax - mov eax,[edi+16] - sbb eax,[esi+16] - mov [edi+16],eax - mov eax,[edi+20] - sbb eax,[esi+20] - mov [edi+20],eax - mov eax,[edi+24] - sbb eax,[esi+24] - mov [edi+24],eax - mov eax,[edi+28] - sbb eax,[esi+28] - mov [edi+28],eax - sbb eax,eax - neg eax - } + __asm { + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov eax,[edi] + sbb eax,[esi] + mov [edi],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [edi+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [edi+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [edi+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [edi+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [edi+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [edi+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [edi+28],eax + sbb eax,eax + neg eax + } + } + +/* +* Eight Word Block Subtraction, Two Argument +*/ +inline word word8_sub2_rev(word x[8], const word y[8], word carry) + { + x[0] = word_sub(y[0], x[0], &carry); + x[1] = word_sub(y[1], x[1], &carry); + x[2] = word_sub(y[2], x[2], &carry); + x[3] = word_sub(y[3], x[3], &carry); + x[4] = word_sub(y[4], x[4], &carry); + x[5] = word_sub(y[5], x[5], &carry); + x[6] = word_sub(y[6], x[6], &carry); + x[7] = word_sub(y[7], x[7], &carry); + return carry; } + /* * Eight Word Block Subtraction, Three Argument */ @@ -164,38 +181,38 @@ inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) { __asm { - mov edi,[x] - mov esi,[y] - xor eax,eax - sub eax,[carry] //force CF=1 iff *carry==1 - mov ebx,[z] - mov eax,[edi] - sbb eax,[esi] - mov [ebx],eax - mov eax,[edi+4] - sbb eax,[esi+4] - mov [ebx+4],eax - mov eax,[edi+8] - sbb eax,[esi+8] - mov [ebx+8],eax - mov eax,[edi+12] - sbb eax,[esi+12] - mov [ebx+12],eax - mov eax,[edi+16] - sbb eax,[esi+16] - mov [ebx+16],eax - mov eax,[edi+20] - sbb eax,[esi+20] - mov [ebx+20],eax - mov eax,[edi+24] - sbb eax,[esi+24] - mov [ebx+24],eax - mov eax,[edi+28] - sbb eax,[esi+28] - mov [ebx+28],eax - sbb eax,eax - neg eax - } + mov edi,[x] + mov esi,[y] + xor eax,eax + sub eax,[carry] //force CF=1 iff *carry==1 + mov ebx,[z] + mov eax,[edi] + sbb eax,[esi] + mov [ebx],eax + mov eax,[edi+4] + sbb eax,[esi+4] + mov [ebx+4],eax + mov eax,[edi+8] + sbb eax,[esi+8] + mov [ebx+8],eax + mov eax,[edi+12] + sbb eax,[esi+12] + mov [ebx+12],eax + mov eax,[edi+16] + sbb eax,[esi+16] + mov [ebx+16],eax + mov eax,[edi+20] + sbb eax,[esi+20] + mov [ebx+20],eax + mov eax,[edi+24] + sbb eax,[esi+24] + mov [ebx+24],eax + mov eax,[edi+28] + sbb eax,[esi+28] + mov [ebx+28],eax + sbb eax,eax + neg eax + } } /* @@ -203,65 +220,64 @@ inline word word8_sub3(word z[8], const word x[8], */ inline word word8_linmul2(word x[8], word y, word carry) { - __asm - { - mov esi,[x] - mov eax,[esi] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi],eax //load a - - mov eax,[esi+4] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+4],eax //load a - - mov eax,[esi+8] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+8],eax //load a - - mov eax,[esi+12] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+12],eax //load a - - mov eax,[esi+16] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+16],eax //load a - - mov eax,[esi+20] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+20],eax //load a - - mov eax,[esi+24] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [esi+24],eax //load a - - mov eax,[esi+28] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov [esi+28],eax //load a - - mov eax,edx //store carry + __asm { + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [esi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [esi+28],eax //load a + + mov eax,edx //store carry } } @@ -271,207 +287,205 @@ inline word word8_linmul2(word x[8], word y, word carry) inline word word8_muladd(word z[8], const word x[8], word y, word carry) { - __asm - { - mov esi,[x] - mov ebx,[y] - mov edi,[z] - mov eax,[esi] //load a - mul ebx //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - add eax,[edi] //sum lo z - adc edx,0 //sum hi z - mov ecx,edx //carry for next block = hi z - mov [edi],eax //save lo z - - mov eax,[esi+4] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+4] - adc edx,0 - mov ecx,edx - mov [edi+4],eax - - mov eax,[esi+8] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+8] - adc edx,0 - mov ecx,edx - mov [edi+8],eax - - mov eax,[esi+12] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+12] - adc edx,0 - mov ecx,edx - mov [edi+12],eax - - mov eax,[esi+16] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+16] - adc edx,0 - mov ecx,edx - mov [edi+16],eax - - mov eax,[esi+20] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+20] - adc edx,0 - mov ecx,edx - mov [edi+20],eax - - mov eax,[esi+24] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+24] - adc edx,0 - mov ecx,edx - mov [edi+24],eax - - mov eax,[esi+28] - mul ebx - add eax,ecx - adc edx,0 - add eax,[edi+28] - adc edx,0 - mov [edi+28],eax - mov eax,edx - } + __asm { + mov esi,[x] + mov ebx,[y] + mov edi,[z] + mov eax,[esi] //load a + mul ebx //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + add eax,[edi] //sum lo z + adc edx,0 //sum hi z + mov ecx,edx //carry for next block = hi z + mov [edi],eax //save lo z + + mov eax,[esi+4] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+4] + adc edx,0 + mov ecx,edx + mov [edi+4],eax + + mov eax,[esi+8] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+8] + adc edx,0 + mov ecx,edx + mov [edi+8],eax + + mov eax,[esi+12] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+12] + adc edx,0 + mov ecx,edx + mov [edi+12],eax + + mov eax,[esi+16] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+16] + adc edx,0 + mov ecx,edx + mov [edi+16],eax + + mov eax,[esi+20] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+20] + adc edx,0 + mov ecx,edx + mov [edi+20],eax + + mov eax,[esi+24] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+24] + adc edx,0 + mov ecx,edx + mov [edi+24],eax + + mov eax,[esi+28] + mul ebx + add eax,ecx + adc edx,0 + add eax,[edi+28] + adc edx,0 + mov [edi+28],eax + mov eax,edx + } } inline word word8_linmul3(word z[4], const word x[4], word y, word carry) { - __asm - { + __asm { #if 0 - //it's slower!!! - mov edx,[z] - mov eax,[x] - movd mm7,[y] - - movd mm0,[eax] - movd mm1,[eax+4] - movd mm2,[eax+8] - pmuludq mm0,mm7 - pmuludq mm1,mm7 - pmuludq mm2,mm7 - - movd mm6,[carry] - paddq mm0,mm6 - movd [edx],mm0 - - psrlq mm0,32 - paddq mm1,mm0 - movd [edx+4],mm1 - - movd mm3,[eax+12] - psrlq mm1,32 - paddq mm2,mm1 - movd [edx+8],mm2 - - pmuludq mm3,mm7 - movd mm4,[eax+16] - psrlq mm2,32 - paddq mm3,mm2 - movd [edx+12],mm3 - - pmuludq mm4,mm7 - movd mm5,[eax+20] - psrlq mm3,32 - paddq mm4,mm3 - movd [edx+16],mm4 - - pmuludq mm5,mm7 - movd mm0,[eax+24] - psrlq mm4,32 - paddq mm5,mm4 - movd [edx+20],mm5 - - pmuludq mm0,mm7 - movd mm1,[eax+28] - psrlq mm5,32 - paddq mm0,mm5 - movd [edx+24],mm0 - - pmuludq mm1,mm7 - psrlq mm0,32 - paddq mm1,mm0 - movd [edx+28],mm1 - - psrlq mm1,32 - movd eax,mm1 - emms + //it's slower!!! + mov edx,[z] + mov eax,[x] + movd mm7,[y] + + movd mm0,[eax] + movd mm1,[eax+4] + movd mm2,[eax+8] + pmuludq mm0,mm7 + pmuludq mm1,mm7 + pmuludq mm2,mm7 + + movd mm6,[carry] + paddq mm0,mm6 + movd [edx],mm0 + + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+4],mm1 + + movd mm3,[eax+12] + psrlq mm1,32 + paddq mm2,mm1 + movd [edx+8],mm2 + + pmuludq mm3,mm7 + movd mm4,[eax+16] + psrlq mm2,32 + paddq mm3,mm2 + movd [edx+12],mm3 + + pmuludq mm4,mm7 + movd mm5,[eax+20] + psrlq mm3,32 + paddq mm4,mm3 + movd [edx+16],mm4 + + pmuludq mm5,mm7 + movd mm0,[eax+24] + psrlq mm4,32 + paddq mm5,mm4 + movd [edx+20],mm5 + + pmuludq mm0,mm7 + movd mm1,[eax+28] + psrlq mm5,32 + paddq mm0,mm5 + movd [edx+24],mm0 + + pmuludq mm1,mm7 + psrlq mm0,32 + paddq mm1,mm0 + movd [edx+28],mm1 + psrlq mm1,32 + + movd eax,mm1 + emms #else - mov edi,[z] - mov esi,[x] - mov eax,[esi] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,[carry] //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi],eax //load a - - mov eax,[esi+4] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+4],eax //load a - - mov eax,[esi+8] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+8],eax //load a - - mov eax,[esi+12] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+12],eax //load a - - mov eax,[esi+16] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+16],eax //load a - - mov eax,[esi+20] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+20],eax //load a - - mov eax,[esi+24] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov ecx,edx //store carry - mov [edi+24],eax //load a - - mov eax,[esi+28] //load a - mul [y] //edx(hi):eax(lo)=a*b - add eax,ecx //sum lo carry - adc edx,0 //sum hi carry - mov [edi+28],eax //load a - mov eax,edx //store carry + mov edi,[z] + mov esi,[x] + mov eax,[esi] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,[carry] //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi],eax //load a + + mov eax,[esi+4] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+4],eax //load a + + mov eax,[esi+8] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+8],eax //load a + + mov eax,[esi+12] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+12],eax //load a + + mov eax,[esi+16] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+16],eax //load a + + mov eax,[esi+20] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+20],eax //load a + + mov eax,[esi+24] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov ecx,edx //store carry + mov [edi+24],eax //load a + + mov eax,[esi+28] //load a + mul [y] //edx(hi):eax(lo)=a*b + add eax,ecx //sum lo carry + adc edx,0 //sum hi carry + mov [edi+28],eax //load a + mov eax,edx //store carry #endif - } + } } /* |