aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/math/mp/mp_asmi.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/math/mp/mp_asmi.h')
-rw-r--r--src/lib/math/mp/mp_asmi.h820
1 files changed, 820 insertions, 0 deletions
diff --git a/src/lib/math/mp/mp_asmi.h b/src/lib/math/mp/mp_asmi.h
new file mode 100644
index 000000000..afb4d1407
--- /dev/null
+++ b/src/lib/math/mp/mp_asmi.h
@@ -0,0 +1,820 @@
+/*
+* Lowest Level MPI Algorithms
+* (C) 1999-2010 Jack Lloyd
+* 2006 Luca Piccarreta
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/internal/mp_madd.h>
+
+namespace Botan {
+
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \
+ ASM("movl %[carry], 4*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+ ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("addl 4*" #INDEX "(%[z]),%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+ ASM("movl %%eax, 4*" #INDEX " (%[z])")
+
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorl %[carry]") \
+ CORE_CODE \
+ ASM("sbbl %[carry],%[carry]") \
+ ASM("negl %[carry]")
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]") \
+ ASM("movq %[carry], 8*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+ ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("addq 8*" #INDEX "(%[z]),%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+ ASM("movq %%rax, 8*" #INDEX " (%[z])")
+
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorq %[carry]") \
+ CORE_CODE \
+ ASM("sbbq %[carry],%[carry]") \
+ ASM("negq %[carry]")
+
+#endif
+
+#if defined(ADD_OR_SUBTRACT)
+
+#define ASM(x) x "\n\t"
+
+#define DO_8_TIMES(MACRO, ARG) \
+ MACRO(ARG, 0) \
+ MACRO(ARG, 1) \
+ MACRO(ARG, 2) \
+ MACRO(ARG, 3) \
+ MACRO(ARG, 4) \
+ MACRO(ARG, 5) \
+ MACRO(ARG, 6) \
+ MACRO(ARG, 7)
+
+#endif
+
+/*
+* Word Addition
+*/
+inline word word_add(word x, word y, word* carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+
+#else
+ word z = x + y;
+ word c1 = (z < x);
+ z += *carry;
+ *carry = c1 | (z < *carry);
+ return z;
+#endif
+ }
+
+/*
+* Eight Word Block Addition, Two Argument
+*/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+
+#else
+ x[0] = word_add(x[0], y[0], &carry);
+ x[1] = word_add(x[1], y[1], &carry);
+ x[2] = word_add(x[2], y[2], &carry);
+ x[3] = word_add(x[3], y[3], &carry);
+ x[4] = word_add(x[4], y[4], &carry);
+ x[5] = word_add(x[5], y[5], &carry);
+ x[6] = word_add(x[6], y[6], &carry);
+ x[7] = word_add(x[7], y[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Addition, Three Argument
+*/
+inline word word8_add3(word z[8], const word x[8],
+ const word y[8], word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
+
+#else
+ z[0] = word_add(x[0], y[0], &carry);
+ z[1] = word_add(x[1], y[1], &carry);
+ z[2] = word_add(x[2], y[2], &carry);
+ z[3] = word_add(x[3], y[3], &carry);
+ z[4] = word_add(x[4], y[4], &carry);
+ z[5] = word_add(x[5], y[5], &carry);
+ z[6] = word_add(x[6], y[6], &carry);
+ z[7] = word_add(x[7], y[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Word Subtraction
+*/
+inline word word_sub(word x, word y, word* carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+
+#else
+ word t0 = x - y;
+ word c1 = (t0 > x);
+ word z = t0 - *carry;
+ *carry = c1 | (z > t0);
+ return z;
+#endif
+ }
+
+/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+
+#else
+ x[0] = word_sub(x[0], y[0], &carry);
+ x[1] = word_sub(x[1], y[1], &carry);
+ x[2] = word_sub(x[2], y[2], &carry);
+ x[3] = word_sub(x[3], y[3], &carry);
+ x[4] = word_sub(x[4], y[4], &carry);
+ x[5] = word_sub(x[5], y[5], &carry);
+ x[6] = word_sub(x[6], y[6], &carry);
+ x[7] = word_sub(x[7], y[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Subtraction, Two Argument
+*/
+inline word word8_sub2_rev(word x[8], const word y[8], word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(y), [y]"r"(x), [z]"r"(x), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#else
+ x[0] = word_sub(y[0], x[0], &carry);
+ x[1] = word_sub(y[1], x[1], &carry);
+ x[2] = word_sub(y[2], x[2], &carry);
+ x[3] = word_sub(y[3], x[3], &carry);
+ x[4] = word_sub(y[4], x[4], &carry);
+ x[5] = word_sub(y[5], x[5], &carry);
+ x[6] = word_sub(y[6], x[6], &carry);
+ x[7] = word_sub(y[7], x[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Subtraction, Three Argument
+*/
+inline word word8_sub3(word z[8], const word x[8],
+ const word y[8], word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff *carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+
+#else
+ z[0] = word_sub(x[0], y[0], &carry);
+ z[1] = word_sub(x[1], y[1], &carry);
+ z[2] = word_sub(x[2], y[2], &carry);
+ z[3] = word_sub(x[3], y[3], &carry);
+ z[4] = word_sub(x[4], y[4], &carry);
+ z[5] = word_sub(x[5], y[5], &carry);
+ z[6] = word_sub(x[6], y[6], &carry);
+ z[7] = word_sub(x[7], y[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Linear Multiplication
+*/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+28],eax //load a
+
+ mov eax,edx //store carry
+ }
+
+#else
+ x[0] = word_madd2(x[0], y, &carry);
+ x[1] = word_madd2(x[1], y, &carry);
+ x[2] = word_madd2(x[2], y, &carry);
+ x[3] = word_madd2(x[3], y, &carry);
+ x[4] = word_madd2(x[4], y, &carry);
+ x[5] = word_madd2(x[5], y, &carry);
+ x[6] = word_madd2(x[6], y, &carry);
+ x[7] = word_madd2(x[7], y, &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Linear Multiplication
+*/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_32_MSVC_ASM)
+
+ __asm {
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi],eax //load a
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+4],eax //load a
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+8],eax //load a
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+12],eax //load a
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+16],eax //load a
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+20],eax //load a
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+24],eax //load a
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [edi+28],eax //load a
+ mov eax,edx //store carry
+ }
+
+#else
+ z[0] = word_madd2(x[0], y, &carry);
+ z[1] = word_madd2(x[1], y, &carry);
+ z[2] = word_madd2(x[2], y, &carry);
+ z[3] = word_madd2(x[3], y, &carry);
+ z[4] = word_madd2(x[4], y, &carry);
+ z[5] = word_madd2(x[5], y, &carry);
+ z[6] = word_madd2(x[6], y, &carry);
+ z[7] = word_madd2(x[7], y, &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Eight Word Block Multiply/Add
+*/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+
+#else
+ z[0] = word_madd3(x[0], y, z[0], &carry);
+ z[1] = word_madd3(x[1], y, z[1], &carry);
+ z[2] = word_madd3(x[2], y, z[2], &carry);
+ z[3] = word_madd3(x[3], y, z[3], &carry);
+ z[4] = word_madd3(x[4], y, z[4], &carry);
+ z[5] = word_madd3(x[5], y, z[5], &carry);
+ z[6] = word_madd3(x[6], y, z[6], &carry);
+ z[7] = word_madd3(x[7], y, z[7], &carry);
+ return carry;
+#endif
+ }
+
+/*
+* Multiply-Add Accumulator
+* (w2,w1,w0) += x * y
+*/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ASM("mull %[y]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ASM("mulq %[y]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#else
+ word carry = *w0;
+ *w0 = word_madd2(x, y, &carry);
+ *w1 += carry;
+ *w2 += (*w1 < carry) ? 1 : 0;
+#endif
+ }
+
+/*
+* Multiply-Add Accumulator
+* (w2,w1,w0) += 2 * x * y
+*/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ASM("mull %[y]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ASM("mulq %[y]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#else
+ word carry = 0;
+ x = word_madd2(x, y, &carry);
+ y = carry;
+
+ word top = (y >> (BOTAN_MP_WORD_BITS-1));
+ y <<= 1;
+ y |= (x >> (BOTAN_MP_WORD_BITS-1));
+ x <<= 1;
+
+ carry = 0;
+ *w0 = word_add(*w0, x, &carry);
+ *w1 = word_add(*w1, y, &carry);
+ *w2 = word_add(*w2, top, &carry);
+#endif
+ }
+
+#if defined(ASM)
+ #undef ASM
+ #undef DO_8_TIMES
+ #undef ADD_OR_SUBTRACT
+ #undef ADDSUB2_OP
+ #undef ADDSUB3_OP
+ #undef LINMUL_OP
+ #undef MULADD_OP
+#endif
+
+}
+
+#endif