From a3366dc421c4ef9f802009a667ddcb8a9cd2c8a6 Mon Sep 17 00:00:00 2001
From: lloyd <lloyd@randombit.net>
Date: Mon, 29 Sep 2008 20:20:49 +0000
Subject: Reorg BigInt asm modules

---
 src/bigint/asm_amd64/mp_monty.S         | 397 --------------------------------
 src/bigint/asm_amd64/mp_mulop.cpp       |  94 --------
 src/bigint/asm_amd64/mp_mulop_amd64.S   | 128 ----------
 src/bigint/asm_amd64/xxxinfo.txt        |  34 ---
 src/bigint/asm_ia32/mp_mulop.S          |  62 -----
 src/bigint/asm_ia32/xxxinfo.txt         |  43 ----
 src/bigint/monty_amd64/mp_monty.S       | 397 ++++++++++++++++++++++++++++++++
 src/bigint/monty_amd64/xxxinfo.txt      |  34 +++
 src/bigint/mulop_amd64/mp_mulop.cpp     |  94 ++++++++
 src/bigint/mulop_amd64/mp_mulop_amd64.S | 128 ++++++++++
 src/bigint/mulop_ia32/mp_mulop.S        |  62 +++++
 src/bigint/mulop_ia32/xxxinfo.txt       |  43 ++++
 12 files changed, 758 insertions(+), 758 deletions(-)
 delete mode 100644 src/bigint/asm_amd64/mp_monty.S
 delete mode 100644 src/bigint/asm_amd64/mp_mulop.cpp
 delete mode 100644 src/bigint/asm_amd64/mp_mulop_amd64.S
 delete mode 100644 src/bigint/asm_amd64/xxxinfo.txt
 delete mode 100644 src/bigint/asm_ia32/mp_mulop.S
 delete mode 100644 src/bigint/asm_ia32/xxxinfo.txt
 create mode 100644 src/bigint/monty_amd64/mp_monty.S
 create mode 100644 src/bigint/monty_amd64/xxxinfo.txt
 create mode 100644 src/bigint/mulop_amd64/mp_mulop.cpp
 create mode 100644 src/bigint/mulop_amd64/mp_mulop_amd64.S
 create mode 100644 src/bigint/mulop_ia32/mp_mulop.S
 create mode 100644 src/bigint/mulop_ia32/xxxinfo.txt

(limited to 'src')

diff --git a/src/bigint/asm_amd64/mp_monty.S b/src/bigint/asm_amd64/mp_monty.S
deleted file mode 100644
index 3dd4040bc..000000000
--- a/src/bigint/asm_amd64/mp_monty.S
+++ /dev/null
@@ -1,397 +0,0 @@
-/*************************************************
-* Montgomery Reduction Source File               *
-* (C) 2008 Jack Lloyd                            *
-*************************************************/
-
-#include <botan/asm_macr.h>
-
-START_LISTING(mp_monty.S)
-
-START_FUNCTION(bigint_monty_redc)
-	pushq	%r15	#
-	pushq	%r14	#
-	pushq	%r13	#
-	pushq	%r12	#
-	pushq	%rbp	#
-	pushq	%rbx	#
-
-        movq	%rdi, %r14	# z
-	movq	%rdx, %r12	# x
-	movl	%esi, %ebp	# z_size
-
-	xorl	%esi, %esi	# j.76
-	movq	%r8, -16(%rsp)	# u, u
-	movl	%ecx, %ebx	# x_size, x_size
-	movl	%ecx, %r8d	# x_size, blocks_of_8
-	andl	$-8, %r8d	#, blocks_of_8
-	testl	%ecx, %ecx	# x_size
-	je	.L3	#,
-	mov	%ecx, %eax	# x_size, pretmp.71
-	leal	1(%rbx), %r15d	#, k.73
-	salq	$3, %rax	#,
-	xorl	%r13d, %r13d	# j
-	movq	%rax, -8(%rsp)	#, pretmp.21
-	.p2align 4,,10
-	.p2align 3
-.L11:
-	mov	%r13d, %eax	# j, j
-	movq	-16(%rsp), %rdi	# u, y
-	leaq	(%r14,%rax,8), %r11	#, z_j
-	xorl	%r9d, %r9d	# i
-	imulq	(%r11), %rdi	#* z_j, y
-	xorl	%r10d, %r10d	# carry
-	testl	%r8d, %r8d	# blocks_of_8
-	je	.L7	#,
-	.p2align 4,,10
-	.p2align 3
-.LOOP_MUL_ADD:
-	mov	%r9d, %ecx	# i, i
-	addl	$8, %r9d	#, i
-	salq	$3, %rcx	#, D.2315
-	leaq	(%r11,%rcx), %rsi	#, tmp130
-	leaq	(%r12,%rcx), %rcx	#, tmp131
-
-	movq 8*0(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*0(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*0 (%rsi)
-
-        movq 8*1(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*1(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*1 (%rsi)
-
-        movq 8*2(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*2(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*2 (%rsi)
-
-        movq 8*3(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*3(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*3 (%rsi)
-
-        movq 8*4(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*4(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*4 (%rsi)
-
-        movq 8*5(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*5(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*5 (%rsi)
-
-        movq 8*6(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*6(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*6 (%rsi)
-
-        movq 8*7(%rcx), %rax
-	mulq %rdi	# y
-	addq %r10, %rax	# carry
-	adcq $0,%rdx
-	addq 8*7(%rsi), %rax
-	adcq $0,%rdx
-	movq %rdx,%r10	# carry
-	movq %rax, 8*7 (%rsi)
-
-	cmpl	%r9d, %r8d	# i, blocks_of_8
-	jne	.LOOP_MUL_ADD	#,
-	cmpl	%r8d, %ebx	# blocks_of_8, x_size
-	je	.L8	#,
-.L7:
-	movl	%r8d, %esi	# blocks_of_8, i
-	.p2align 4,,10
-	.p2align 3
-.L5:
-	mov	%esi, %eax	# i, i
-	movq	%rdi, %rcx	# y, b
-	leaq	(%r11, %rax,8), %r9	#, D.2325
-	incl	%esi	# i
-	movq	(%r12, %rax,8), %rax	#* x, tmp133
-
-        mulq %rcx	# b
-	addq (%r9), %rax	#* D.2325, a
-	adcq $0,%rdx	#
-	addq %r10, %rax	# carry, a
-	adcq $0,%rdx	#
-
-	cmpl	%esi, %ebx	# i, x_size
-	movq	%rdx, %r10	#, carry
-	movq	%rax, (%r9)	# a,* D.2325
-	jne	.L5	#,
-.L8:
-	movq	-8(%rsp), %rdx	# pretmp.21,
-	leaq	(%r11,%rdx), %rax	#, D.2332
-	movq	(%rax), %rcx	#* D.2332, D.2333
-	leaq	(%r10,%rcx), %rdx	#, z_sum
-	movq	%rdx, (%rax)	# z_sum,* D.2332
-	cmpq	%rdx, %rcx	# z_sum, D.2333
-	jbe	.L9	#,
-	cmpl	%ebp, %r15d	# z_size, k.73
-	je	.L9	#,
-	movl	%r15d, %ecx	# k.73, k
-	jmp	.L10	#
-	.p2align 4,,10
-	.p2align 3
-.L31:
-	incl	%ecx	# k
-	cmpl	%ecx, %ebp	# k, z_size
-	.p2align 4,,4
-	.p2align 3
-	je	.L9	#,
-.L10:
-	mov	%ecx, %edx	# k, k
-	leaq	(%r11,%rdx,8), %rdx	#, D.2342
-	movq	(%rdx), %rax	#* D.2342, tmp136
-	incq	%rax	# D.2344
-	movq	%rax, (%rdx)	# D.2344,* D.2342
-	testq	%rax, %rax	# D.2344
-	je	.L31	#,
-.L9:
-	incl	%r13d	# j
-	decl	%ebp	# z_size
-	cmpl	%r13d, %ebx	# j, x_size
-	jne	.L11	#,
-	movl	%ebx, %esi	# x_size, j.76
-.L3:
-	leal	(%rbx,%rbx), %eax	#, tmp137
-	mov	%eax, %eax
-	leaq	(%r14, %rax,8), %rdi	#, D.2349
-	cmpq	$0, (%rdi)	#,* D.2349
-	jne	.L12	#,
-	testl	%ebx, %ebx	# x_size
-	je	.L12	#,
-	leal	-1(%rbx), %ecx	#, j
-	leal	(%rsi,%rcx), %edx	#, tmp141
-	mov	%ecx, %eax	# j, j
-	movq	(%r14,%rdx,8), %rbp	#* z,
-	cmpq	%rbp, (%r12, %rax,8)	#,* x
-	jb	.L12	#,
-	ja	.L_EXIT	#,
-	leal	-2(%rsi,%rbx), %edx	#, ivtmp.45
-	jmp	.L14	#
-	.p2align 4,,10
-	.p2align 3
-.L15:
-	mov	%edx, %eax	# ivtmp.45, ivtmp.45
-	decl	%ecx	# j
-	movq	(%r14, %rax,8), %rsi	#* z, D.2360
-	mov	%ecx, %eax	# j, j
-	movq	(%r12, %rax,8), %rax	#* x, temp.68
-	cmpq	%rax, %rsi
-	ja	.L12	#,
-	decl	%edx	# ivtmp.45
-	cmpq	%rax, %rsi
-	jb	.L_EXIT	#,
-.L14:
-	testl	%ecx, %ecx	# j
-	jne	.L15	#,
-.L12:
-	xorl	%ecx, %ecx	# j
-	xorl	%r10d, %r10d	# carry
-	mov	%ebx, %esi	# x_size, pretmp.19
-	testl	%r8d, %r8d	# blocks_of_8
-	je	.L17	#,
-	.p2align 4,,10
-	.p2align 3
-.L22:
-	mov	%ecx, %edx	# j, D.2375
-	addl	$8, %ecx	#, j
-	leaq	(%rdx,%rsi), %rax	#, tmp146
-	leaq	(%r12,%rdx,8), %rdx	#, tmp150
-	leaq	(%r14, %rax,8), %rax	#, tmp148
-
-	rorq %r10	# carry
-
-        movq 8*0(%rdx), %r10
-	sbbq %r10, 8*0(%rax)
-
-        movq 8*1(%rdx), %r10
-	sbbq %r10, 8*1(%rax)
-
-        movq 8*2(%rdx), %r10
-	sbbq %r10, 8*2(%rax)
-
-        movq 8*3(%rdx), %r10
-	sbbq %r10, 8*3(%rax)
-
-        movq 8*4(%rdx), %r10
-	sbbq %r10, 8*4(%rax)
-
-        movq 8*5(%rdx), %r10
-	sbbq %r10, 8*5(%rax)
-
-        movq 8*6(%rdx), %r10
-	sbbq %r10, 8*6(%rax)
-
-        movq 8*7(%rdx), %r10
-	sbbq %r10, 8*7(%rax)
-
-        sbbq %r10,%r10	# carry
-	negq %r10	# carry
-
-	cmpl	%ecx, %r8d	# j, blocks_of_8
-	jne	.L22	#,
-.L17:
-	cmpl	%r8d, %ebx	# blocks_of_8, x_size
-	je	.L19	#,
-	leal	(%r8,%rbx), %r9d	#, ivtmp.33
-	movl	%r8d, %esi	# blocks_of_8, j
-	.p2align 4,,10
-	.p2align 3
-.L20:
-	mov	%r9d, %eax	# ivtmp.33, ivtmp.33
-	mov	%esi, %ecx	# j, j
-	leaq	(%r14, %rax,8), %rax	#, D.2387
-	incl	%esi	# j
-	movq	(%rax), %rdx	#* D.2387, tmp153
-	incl	%r9d	# ivtmp.33
-
-	rorq %r10	# carry
-	sbbq (%r12,%rcx,8),%rdx	#* x, x
-	sbbq %r10,%r10	# carry
-	negq %r10	# carry
-
-	cmpl	%esi, %ebx	# j, x_size
-	movq	%rdx, (%rax)	# x,* D.2387
-	jne	.L20	#,
-.L19:
-	testq	%r10, %r10	# carry
-	je	.L_EXIT	#,
-	decq	(%rdi)	#* D.2349
-.L_EXIT:
-	popq	%rbx	#
-	popq	%rbp	#
-	popq	%r12	#
-	popq	%r13	#
-	popq	%r14	#
-	popq	%r15	#
-END_FUNCTION(bigint_monty_redc)
-
-
-#if 0
-   #define Z_ARR    ARG_1 // rdi
-#define Z_SIZE   ARG_2_32 // esi
-// X_ARR is ARG_3 == rdx, moved b/c needed for multiply
-#define X_SIZE   ARG_4_32 // ecx
-#define U        ARG_5 // r8
-
-/*
-     We need all arguments for a while (we can reuse U eventually)
-   So only temp registers are
-     TEMP_1 %r10
-     TEMP_2 %r11
-     TEMP_3 = ARG_6 = %r9
-   void return, so also
-     R0 %rax (aka TEMP_9)
-   is free (but needed for multiply)
-
-   Can push:
-     %rbx (base pointer, callee saved)
-     %rpb (frame pointer, callee saved)
-     %r12-%r15 (callee saved)
-
-  Can push base/frame pointers since this is a leaf function
-  and does not reference any data.
-*/
-
-   push %r12
-   push %r13
-   push %r14
-   push %r15
-
-#define LOOP_CTR_I %r12
-#define LOOP_CTR_J %r13
-
-#define CARRY    TEMP_1
-#define Z_WORD   TEMP_2
-#define X_ARR    TEMP_3
-#define MUL_LO   %rax
-#define MUL_HI   %rdx
-
-   ASSIGN(X_ARR, ARG_3)
-
-   /*
-   ZEROIZE(CARRY)
-
-   ASSIGN(LOOP_CTR, X_SIZE)
-
-   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
-   JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
-
-#define MULADD_OP(N)                  \
-   ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
-   ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
-   MUL(Y)                           ; \
-   ADD(Z_WORD, CARRY)               ; \
-   ASSIGN(CARRY, MUL_HI)            ; \
-   ADD_LAST_CARRY(CARRY)            ; \
-   ADD(Z_WORD, MUL_LO)              ; \
-   ADD_LAST_CARRY(CARRY)            ; \
-   ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
-
-ALIGN
-.LOOP_MULADD8:
-   MULADD_OP(0)
-   MULADD_OP(1)
-   MULADD_OP(2)
-   MULADD_OP(3)
-   MULADD_OP(4)
-   MULADD_OP(5)
-   MULADD_OP(6)
-   MULADD_OP(7)
-
-   SUB_IMM(LOOP_CTR, 8)
-   ADD_IMM(Z_ARR, 64)
-   ADD_IMM(X_ARR, 64)
-   cmp IMM(8), LOOP_CTR
-   jge .LOOP_MULADD8
-
-   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
-
-ALIGN
-.LOOP_MULADD1:
-   MULADD_OP(0)
-
-   SUB_IMM(LOOP_CTR, 1)
-   ADD_IMM(Z_ARR, 8)
-   ADD_IMM(X_ARR, 8)
-
-   cmp IMM(0), LOOP_CTR
-   jne .LOOP_MULADD1
-*/
-
-   pop %r15
-   pop %r14
-   pop %r13
-   pop %r12
-#endif
diff --git a/src/bigint/asm_amd64/mp_mulop.cpp b/src/bigint/asm_amd64/mp_mulop.cpp
deleted file mode 100644
index d1aa51489..000000000
--- a/src/bigint/asm_amd64/mp_mulop.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*************************************************
-* Simple O(N^2) Multiplication and Squaring      *
-* (C) 1999-2008 Jack Lloyd                       *
-*************************************************/
-
-#include <botan/mp_asm.h>
-#include <botan/mp_asmi.h>
-#include <botan/mp_core.h>
-#include <botan/mem_ops.h>
-
-namespace Botan {
-
-extern "C" {
-
-/*************************************************
-* Simple O(N^2) Multiplication                   *
-*************************************************/
-void bigint_simple_mul(word z[], const word x[], u32bit x_size,
-                                 const word y[], u32bit y_size)
-   {
-   const u32bit blocks = x_size - (x_size % 8);
-
-   clear_mem(z, x_size + y_size);
-
-   for(u32bit i = 0; i != y_size; ++i)
-      {
-      word carry = 0;
-
-      for(u32bit j = 0; j != blocks; j += 8)
-         carry = word8_madd3(z + i + j, x + j, y[i], carry);
-
-      for(u32bit j = blocks; j != x_size; ++j)
-         z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry);
-
-      z[x_size+i] = carry;
-      }
-   }
-
-inline word word_sqr(word x,
-
-/*************************************************
-* Simple O(N^2) Squaring
-
-This is exactly the same algorithm as bigint_simple_mul,
-however because C/C++ compilers suck at alias analysis it
-is good to have the version where the compiler knows
-that x == y
-*************************************************/
-void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
-   {
-   clear_mem(z, 2*x_size);
-
-   for(u32bit i = 0; i != x_size; ++i)
-      {
-      const word x_i = x[i];
-
-      word carry = z[2*i];
-      z[2*i] = word_madd2(x_i, x_i, z[2*i], &carry);
-
-      for(u32bit j = i; j != x_size; ++j)
-         {
-         // z[i+j] = z[i+j] + 2 * x[j] * x_i + carry;
-
-         /*
-         load z[i+j] into register
-         load x[j] into %hi
-         mulq %[x_i] -> x[i] * x[j] -> %lo:%hi
-         shlq %lo, $1
-
-         // put carry bit (cf) from %lo into %temp
-         xorl %temp
-         adcq $0, %temp
-
-         // high bit of lo now in cf
-         shl %hi, $1
-         // add in lowest bid from %lo
-         orl %temp, %hi
-
-         addq %[c], %[lo]
-         adcq $0, %[hi]
-         addq %[z_ij], %[lo]
-         adcq $0, %[hi]
-
-         */
-
-         }
-
-      z[x_size+i] = carry;
-      }
-   }
-
-}
-
-}
diff --git a/src/bigint/asm_amd64/mp_mulop_amd64.S b/src/bigint/asm_amd64/mp_mulop_amd64.S
deleted file mode 100644
index e5bba23fb..000000000
--- a/src/bigint/asm_amd64/mp_mulop_amd64.S
+++ /dev/null
@@ -1,128 +0,0 @@
-/*************************************************
-* Simple O(N^2) Multiplication and Squaring      *
-* (C) 1999-2008 Jack Lloyd                       *
-*************************************************/
-
-#include <botan/asm_macr.h>
-
-START_LISTING(mp_mulop.S)
-
-#if 0
-void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
-   {
-   const u32bit blocks = x_size - (x_size % 8);
-
-   clear_mem(z, 2*x_size);
-
-   for(u32bit i = 0; i != x_size; ++i)
-      {
-      word carry = 0;
-
-      /*
-      for(u32bit j = 0; j != blocks; j += 8)
-         carry = word8_madd3(z + i + j, x + j, x[i], carry);
-
-      for(u32bit j = blocks; j != x_size; ++j)
-         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
-      */
-
-
-      for(u32bit j = 0; j != x_size; ++j)
-         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
-
-      for(u32bit j = 0; j != x_size; ++j)
-         {
-         dword z = (dword)a * b + c + *d;
-         *d = (word)(z >> BOTAN_MP_WORD_BITS);
-         return (word)z;
-         }
-
-   
-   
-         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
-
-   }
-
-   
-
-      z[x_size+i] = carry;
-      }
-   }
-
-#endif
-
-START_FUNCTION(bigint_simple_sqr)
-
-#define Z_ARR    ARG_1
-#define X_ARR    ARG_2
-//#define X_SIZE   ARG_3_32
-
-#define CARRY    TEMP_1
-#define Z_WORD   TEMP_2
-#define LOOP_I   TEMP_3
-#define LOOP_J   TEMP_4
-#define X_SIZE   TEMP_5
-#define MUL_LO   %rax
-// arg 3, xsize
-#define MUL_HI   %rdx
-   
-// need arg3 == rdx for multiply
-   ASSIGN(X_SIZE, ARG3_32)
-
-   ZEROIZE(CARRY)
-
-   ZEROIZE(LOOP_I)
-
-.LOOP_ZEROIZE_Z:
-
-   cmp LOOP_I, X_SIZE
-
-
-
-   
-   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
-   JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
-
-#define MULADD_OP(N)                  \
-   ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
-   ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
-   MUL(Y)                           ; \
-   ADD(Z_WORD, CARRY)               ; \
-   ASSIGN(CARRY, MUL_HI)            ; \
-   ADD_LAST_CARRY(CARRY)            ; \
-   ADD(Z_WORD, MUL_LO)              ; \
-   ADD_LAST_CARRY(CARRY)            ; \
-   ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
-
-.LOOP_MULADD8:
-   MULADD_OP(0)
-   MULADD_OP(1)
-   MULADD_OP(2)
-   MULADD_OP(3)
-   MULADD_OP(4)
-   MULADD_OP(5)
-   MULADD_OP(6)
-   MULADD_OP(7)
-
-   SUB_IMM(LOOP_CTR, 8)
-   ADD_IMM(Z_ARR, 64)
-   ADD_IMM(X_ARR, 64)
-   cmp IMM(8), LOOP_CTR
-   jge .LOOP_MULADD8
-
-   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
-
-ALIGN
-.LOOP_MULADD1:
-   MULADD_OP(0)
-
-   SUB_IMM(LOOP_CTR, 1)
-   ADD_IMM(Z_ARR, 8)
-   ADD_IMM(X_ARR, 8)
-
-   cmp IMM(0), LOOP_CTR
-   jne .LOOP_MULADD1
-
-.L_MULADD_DONE:
-   RETURN_VALUE_IS(CARRY)
-END_FUNCTION(bigint_simple_square)
diff --git a/src/bigint/asm_amd64/xxxinfo.txt b/src/bigint/asm_amd64/xxxinfo.txt
deleted file mode 100644
index 2a8f9fe5b..000000000
--- a/src/bigint/asm_amd64/xxxinfo.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-realname "x86-64 Assembler"
-
-mp_bits 64
-
-load_on request
-
-<ignore>
-#mp_mulop.cpp
-#mp_monty.cpp
-</ignore>
-
-<add>
-asm_macr.h
-#mp_mulop_amd64.S
-#mp_monty.S
-</add>
-
-<arch>
-amd64
-</arch>
-
-<cc>
-gcc
-icc
-</cc>
-
-# ELF systems
-<os>
-linux
-freebsd
-netbsd
-openbsd
-solaris
-</os>
diff --git a/src/bigint/asm_ia32/mp_mulop.S b/src/bigint/asm_ia32/mp_mulop.S
deleted file mode 100644
index a5f0d3b27..000000000
--- a/src/bigint/asm_ia32/mp_mulop.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*************************************************
-* Multiply/Add Algorithm Source File             *
-* (C) 1999-2007 Jack Lloyd                       *
-*************************************************/
-
-#include <botan/asm_macr.h>
-
-START_LISTING(mp_muladd.S)
-
-START_FUNCTION(bigint_mul_add_words)
-   SPILL_REGS()
-#define PUSHED 4
-
-#define LOOP_CTR ESI
-   ASSIGN(LOOP_CTR, ARG(3)) /* x_size */
-   ZEROIZE(EDI)
-
-   ASSIGN(ECX, ARG(1)) /* z[] */
-   ASSIGN(EBX, ARG(2)) /* x[] */
-   ASSIGN(EBP, ARG(4)) /* y */
-
-#define MULADD_OP(N)                       \
-   ASSIGN(EAX, ARRAY4(EBX, N))           ; \
-   MUL(EBP)                              ; \
-   ADD_W_CARRY(EAX, EDX, EDI)            ; \
-   ASSIGN(EDI, EDX)                      ; \
-   ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ;
-
-   JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
-   JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP)
-
-START_LOOP(.MULADD8)
-   MULADD_OP(0)
-   MULADD_OP(1)
-   MULADD_OP(2)
-   MULADD_OP(3)
-   MULADD_OP(4)
-   MULADD_OP(5)
-   MULADD_OP(6)
-   MULADD_OP(7)
-
-   SUB_IMM(LOOP_CTR, 8)
-   ADD_IMM(EBX, 32)
-   ADD_IMM(ECX, 32)
-LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8)
-
-   JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
-
-START_LOOP(.MULADD1)
-   MULADD_OP(0)
-
-   SUB_IMM(LOOP_CTR, 1)
-   ADD_IMM(EBX, 4)
-   ADD_IMM(ECX, 4)
-LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1)
-
-.MUL_ADD_DONE:
-
-   ASSIGN(EAX, EDI)
-#undef PUSHED
-   RESTORE_REGS()
-END_FUNCTION(bigint_mul_add_words)
diff --git a/src/bigint/asm_ia32/xxxinfo.txt b/src/bigint/asm_ia32/xxxinfo.txt
deleted file mode 100644
index 12c8cd96d..000000000
--- a/src/bigint/asm_ia32/xxxinfo.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-realname "x86 Assembler"
-
-#mp_bits 32
-
-load_on asm_ok
-
-<replace>
-md4.cpp
-md5.cpp
-sha160.cpp
-serpent.cpp
-</replace>
-
-<ignore>
-#mp_mulop.cpp
-</ignore>
-
-<add>
-asm_macr.h
-md4core.S
-md5core.S
-sha1_asm.S
-serp_asm.S
-#mp_mulop.S
-</add>
-
-<arch>
-ia32
-</arch>
-
-<cc>
-gcc
-icc
-</cc>
-
-# ELF systems
-<os>
-linux
-freebsd
-netbsd
-openbsd
-solaris
-</os>
diff --git a/src/bigint/monty_amd64/mp_monty.S b/src/bigint/monty_amd64/mp_monty.S
new file mode 100644
index 000000000..3dd4040bc
--- /dev/null
+++ b/src/bigint/monty_amd64/mp_monty.S
@@ -0,0 +1,397 @@
+/*************************************************
+* Montgomery Reduction Source File               *
+* (C) 2008 Jack Lloyd                            *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_monty.S)
+
+START_FUNCTION(bigint_monty_redc)
+	pushq	%r15	#
+	pushq	%r14	#
+	pushq	%r13	#
+	pushq	%r12	#
+	pushq	%rbp	#
+	pushq	%rbx	#
+
+        movq	%rdi, %r14	# z
+	movq	%rdx, %r12	# x
+	movl	%esi, %ebp	# z_size
+
+	xorl	%esi, %esi	# j.76
+	movq	%r8, -16(%rsp)	# u, u
+	movl	%ecx, %ebx	# x_size, x_size
+	movl	%ecx, %r8d	# x_size, blocks_of_8
+	andl	$-8, %r8d	#, blocks_of_8
+	testl	%ecx, %ecx	# x_size
+	je	.L3	#,
+	mov	%ecx, %eax	# x_size, pretmp.71
+	leal	1(%rbx), %r15d	#, k.73
+	salq	$3, %rax	#,
+	xorl	%r13d, %r13d	# j
+	movq	%rax, -8(%rsp)	#, pretmp.21
+	.p2align 4,,10
+	.p2align 3
+.L11:
+	mov	%r13d, %eax	# j, j
+	movq	-16(%rsp), %rdi	# u, y
+	leaq	(%r14,%rax,8), %r11	#, z_j
+	xorl	%r9d, %r9d	# i
+	imulq	(%r11), %rdi	#* z_j, y
+	xorl	%r10d, %r10d	# carry
+	testl	%r8d, %r8d	# blocks_of_8
+	je	.L7	#,
+	.p2align 4,,10
+	.p2align 3
+.LOOP_MUL_ADD:
+	mov	%r9d, %ecx	# i, i
+	addl	$8, %r9d	#, i
+	salq	$3, %rcx	#, D.2315
+	leaq	(%r11,%rcx), %rsi	#, tmp130
+	leaq	(%r12,%rcx), %rcx	#, tmp131
+
+	movq 8*0(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*0(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*0 (%rsi)
+
+        movq 8*1(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*1(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*1 (%rsi)
+
+        movq 8*2(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*2(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*2 (%rsi)
+
+        movq 8*3(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*3(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*3 (%rsi)
+
+        movq 8*4(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*4(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*4 (%rsi)
+
+        movq 8*5(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*5(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*5 (%rsi)
+
+        movq 8*6(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*6(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*6 (%rsi)
+
+        movq 8*7(%rcx), %rax
+	mulq %rdi	# y
+	addq %r10, %rax	# carry
+	adcq $0,%rdx
+	addq 8*7(%rsi), %rax
+	adcq $0,%rdx
+	movq %rdx,%r10	# carry
+	movq %rax, 8*7 (%rsi)
+
+	cmpl	%r9d, %r8d	# i, blocks_of_8
+	jne	.LOOP_MUL_ADD	#,
+	cmpl	%r8d, %ebx	# blocks_of_8, x_size
+	je	.L8	#,
+.L7:
+	movl	%r8d, %esi	# blocks_of_8, i
+	.p2align 4,,10
+	.p2align 3
+.L5:
+	mov	%esi, %eax	# i, i
+	movq	%rdi, %rcx	# y, b
+	leaq	(%r11, %rax,8), %r9	#, D.2325
+	incl	%esi	# i
+	movq	(%r12, %rax,8), %rax	#* x, tmp133
+
+        mulq %rcx	# b
+	addq (%r9), %rax	#* D.2325, a
+	adcq $0,%rdx	#
+	addq %r10, %rax	# carry, a
+	adcq $0,%rdx	#
+
+	cmpl	%esi, %ebx	# i, x_size
+	movq	%rdx, %r10	#, carry
+	movq	%rax, (%r9)	# a,* D.2325
+	jne	.L5	#,
+.L8:
+	movq	-8(%rsp), %rdx	# pretmp.21,
+	leaq	(%r11,%rdx), %rax	#, D.2332
+	movq	(%rax), %rcx	#* D.2332, D.2333
+	leaq	(%r10,%rcx), %rdx	#, z_sum
+	movq	%rdx, (%rax)	# z_sum,* D.2332
+	cmpq	%rdx, %rcx	# z_sum, D.2333
+	jbe	.L9	#,
+	cmpl	%ebp, %r15d	# z_size, k.73
+	je	.L9	#,
+	movl	%r15d, %ecx	# k.73, k
+	jmp	.L10	#
+	.p2align 4,,10
+	.p2align 3
+.L31:
+	incl	%ecx	# k
+	cmpl	%ecx, %ebp	# k, z_size
+	.p2align 4,,4
+	.p2align 3
+	je	.L9	#,
+.L10:
+	mov	%ecx, %edx	# k, k
+	leaq	(%r11,%rdx,8), %rdx	#, D.2342
+	movq	(%rdx), %rax	#* D.2342, tmp136
+	incq	%rax	# D.2344
+	movq	%rax, (%rdx)	# D.2344,* D.2342
+	testq	%rax, %rax	# D.2344
+	je	.L31	#,
+.L9:
+	incl	%r13d	# j
+	decl	%ebp	# z_size
+	cmpl	%r13d, %ebx	# j, x_size
+	jne	.L11	#,
+	movl	%ebx, %esi	# x_size, j.76
+.L3:
+	leal	(%rbx,%rbx), %eax	#, tmp137
+	mov	%eax, %eax
+	leaq	(%r14, %rax,8), %rdi	#, D.2349
+	cmpq	$0, (%rdi)	#,* D.2349
+	jne	.L12	#,
+	testl	%ebx, %ebx	# x_size
+	je	.L12	#,
+	leal	-1(%rbx), %ecx	#, j
+	leal	(%rsi,%rcx), %edx	#, tmp141
+	mov	%ecx, %eax	# j, j
+	movq	(%r14,%rdx,8), %rbp	#* z,
+	cmpq	%rbp, (%r12, %rax,8)	#,* x
+	jb	.L12	#,
+	ja	.L_EXIT	#,
+	leal	-2(%rsi,%rbx), %edx	#, ivtmp.45
+	jmp	.L14	#
+	.p2align 4,,10
+	.p2align 3
+.L15:
+	mov	%edx, %eax	# ivtmp.45, ivtmp.45
+	decl	%ecx	# j
+	movq	(%r14, %rax,8), %rsi	#* z, D.2360
+	mov	%ecx, %eax	# j, j
+	movq	(%r12, %rax,8), %rax	#* x, temp.68
+	cmpq	%rax, %rsi
+	ja	.L12	#,
+	decl	%edx	# ivtmp.45
+	cmpq	%rax, %rsi
+	jb	.L_EXIT	#,
+.L14:
+	testl	%ecx, %ecx	# j
+	jne	.L15	#,
+.L12:
+	xorl	%ecx, %ecx	# j
+	xorl	%r10d, %r10d	# carry
+	mov	%ebx, %esi	# x_size, pretmp.19
+	testl	%r8d, %r8d	# blocks_of_8
+	je	.L17	#,
+	.p2align 4,,10
+	.p2align 3
+.L22:
+	mov	%ecx, %edx	# j, D.2375
+	addl	$8, %ecx	#, j
+	leaq	(%rdx,%rsi), %rax	#, tmp146
+	leaq	(%r12,%rdx,8), %rdx	#, tmp150
+	leaq	(%r14, %rax,8), %rax	#, tmp148
+
+	rorq %r10	# carry
+
+        movq 8*0(%rdx), %r10
+	sbbq %r10, 8*0(%rax)
+
+        movq 8*1(%rdx), %r10
+	sbbq %r10, 8*1(%rax)
+
+        movq 8*2(%rdx), %r10
+	sbbq %r10, 8*2(%rax)
+
+        movq 8*3(%rdx), %r10
+	sbbq %r10, 8*3(%rax)
+
+        movq 8*4(%rdx), %r10
+	sbbq %r10, 8*4(%rax)
+
+        movq 8*5(%rdx), %r10
+	sbbq %r10, 8*5(%rax)
+
+        movq 8*6(%rdx), %r10
+	sbbq %r10, 8*6(%rax)
+
+        movq 8*7(%rdx), %r10
+	sbbq %r10, 8*7(%rax)
+
+        sbbq %r10,%r10	# carry
+	negq %r10	# carry
+
+	cmpl	%ecx, %r8d	# j, blocks_of_8
+	jne	.L22	#,
+.L17:
+	cmpl	%r8d, %ebx	# blocks_of_8, x_size
+	je	.L19	#,
+	leal	(%r8,%rbx), %r9d	#, ivtmp.33
+	movl	%r8d, %esi	# blocks_of_8, j
+	.p2align 4,,10
+	.p2align 3
+.L20:
+	mov	%r9d, %eax	# ivtmp.33, ivtmp.33
+	mov	%esi, %ecx	# j, j
+	leaq	(%r14, %rax,8), %rax	#, D.2387
+	incl	%esi	# j
+	movq	(%rax), %rdx	#* D.2387, tmp153
+	incl	%r9d	# ivtmp.33
+
+	rorq %r10	# carry
+	sbbq (%r12,%rcx,8),%rdx	#* x, x
+	sbbq %r10,%r10	# carry
+	negq %r10	# carry
+
+	cmpl	%esi, %ebx	# j, x_size
+	movq	%rdx, (%rax)	# x,* D.2387
+	jne	.L20	#,
+.L19:
+	testq	%r10, %r10	# carry
+	je	.L_EXIT	#,
+	decq	(%rdi)	#* D.2349
+.L_EXIT:
+	popq	%rbx	#
+	popq	%rbp	#
+	popq	%r12	#
+	popq	%r13	#
+	popq	%r14	#
+	popq	%r15	#
+END_FUNCTION(bigint_monty_redc)
+
+
+#if 0
+   #define Z_ARR    ARG_1 // rdi
+#define Z_SIZE   ARG_2_32 // esi
+// X_ARR is ARG_3 == rdx, moved b/c needed for multiply
+#define X_SIZE   ARG_4_32 // ecx
+#define U        ARG_5 // r8
+
+/*
+     We need all arguments for a while (we can reuse U eventually)
+   So only temp registers are
+     TEMP_1 %r10
+     TEMP_2 %r11
+     TEMP_3 = ARG_6 = %r9
+   void return, so also
+     R0 %rax (aka TEMP_9)
+   is free (but needed for multiply)
+
+   Can push:
+     %rbx (base pointer, callee saved)
+     %rbp (frame pointer, callee saved)
+     %r12-%r15 (callee saved)
+
+  Can push base/frame pointers since this is a leaf function
+  and does not reference any data.
+*/
+
+   push %r12
+   push %r13
+   push %r14
+   push %r15
+
+#define LOOP_CTR_I %r12
+#define LOOP_CTR_J %r13
+
+#define CARRY    TEMP_1
+#define Z_WORD   TEMP_2
+#define X_ARR    TEMP_3
+#define MUL_LO   %rax
+#define MUL_HI   %rdx
+
+   ASSIGN(X_ARR, ARG_3)
+
+   /*
+   ZEROIZE(CARRY)
+
+   ASSIGN(LOOP_CTR, X_SIZE)
+
+   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+   JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N)                  \
+   ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+   ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+   MUL(Y)                           ; \
+   ADD(Z_WORD, CARRY)               ; \
+   ASSIGN(CARRY, MUL_HI)            ; \
+   ADD_LAST_CARRY(CARRY)            ; \
+   ADD(Z_WORD, MUL_LO)              ; \
+   ADD_LAST_CARRY(CARRY)            ; \
+   ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+ALIGN
+.LOOP_MULADD8:
+   MULADD_OP(0)
+   MULADD_OP(1)
+   MULADD_OP(2)
+   MULADD_OP(3)
+   MULADD_OP(4)
+   MULADD_OP(5)
+   MULADD_OP(6)
+   MULADD_OP(7)
+
+   SUB_IMM(LOOP_CTR, 8)
+   ADD_IMM(Z_ARR, 64)
+   ADD_IMM(X_ARR, 64)
+   cmp IMM(8), LOOP_CTR
+   jge .LOOP_MULADD8
+
+   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+   MULADD_OP(0)
+
+   SUB_IMM(LOOP_CTR, 1)
+   ADD_IMM(Z_ARR, 8)
+   ADD_IMM(X_ARR, 8)
+
+   cmp IMM(0), LOOP_CTR
+   jne .LOOP_MULADD1
+*/
+
+   pop %r15
+   pop %r14
+   pop %r13
+   pop %r12
+#endif
diff --git a/src/bigint/monty_amd64/xxxinfo.txt b/src/bigint/monty_amd64/xxxinfo.txt
new file mode 100644
index 000000000..2a8f9fe5b
--- /dev/null
+++ b/src/bigint/monty_amd64/xxxinfo.txt
@@ -0,0 +1,34 @@
+realname "x86-64 Assembler"
+
+mp_bits 64
+
+load_on request
+
+<ignore>
+#mp_mulop.cpp
+#mp_monty.cpp
+</ignore>
+
+<add>
+asm_macr.h
+#mp_mulop_amd64.S
+#mp_monty.S
+</add>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
diff --git a/src/bigint/mulop_amd64/mp_mulop.cpp b/src/bigint/mulop_amd64/mp_mulop.cpp
new file mode 100644
index 000000000..d1aa51489
--- /dev/null
+++ b/src/bigint/mulop_amd64/mp_mulop.cpp
@@ -0,0 +1,94 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring      *
+* (C) 1999-2008 Jack Lloyd                       *
+*************************************************/
+
+#include <botan/mp_asm.h>
+#include <botan/mp_asmi.h>
+#include <botan/mp_core.h>
+#include <botan/mem_ops.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Simple O(N^2) Multiplication                   *
+*************************************************/
+void bigint_simple_mul(word z[], const word x[], u32bit x_size,
+                                 const word y[], u32bit y_size)
+   {
+   // largest multiple of 8 that does not exceed x_size
+   const u32bit unrolled_len = x_size - (x_size % 8);
+   clear_mem(z, x_size + y_size);
+
+   for(u32bit row = 0; row != y_size; ++row)
+      {
+      word* z_row = z + row; // output window for this word of y
+      word carry = 0;
+      u32bit col = 0;
+
+      for(; col != unrolled_len; col += 8) // eight words per call
+         carry = word8_madd3(z_row + col, x + col, y[row], carry);
+      for(; col != x_size; ++col)          // leftover words, one at a time
+         z_row[col] = word_madd3(x[col], y[row], z_row[col], &carry);
+      z_row[x_size] = carry;
+      }
+   }
+
+/*************************************************
+* Simple O(N^2) Squaring
+*
+* Writes the 2*x_size word square of x into z. Same
+* algorithm as bigint_simple_mul; a separate function
+* lets the compiler know both operands are the same
+* array (x == y).
+*************************************************/
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      const word x_i = x[i];
+      word carry = 0;
+
+      // z[i .. i+blocks) += x * x_i, eight words per call
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x_i, carry);
+
+      // remaining (x_size % 8) words, one at a time
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x_i, z[i+j], &carry);
+
+      // no earlier row has written z[x_size+i], so a plain store is enough
+      z[x_size+i] = carry;
+      }
+   }
+
+/*
+Unfinished idea for a faster inner loop: compute each x[i]*x[j] cross
+product once and double it. Draft of the x86-64 sequence (unverified):
+
+   load z[i+j] into register
+   load x[j] into %hi
+   mulq %[x_i] -> x[i] * x[j] -> %lo:%hi
+   shlq %lo, $1
+   // put carry bit (cf) from %lo into %temp
+   xorl %temp
+   adcq $0, %temp
+   // high bit of lo now in cf
+   shl %hi, $1
+   // or in the bit shifted out of %lo
+   orl %temp, %hi
+   addq %[c], %[lo]
+   adcq $0, %[hi]
+   addq %[z_ij], %[lo]
+   adcq $0, %[hi]
+*/
+
+}
+
+}
diff --git a/src/bigint/mulop_amd64/mp_mulop_amd64.S b/src/bigint/mulop_amd64/mp_mulop_amd64.S
new file mode 100644
index 000000000..e5bba23fb
--- /dev/null
+++ b/src/bigint/mulop_amd64/mp_mulop_amd64.S
@@ -0,0 +1,128 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring      *
+* (C) 1999-2008 Jack Lloyd                       *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop.S)
+
+#if 0
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      word carry = 0;
+
+      /*
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+      */
+
+
+      for(u32bit j = 0; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      for(u32bit j = 0; j != x_size; ++j)
+         {
+         dword z = (dword)a * b + c + *d;
+         *d = (word)(z >> BOTAN_MP_WORD_BITS);
+         return (word)z;
+         }
+
+   
+   
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+   }
+
+   
+
+      z[x_size+i] = carry;
+      }
+   }
+
+#endif
+
+START_FUNCTION(bigint_simple_sqr)
+// C prototype: void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+#define Z_ARR    ARG_1
+#define X_ARR    ARG_2
+//#define X_SIZE   ARG_3_32
+
+#define CARRY    TEMP_1
+#define Z_WORD   TEMP_2
+#define LOOP_I   TEMP_3
+#define LOOP_J   TEMP_4
+#define X_SIZE   TEMP_5
+#define MUL_LO   %rax
+// arg 3 (x_size) arrives in the register that also receives the high product word
+#define MUL_HI   %rdx
+   
+// need arg3 == rdx for multiply, so copy x_size into a temp first
+   ASSIGN(X_SIZE, ARG_3_32)
+
+   ZEROIZE(CARRY)
+
+   ZEROIZE(LOOP_I)
+
+.LOOP_ZEROIZE_Z:
+
+   cmp LOOP_I, X_SIZE
+
+// NOTE(review): this listing looks unfinished -- LOOP_CTR and Y, used below,
+// are not defined anywhere in this file. Confirm whether asm_macr.h supplies
+// them (and the operand widths in the ASSIGN above) before enabling the build.
+   
+   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+   JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N)                  \
+   ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+   ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+   MUL(Y)                           ; \
+   ADD(Z_WORD, CARRY)               ; \
+   ASSIGN(CARRY, MUL_HI)            ; \
+   ADD_LAST_CARRY(CARRY)            ; \
+   ADD(Z_WORD, MUL_LO)              ; \
+   ADD_LAST_CARRY(CARRY)            ; \
+   ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+.LOOP_MULADD8:
+   MULADD_OP(0)
+   MULADD_OP(1)
+   MULADD_OP(2)
+   MULADD_OP(3)
+   MULADD_OP(4)
+   MULADD_OP(5)
+   MULADD_OP(6)
+   MULADD_OP(7)
+
+   SUB_IMM(LOOP_CTR, 8)
+   ADD_IMM(Z_ARR, 64)
+   ADD_IMM(X_ARR, 64)
+   cmp IMM(8), LOOP_CTR
+   jge .LOOP_MULADD8
+
+   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+   MULADD_OP(0)
+
+   SUB_IMM(LOOP_CTR, 1)
+   ADD_IMM(Z_ARR, 8)
+   ADD_IMM(X_ARR, 8)
+
+   cmp IMM(0), LOOP_CTR
+   jne .LOOP_MULADD1
+
+.L_MULADD_DONE:
+   RETURN_VALUE_IS(CARRY)
+END_FUNCTION(bigint_simple_sqr)
diff --git a/src/bigint/mulop_ia32/mp_mulop.S b/src/bigint/mulop_ia32/mp_mulop.S
new file mode 100644
index 000000000..a5f0d3b27
--- /dev/null
+++ b/src/bigint/mulop_ia32/mp_mulop.S
@@ -0,0 +1,62 @@
+/*************************************************
+* Multiply/Add Algorithm Source File             *
+* (C) 1999-2007 Jack Lloyd                       *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_muladd.S)
+
+START_FUNCTION(bigint_mul_add_words)
+   SPILL_REGS()
+#define PUSHED 4
+/* Register roles: ESI = words left, ECX = z, EBX = x, EBP = y, EDI = carry word (presumably; see MULADD_OP) */
+#define LOOP_CTR ESI
+   ASSIGN(LOOP_CTR, ARG(3)) /* x_size */
+   ZEROIZE(EDI) /* EDI is copied to EAX at .MUL_ADD_DONE */
+
+   ASSIGN(ECX, ARG(1)) /* z[] */
+   ASSIGN(EBX, ARG(2)) /* x[] */
+   ASSIGN(EBP, ARG(4)) /* y */
+/* MULADD_OP(N): presumably z[N] += x[N] * y + EDI, leaving the new high word in EDI -- confirm against ADD_W_CARRY */
+#define MULADD_OP(N)                       \
+   ASSIGN(EAX, ARRAY4(EBX, N))           ; \
+   MUL(EBP)                              ; \
+   ADD_W_CARRY(EAX, EDX, EDI)            ; \
+   ASSIGN(EDI, EDX)                      ; \
+   ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ;
+/* A zero count skips all work; a count below 8 goes straight to the one-word loop */
+   JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+   JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP) /* label presumably emitted by START_LOOP(.MULADD1) -- confirm */
+/* Unrolled loop: 8 words per pass, so both pointers advance 32 bytes (ARRAY4 suggests 4-byte words) */
+START_LOOP(.MULADD8)
+   MULADD_OP(0)
+   MULADD_OP(1)
+   MULADD_OP(2)
+   MULADD_OP(3)
+   MULADD_OP(4)
+   MULADD_OP(5)
+   MULADD_OP(6)
+   MULADD_OP(7)
+
+   SUB_IMM(LOOP_CTR, 8)
+   ADD_IMM(EBX, 32)
+   ADD_IMM(ECX, 32)
+LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8)
+
+   JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+/* Tail loop: one word per pass, both pointers advance 4 bytes */
+START_LOOP(.MULADD1)
+   MULADD_OP(0)
+
+   SUB_IMM(LOOP_CTR, 1)
+   ADD_IMM(EBX, 4)
+   ADD_IMM(ECX, 4)
+LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1)
+
+.MUL_ADD_DONE:
+/* Copy EDI into EAX (presumably the return value: the final carry word) before restoring registers */
+   ASSIGN(EAX, EDI)
+#undef PUSHED
+   RESTORE_REGS()
+END_FUNCTION(bigint_mul_add_words)
diff --git a/src/bigint/mulop_ia32/xxxinfo.txt b/src/bigint/mulop_ia32/xxxinfo.txt
new file mode 100644
index 000000000..12c8cd96d
--- /dev/null
+++ b/src/bigint/mulop_ia32/xxxinfo.txt
@@ -0,0 +1,43 @@
+realname "x86 Assembler"
+
+#mp_bits 32
+
+load_on asm_ok
+
+<replace>
+md4.cpp
+md5.cpp
+sha160.cpp
+serpent.cpp
+</replace>
+
+<ignore>
+#mp_mulop.cpp
+</ignore>
+
+<add>
+asm_macr.h
+md4core.S
+md5core.S
+sha1_asm.S
+serp_asm.S
+#mp_mulop.S
+</add>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
-- 
cgit v1.2.3