author    lloyd <[email protected]>  2008-09-28 15:34:09 +0000
committer lloyd <[email protected]>  2008-09-28 15:34:09 +0000
commit    ea32d18231b9c6c5c84b3754c4249170d3b4e4c0 (patch)
tree      cc179337d0594ed105768011722b9dbae105e07a /modules/asm
parent    b841401e095cfc1aa0708689d7920eb95ece71af (diff)
This is the first checkin to net.randombit.botan.modularized, whose intent is to modularize Botan's source code and make it much easier to add or remove components at compile time.

In this first checkin: add support for nested directories in modules/, and move all the modules into grouped directories like entropy/ or compression/. This is not yet ideal: the build will _only_ find code in modules/*/*/modinfo.txt, while it would be much better to allow arbitrary nesting under modules/ (find modules -name modinfo.txt) for more complicated setups.

This 'new' (OMG, I've found directories!) structure also allows a freer naming convention: there is no need for leading es_, ml_, etc to group names, though some modules keep those prefixes for lack of a more meaningful name being obvious to me right at the moment.
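[Editorial note: as a reading aid, a sketch of the arbitrary-nesting search described above. This is illustrative C++17 only -- the real discovery logic belongs to the build scripts, and the equivalent shell command is simply `find modules -name modinfo.txt`.]

#include <filesystem>
#include <iostream>

// Walk modules/ recursively and report every modinfo.txt found,
// regardless of nesting depth.
int main()
   {
   namespace fs = std::filesystem;
   for(const auto& entry : fs::recursive_directory_iterator("modules"))
      if(entry.is_regular_file() && entry.path().filename() == "modinfo.txt")
         std::cout << entry.path().string() << '\n';
   return 0;
   }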
Diffstat (limited to 'modules/asm')
-rw-r--r--  modules/asm/asm_amd64/asm_macr.h        |  125
-rw-r--r--  modules/asm/asm_amd64/modinfo.txt       |   39
-rw-r--r--  modules/asm/asm_amd64/mp_monty.S        |  397
-rw-r--r--  modules/asm/asm_amd64/mp_mulop_amd64.S  |  128
-rw-r--r--  modules/asm/asm_amd64/sha160.cpp        |   52
-rw-r--r--  modules/asm/asm_amd64/sha1_asm.S        |  258
-rw-r--r--  modules/asm/asm_ia32/asm_macr.h         |  131
-rw-r--r--  modules/asm/asm_ia32/md4.cpp            |   43
-rw-r--r--  modules/asm/asm_ia32/md4core.S          |  135
-rw-r--r--  modules/asm/asm_ia32/md5.cpp            |   43
-rw-r--r--  modules/asm/asm_ia32/md5core.S          |  164
-rw-r--r--  modules/asm/asm_ia32/modinfo.txt        |   43
-rw-r--r--  modules/asm/asm_ia32/mp_mulop.S         |   62
-rw-r--r--  modules/asm/asm_ia32/serp_asm.S         |  667
-rw-r--r--  modules/asm/asm_ia32/serpent.cpp        |   49
-rw-r--r--  modules/asm/asm_ia32/sha160.cpp         |   52
-rw-r--r--  modules/asm/asm_ia32/sha1_asm.S         |  242
-rw-r--r--  modules/asm/mp_amd64/bswap.h            |   36
-rw-r--r--  modules/asm/mp_amd64/modinfo.txt        |   21
-rw-r--r--  modules/asm/mp_amd64/mp_asm.h           |   67
-rw-r--r--  modules/asm/mp_amd64/mp_asmi.h          |  233
-rw-r--r--  modules/asm/mp_amd64/mp_mulop.cpp       |   94
-rw-r--r--  modules/asm/mp_asm64/modinfo.txt        |   25
-rw-r--r--  modules/asm/mp_asm64/mp_asm.h           |  110
-rw-r--r--  modules/asm/mp_ia32/modinfo.txt         |   19
-rw-r--r--  modules/asm/mp_ia32/mp_asm.h            |   65
-rw-r--r--  modules/asm/mp_ia32/mp_asmi.h           |  225
-rw-r--r--  modules/asm/mp_ia32_msvc/modinfo.txt    |   17
-rw-r--r--  modules/asm/mp_ia32_msvc/mp_asmi.h      |  547
-rw-r--r--  modules/asm/sha1_sse2/modinfo.txt       |   22
-rw-r--r--  modules/asm/sha1_sse2/sha160.cpp        |   52
-rw-r--r--  modules/asm/sha1_sse2/sha160.h          |   32
-rw-r--r--  modules/asm/sha1_sse2/sha1core.cpp      |  327
33 files changed, 4522 insertions(+), 0 deletions(-)
diff --git a/modules/asm/asm_amd64/asm_macr.h b/modules/asm/asm_amd64/asm_macr.h
new file mode 100644
index 000000000..02aafb347
--- /dev/null
+++ b/modules/asm/asm_amd64/asm_macr.h
@@ -0,0 +1,125 @@
+/*************************************************
+* Assembly Macros Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_EXT_AMD64_ASM_MACROS_H__
+#define BOTAN_EXT_AMD64_ASM_MACROS_H__
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+/*************************************************
+* General/Global Macros *
+*************************************************/
+#define ALIGN .p2align 4,,15
+
+#define START_LISTING(FILENAME) \
+ .file #FILENAME; \
+ .text; \
+ ALIGN;
+
+/*************************************************
+* Function Definitions *
+*************************************************/
+#define START_FUNCTION(func_name) \
+ ALIGN; \
+ .global func_name; \
+ .type func_name,@function; \
+func_name:
+
+#define END_FUNCTION(func_name) \
+ ret
+
+/*************************************************
+* Conditional Jumps *
+*************************************************/
+#define JUMP_IF_ZERO(REG, LABEL) \
+ cmp IMM(0), REG; \
+ jz LABEL
+
+#define JUMP_IF_LT(REG, NUM, LABEL) \
+ cmp IMM(NUM), REG; \
+ jl LABEL
+
+/*************************************************
+* Register Names *
+*************************************************/
+#define R0 %rax
+#define R1 %rbx
+#define R2 %rcx
+#define R2_32 %ecx
+#define R3 %rdx
+#define R3_32 %edx
+#define R4 %rsp
+#define R5 %rbp
+#define R6 %rsi
+#define R6_32 %esi
+#define R7 %rdi
+#define R8 %r8
+#define R9 %r9
+#define R9_32 %r9d
+#define R10 %r10
+#define R11 %r11
+#define R12 %r12
+#define R13 %r13
+#define R14 %r14
+#define R15 %r15
+
+#define ARG_1 R7
+#define ARG_2 R6
+#define ARG_2_32 R6_32
+#define ARG_3 R3
+#define ARG_3_32 R3_32
+#define ARG_4 R2
+#define ARG_4_32 R2_32
+#define ARG_5 R8
+#define ARG_6 R9
+#define ARG_6_32 R9_32
+
+#define TEMP_1 R10
+#define TEMP_2 R11
+#define TEMP_3 ARG_6
+#define TEMP_4 ARG_5
+#define TEMP_5 ARG_4
+#define TEMP_5_32 ARG_4_32
+#define TEMP_6 ARG_3
+#define TEMP_7 ARG_2
+#define TEMP_8 ARG_1
+#define TEMP_9 R0
+
+/*************************************************
+* Memory Access Operations *
+*************************************************/
+#define ARRAY8(REG, NUM) 8*(NUM)(REG)
+#define ARRAY4(REG, NUM) 4*(NUM)(REG)
+
+#define ASSIGN(TO, FROM) mov FROM, TO
+
+/*************************************************
+* ALU Operations *
+*************************************************/
+#define IMM(VAL) $VAL
+
+#define ADD(TO, FROM) add FROM, TO
+#define ADD_LAST_CARRY(REG) adc IMM(0), REG
+#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM))
+#define ADD_W_CARRY(TO1, TO2, FROM) add FROM, TO1; adc IMM(0), TO2;
+#define SUB_IMM(TO, NUM) sub IMM(NUM), TO
+#define MUL(REG) mul REG
+
+#define XOR(TO, FROM) xor FROM, TO
+#define AND(TO, FROM) and FROM, TO
+#define OR(TO, FROM) or FROM, TO
+#define NOT(REG) not REG
+#define ZEROIZE(REG) XOR(REG, REG)
+
+#define RETURN_VALUE_IS(V) ASSIGN(%rax, V)
+
+#define ROTL_IMM(REG, NUM) rol IMM(NUM), REG
+#define ROTR_IMM(REG, NUM) ror IMM(NUM), REG
+#define ADD3_IMM(TO, FROM, NUM) lea NUM(TO,FROM,1), TO
+
+#endif
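[Editorial note: a trivial usage sketch of the macro DSL above. The function is hypothetical, not part of the module; it only shows how a .S file built on this header is laid out.]

#include <botan/asm_macr.h>

START_LISTING(example.S)

/* u64 example_add3(u64 a, u64 b): returns a + b + 3 */
START_FUNCTION(example_add3)
   ASSIGN(TEMP_1, ARG_1)      /* mov %rdi, %r10 */
   ADD(TEMP_1, ARG_2)         /* add %rsi, %r10 */
   ADD_IMM(TEMP_1, 3)         /* add $3, %r10 */
   RETURN_VALUE_IS(TEMP_1)    /* mov %r10, %rax */
END_FUNCTION(example_add3)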
diff --git a/modules/asm/asm_amd64/modinfo.txt b/modules/asm/asm_amd64/modinfo.txt
new file mode 100644
index 000000000..30aa5a413
--- /dev/null
+++ b/modules/asm/asm_amd64/modinfo.txt
@@ -0,0 +1,39 @@
+realname "x86-64 Assembler"
+
+mp_bits 64
+
+load_on request
+
+<replace>
+sha160.cpp
+</replace>
+
+<ignore>
+#mp_mulop.cpp
+#mp_monty.cpp
+</ignore>
+
+<add>
+asm_macr.h
+#mp_mulop_amd64.S
+#mp_monty.S
+sha1_asm.S
+</add>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
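[Editorial note: the tag-block syntax above (<replace>, <ignore>, <add>, <arch>, <cc>, <os>) is simple enough to parse in a few lines. A hypothetical sketch follows -- the actual build script's parser may differ, and top-level keys like realname, mp_bits, and load_on are ignored here.]

#include <fstream>
#include <map>
#include <string>
#include <vector>

// Collect the entries of each <tag>...</tag> block in a modinfo.txt.
// Lines starting with '#' are comments, like #mp_mulop.cpp above.
std::map<std::string, std::vector<std::string> >
parse_modinfo_tags(const std::string& path)
   {
   std::map<std::string, std::vector<std::string> > tags;
   std::ifstream in(path.c_str());
   std::string line, current;
   while(std::getline(in, line))
      {
      if(line.empty() || line[0] == '#')
         continue;
      if(!current.empty() && line == "</" + current + ">")
         { current.clear(); continue; }
      if(line.size() > 2 && line[0] == '<' && line[line.size()-1] == '>')
         { current = line.substr(1, line.size() - 2); continue; }
      if(!current.empty())
         tags[current].push_back(line);
      }
   return tags;
   }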
diff --git a/modules/asm/asm_amd64/mp_monty.S b/modules/asm/asm_amd64/mp_monty.S
new file mode 100644
index 000000000..3dd4040bc
--- /dev/null
+++ b/modules/asm/asm_amd64/mp_monty.S
@@ -0,0 +1,397 @@
+/*************************************************
+* Montgomery Reduction Source File *
+* (C) 2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_monty.S)
+
+START_FUNCTION(bigint_monty_redc)
+ pushq %r15 #
+ pushq %r14 #
+ pushq %r13 #
+ pushq %r12 #
+ pushq %rbp #
+ pushq %rbx #
+
+ movq %rdi, %r14 # z
+ movq %rdx, %r12 # x
+ movl %esi, %ebp # z_size
+
+ xorl %esi, %esi # j.76
+ movq %r8, -16(%rsp) # u, u
+ movl %ecx, %ebx # x_size, x_size
+ movl %ecx, %r8d # x_size, blocks_of_8
+ andl $-8, %r8d #, blocks_of_8
+ testl %ecx, %ecx # x_size
+ je .L3 #,
+ mov %ecx, %eax # x_size, pretmp.71
+ leal 1(%rbx), %r15d #, k.73
+ salq $3, %rax #,
+ xorl %r13d, %r13d # j
+ movq %rax, -8(%rsp) #, pretmp.21
+ .p2align 4,,10
+ .p2align 3
+.L11:
+ mov %r13d, %eax # j, j
+ movq -16(%rsp), %rdi # u, y
+ leaq (%r14,%rax,8), %r11 #, z_j
+ xorl %r9d, %r9d # i
+ imulq (%r11), %rdi #* z_j, y
+ xorl %r10d, %r10d # carry
+ testl %r8d, %r8d # blocks_of_8
+ je .L7 #,
+ .p2align 4,,10
+ .p2align 3
+.LOOP_MUL_ADD:
+ mov %r9d, %ecx # i, i
+ addl $8, %r9d #, i
+ salq $3, %rcx #, D.2315
+ leaq (%r11,%rcx), %rsi #, tmp130
+ leaq (%r12,%rcx), %rcx #, tmp131
+
+ movq 8*0(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*0(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*0 (%rsi)
+
+ movq 8*1(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*1(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*1 (%rsi)
+
+ movq 8*2(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*2(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*2 (%rsi)
+
+ movq 8*3(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*3(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*3 (%rsi)
+
+ movq 8*4(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*4(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*4 (%rsi)
+
+ movq 8*5(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*5(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*5 (%rsi)
+
+ movq 8*6(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*6(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*6 (%rsi)
+
+ movq 8*7(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*7(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*7 (%rsi)
+
+ cmpl %r9d, %r8d # i, blocks_of_8
+ jne .LOOP_MUL_ADD #,
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L8 #,
+.L7:
+ movl %r8d, %esi # blocks_of_8, i
+ .p2align 4,,10
+ .p2align 3
+.L5:
+ mov %esi, %eax # i, i
+ movq %rdi, %rcx # y, b
+ leaq (%r11, %rax,8), %r9 #, D.2325
+ incl %esi # i
+ movq (%r12, %rax,8), %rax #* x, tmp133
+
+ mulq %rcx # b
+ addq (%r9), %rax #* D.2325, a
+ adcq $0,%rdx #
+ addq %r10, %rax # carry, a
+ adcq $0,%rdx #
+
+ cmpl %esi, %ebx # i, x_size
+ movq %rdx, %r10 #, carry
+ movq %rax, (%r9) # a,* D.2325
+ jne .L5 #,
+.L8:
+ movq -8(%rsp), %rdx # pretmp.21,
+ leaq (%r11,%rdx), %rax #, D.2332
+ movq (%rax), %rcx #* D.2332, D.2333
+ leaq (%r10,%rcx), %rdx #, z_sum
+ movq %rdx, (%rax) # z_sum,* D.2332
+ cmpq %rdx, %rcx # z_sum, D.2333
+ jbe .L9 #,
+ cmpl %ebp, %r15d # z_size, k.73
+ je .L9 #,
+ movl %r15d, %ecx # k.73, k
+ jmp .L10 #
+ .p2align 4,,10
+ .p2align 3
+.L31:
+ incl %ecx # k
+ cmpl %ecx, %ebp # k, z_size
+ .p2align 4,,4
+ .p2align 3
+ je .L9 #,
+.L10:
+ mov %ecx, %edx # k, k
+ leaq (%r11,%rdx,8), %rdx #, D.2342
+ movq (%rdx), %rax #* D.2342, tmp136
+ incq %rax # D.2344
+ movq %rax, (%rdx) # D.2344,* D.2342
+ testq %rax, %rax # D.2344
+ je .L31 #,
+.L9:
+ incl %r13d # j
+ decl %ebp # z_size
+ cmpl %r13d, %ebx # j, x_size
+ jne .L11 #,
+ movl %ebx, %esi # x_size, j.76
+.L3:
+ leal (%rbx,%rbx), %eax #, tmp137
+ mov %eax, %eax
+ leaq (%r14, %rax,8), %rdi #, D.2349
+ cmpq $0, (%rdi) #,* D.2349
+ jne .L12 #,
+ testl %ebx, %ebx # x_size
+ je .L12 #,
+ leal -1(%rbx), %ecx #, j
+ leal (%rsi,%rcx), %edx #, tmp141
+ mov %ecx, %eax # j, j
+ movq (%r14,%rdx,8), %rbp #* z,
+ cmpq %rbp, (%r12, %rax,8) #,* x
+ jb .L12 #,
+ ja .L_EXIT #,
+ leal -2(%rsi,%rbx), %edx #, ivtmp.45
+ jmp .L14 #
+ .p2align 4,,10
+ .p2align 3
+.L15:
+ mov %edx, %eax # ivtmp.45, ivtmp.45
+ decl %ecx # j
+ movq (%r14, %rax,8), %rsi #* z, D.2360
+ mov %ecx, %eax # j, j
+ movq (%r12, %rax,8), %rax #* x, temp.68
+ cmpq %rax, %rsi
+ ja .L12 #,
+ decl %edx # ivtmp.45
+ cmpq %rax, %rsi
+ jb .L_EXIT #,
+.L14:
+ testl %ecx, %ecx # j
+ jne .L15 #,
+.L12:
+ xorl %ecx, %ecx # j
+ xorl %r10d, %r10d # carry
+ mov %ebx, %esi # x_size, pretmp.19
+ testl %r8d, %r8d # blocks_of_8
+ je .L17 #,
+ .p2align 4,,10
+ .p2align 3
+.L22:
+ mov %ecx, %edx # j, D.2375
+ addl $8, %ecx #, j
+ leaq (%rdx,%rsi), %rax #, tmp146
+ leaq (%r12,%rdx,8), %rdx #, tmp150
+ leaq (%r14, %rax,8), %rax #, tmp148
+
+ rorq %r10 # carry
+
+ movq 8*0(%rdx), %r10
+ sbbq %r10, 8*0(%rax)
+
+ movq 8*1(%rdx), %r10
+ sbbq %r10, 8*1(%rax)
+
+ movq 8*2(%rdx), %r10
+ sbbq %r10, 8*2(%rax)
+
+ movq 8*3(%rdx), %r10
+ sbbq %r10, 8*3(%rax)
+
+ movq 8*4(%rdx), %r10
+ sbbq %r10, 8*4(%rax)
+
+ movq 8*5(%rdx), %r10
+ sbbq %r10, 8*5(%rax)
+
+ movq 8*6(%rdx), %r10
+ sbbq %r10, 8*6(%rax)
+
+ movq 8*7(%rdx), %r10
+ sbbq %r10, 8*7(%rax)
+
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %ecx, %r8d # j, blocks_of_8
+ jne .L22 #,
+.L17:
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L19 #,
+ leal (%r8,%rbx), %r9d #, ivtmp.33
+ movl %r8d, %esi # blocks_of_8, j
+ .p2align 4,,10
+ .p2align 3
+.L20:
+ mov %r9d, %eax # ivtmp.33, ivtmp.33
+ mov %esi, %ecx # j, j
+ leaq (%r14, %rax,8), %rax #, D.2387
+ incl %esi # j
+ movq (%rax), %rdx #* D.2387, tmp153
+ incl %r9d # ivtmp.33
+
+ rorq %r10 # carry
+ sbbq (%r12,%rcx,8),%rdx #* x, x
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %esi, %ebx # j, x_size
+ movq %rdx, (%rax) # x,* D.2387
+ jne .L20 #,
+.L19:
+ testq %r10, %r10 # carry
+ je .L_EXIT #,
+ decq (%rdi) #* D.2349
+.L_EXIT:
+ popq %rbx #
+ popq %rbp #
+ popq %r12 #
+ popq %r13 #
+ popq %r14 #
+ popq %r15 #
+END_FUNCTION(bigint_monty_redc)
+
+
+#if 0
+#define Z_ARR ARG_1 // rdi
+#define Z_SIZE ARG_2_32 // esi
+// X_ARR is ARG_3 == rdx, moved b/c needed for multiply
+#define X_SIZE ARG_4_32 // ecx
+#define U ARG_5 // r8
+
+/*
+ We need all arguments for a while (we can reuse U eventually)
+ So only temp registers are
+ TEMP_1 %r10
+ TEMP_2 %r11
+ TEMP_3 = ARG_6 = %r9
+ void return, so also
+ R0 %rax (aka TEMP_9)
+ is free (but needed for multiply)
+
+ Can push:
+ %rbx (base pointer, callee saved)
+     %rbp (frame pointer, callee saved)
+ %r12-%r15 (callee saved)
+
+ Can push base/frame pointers since this is a leaf function
+ and does not reference any data.
+*/
+
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+#define LOOP_CTR_I %r12
+#define LOOP_CTR_J %r13
+
+#define CARRY TEMP_1
+#define Z_WORD TEMP_2
+#define X_ARR TEMP_3
+#define MUL_LO %rax
+#define MUL_HI %rdx
+
+ ASSIGN(X_ARR, ARG_3)
+
+ /*
+ ZEROIZE(CARRY)
+
+ ASSIGN(LOOP_CTR, X_SIZE)
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+ JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+ALIGN
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(Z_ARR, 64)
+ ADD_IMM(X_ARR, 64)
+ cmp IMM(8), LOOP_CTR
+ jge .LOOP_MULADD8
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(Z_ARR, 8)
+ ADD_IMM(X_ARR, 8)
+
+ cmp IMM(0), LOOP_CTR
+ jne .LOOP_MULADD1
+*/
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+#endif
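[Editorial note: the compiler-generated code above is hard to follow, so here is a portable sketch of the word-level algorithm it implements. Assumptions: 64-bit words with gcc's __int128 as the double word, and the word_madd3 semantics used elsewhere in Botan's mp code; the final conditional subtraction is summarized in a comment rather than spelled out.]

#include <stdint.h>

typedef uint64_t word;
typedef unsigned __int128 dword;

// a*b + c + *carry; low word returned, high word left in *carry
static inline word word_madd3(word a, word b, word c, word* carry)
   {
   dword z = (dword)a * b + c + *carry;
   *carry = (word)(z >> 64);
   return (word)z;
   }

// z[] holds 2*x_size+1 words; u = -x^-1 mod 2^64 (Montgomery constant)
void monty_redc_ref(word z[], uint32_t z_size,
                    const word x[], uint32_t x_size, word u)
   {
   for(uint32_t j = 0; j != x_size; ++j)
      {
      word* z_j = z + j;
      const word y = z_j[0] * u; // multiplier that zeroes z_j[0]

      word carry = 0;
      for(uint32_t i = 0; i != x_size; ++i)            // the unrolled
         z_j[i] = word_madd3(x[i], y, z_j[i], &carry); // .LOOP_MUL_ADD

      // fold the loop carry into z and ripple any overflow upward
      const word old = z_j[x_size];
      z_j[x_size] = old + carry;
      bool overflow = (z_j[x_size] < old);
      for(uint32_t k = x_size + 1; overflow && k != z_size - j; ++k)
         overflow = (++z_j[k] == 0);
      }
   // The asm then compares the upper half of z against x and, if it is
   // >= x (or the top word is set), subtracts x once (.L12 onward).
   }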
diff --git a/modules/asm/asm_amd64/mp_mulop_amd64.S b/modules/asm/asm_amd64/mp_mulop_amd64.S
new file mode 100644
index 000000000..e5bba23fb
--- /dev/null
+++ b/modules/asm/asm_amd64/mp_mulop_amd64.S
@@ -0,0 +1,128 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop_amd64.S)
+
+#if 0
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      word carry = 0;
+
+      /* Eventually: process eight words at a time, as the asm does:
+
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+      */
+
+      for(u32bit j = 0; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+/* word_madd3(a, b, c, d) computes a*b + c + *d, returning the low
+   word and leaving the high word in *d:
+
+      dword z = (dword)a * b + c + *d;
+      *d = (word)(z >> BOTAN_MP_WORD_BITS);
+      return (word)z;
+*/
+
+#endif
+
+START_FUNCTION(bigint_simple_sqr)
+
+#define Z_ARR ARG_1
+#define X_ARR ARG_2
+//#define X_SIZE ARG_3_32
+
+#define CARRY TEMP_1
+#define Z_WORD TEMP_2
+#define LOOP_I TEMP_3
+#define LOOP_J TEMP_4
+#define X_SIZE TEMP_5
+#define MUL_LO %rax
+#define MUL_HI %rdx
+
+// %rdx doubles as arg 3 (x_size) and is clobbered by mul, so copy it out
+   ASSIGN(X_SIZE, ARG_3_32)
+
+ ZEROIZE(CARRY)
+
+ ZEROIZE(LOOP_I)
+
+// NB: this routine is unfinished (and is commented out in modinfo.txt):
+// the zeroize loop below has no body, LOOP_CTR is never defined, and
+// MULADD_OP's MUL(Y) refers to a Y that is never set up.
+.LOOP_ZEROIZE_Z:
+
+   cmp LOOP_I, X_SIZE
+
+   JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+   JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(Z_ARR, 64)
+ ADD_IMM(X_ARR, 64)
+ cmp IMM(8), LOOP_CTR
+ jge .LOOP_MULADD8
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(Z_ARR, 8)
+ ADD_IMM(X_ARR, 8)
+
+ cmp IMM(0), LOOP_CTR
+ jne .LOOP_MULADD1
+
+.L_MULADD_DONE:
+ RETURN_VALUE_IS(CARRY)
+END_FUNCTION(bigint_simple_sqr)
diff --git a/modules/asm/asm_amd64/sha160.cpp b/modules/asm/asm_amd64/sha160.cpp
new file mode 100644
index 000000000..cfac02f45
--- /dev/null
+++ b/modules/asm/asm_amd64/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void botan_sha160_asm_amd64(u32bit[5], const byte[64], u32bit[80]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha160_asm_amd64(digest, input, W);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ W.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(80)
+ {
+ clear();
+ }
+
+}
diff --git a/modules/asm/asm_amd64/sha1_asm.S b/modules/asm/asm_amd64/sha1_asm.S
new file mode 100644
index 000000000..ecf4a18ce
--- /dev/null
+++ b/modules/asm/asm_amd64/sha1_asm.S
@@ -0,0 +1,258 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(sha1_asm.S)
+
+START_FUNCTION(botan_sha160_asm_amd64)
+
+#define DIGEST_ARR %rdi
+#define INPUT %rsi
+#define W %rdx
+#define LOOP_CTR %eax
+
+#define A %r8d
+#define B %r9d
+#define C %r10d
+#define D %r11d
+#define E %ecx
+
+ ZEROIZE(LOOP_CTR)
+
+ALIGN;
+.LOOP_LOAD_INPUT:
+ addl $8, %eax
+
+ movq ARRAY8(INPUT, 0), %r8
+ movq ARRAY8(INPUT, 1), %r9
+ movq ARRAY8(INPUT, 2), %r10
+ movq ARRAY8(INPUT, 3), %r11
+
+ bswap %r8
+ bswap %r9
+ bswap %r10
+ bswap %r11
+
+ rolq $32, %r8
+ rolq $32, %r9
+ rolq $32, %r10
+ rolq $32, %r11
+
+ movq %r8, ARRAY8(W, 0)
+ movq %r9, ARRAY8(W, 1)
+ movq %r10, ARRAY8(W, 2)
+ movq %r11, ARRAY8(W, 3)
+
+ addq $32, W
+ addq $32, INPUT
+
+ cmp IMM(16), LOOP_CTR
+ jne .LOOP_LOAD_INPUT
+
+/*
+#define A %r8d
+#define B %r9d
+#define C %r10d
+#define D %r11d
+#define E %ecx
+*/
+
+ALIGN;
+.LOOP_EXPANSION:
+ addl $4, LOOP_CTR
+
+ ZEROIZE(A)
+ ASSIGN(B, ARRAY4(W, -1))
+ ASSIGN(C, ARRAY4(W, -2))
+ ASSIGN(D, ARRAY4(W, -3))
+
+ XOR(A, ARRAY4(W, -5))
+ XOR(B, ARRAY4(W, -6))
+ XOR(C, ARRAY4(W, -7))
+ XOR(D, ARRAY4(W, -8))
+
+ XOR(A, ARRAY4(W, -11))
+ XOR(B, ARRAY4(W, -12))
+ XOR(C, ARRAY4(W, -13))
+ XOR(D, ARRAY4(W, -14))
+
+ XOR(A, ARRAY4(W, -13))
+ XOR(B, ARRAY4(W, -14))
+ XOR(C, ARRAY4(W, -15))
+ XOR(D, ARRAY4(W, -16))
+
+ ROTL_IMM(D, 1)
+ ROTL_IMM(C, 1)
+ ROTL_IMM(B, 1)
+ XOR(A, D)
+ ROTL_IMM(A, 1)
+
+ ASSIGN(ARRAY4(W, 0), D)
+ ASSIGN(ARRAY4(W, 1), C)
+ ASSIGN(ARRAY4(W, 2), B)
+ ASSIGN(ARRAY4(W, 3), A)
+
+ addq $16, W
+ cmp IMM(80), LOOP_CTR
+ jne .LOOP_EXPANSION
+
+ subq $320, W
+
+#define MAGIC1 0x5A827999
+#define MAGIC2 0x6ED9EBA1
+#define MAGIC3 0x8F1BBCDC
+#define MAGIC4 0xCA62C1D6
+
+#define T %esi
+#define T2 %eax
+
+#define F1(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, C) ; \
+ XOR(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC1) ; \
+ AND(E, B) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2_4(A, B, C, D, E, F, N, MAGIC) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ XOR(E, C) ; \
+ ADD3_IMM(F, T2, MAGIC) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F3(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ OR(E, C) ; \
+ AND(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC3) ; \
+ ASSIGN(T2, B) ; \
+ AND(T2, C) ; \
+ OR(E, T2) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2(A, B, C, D, E, F, W) \
+ F2_4(A, B, C, D, E, F, W, MAGIC2)
+
+#define F4(A, B, C, D, E, F, W) \
+ F2_4(A, B, C, D, E, F, W, MAGIC4)
+
+ ASSIGN(T, ARRAY4(DIGEST_ARR, 0))
+ ASSIGN(B, ARRAY4(DIGEST_ARR, 1))
+ ASSIGN(C, ARRAY4(DIGEST_ARR, 2))
+ ASSIGN(D, ARRAY4(DIGEST_ARR, 3))
+ ASSIGN(E, ARRAY4(DIGEST_ARR, 4))
+
+ /* First Round */
+ F1(A, B, C, D, E, T, 0)
+ F1(T, A, B, C, D, E, 1)
+ F1(E, T, A, B, C, D, 2)
+ F1(D, E, T, A, B, C, 3)
+ F1(C, D, E, T, A, B, 4)
+ F1(B, C, D, E, T, A, 5)
+ F1(A, B, C, D, E, T, 6)
+ F1(T, A, B, C, D, E, 7)
+ F1(E, T, A, B, C, D, 8)
+ F1(D, E, T, A, B, C, 9)
+ F1(C, D, E, T, A, B, 10)
+ F1(B, C, D, E, T, A, 11)
+ F1(A, B, C, D, E, T, 12)
+ F1(T, A, B, C, D, E, 13)
+ F1(E, T, A, B, C, D, 14)
+ F1(D, E, T, A, B, C, 15)
+ F1(C, D, E, T, A, B, 16)
+ F1(B, C, D, E, T, A, 17)
+ F1(A, B, C, D, E, T, 18)
+ F1(T, A, B, C, D, E, 19)
+
+ /* Second Round */
+ F2(E, T, A, B, C, D, 20)
+ F2(D, E, T, A, B, C, 21)
+ F2(C, D, E, T, A, B, 22)
+ F2(B, C, D, E, T, A, 23)
+ F2(A, B, C, D, E, T, 24)
+ F2(T, A, B, C, D, E, 25)
+ F2(E, T, A, B, C, D, 26)
+ F2(D, E, T, A, B, C, 27)
+ F2(C, D, E, T, A, B, 28)
+ F2(B, C, D, E, T, A, 29)
+ F2(A, B, C, D, E, T, 30)
+ F2(T, A, B, C, D, E, 31)
+ F2(E, T, A, B, C, D, 32)
+ F2(D, E, T, A, B, C, 33)
+ F2(C, D, E, T, A, B, 34)
+ F2(B, C, D, E, T, A, 35)
+ F2(A, B, C, D, E, T, 36)
+ F2(T, A, B, C, D, E, 37)
+ F2(E, T, A, B, C, D, 38)
+ F2(D, E, T, A, B, C, 39)
+
+ /* Third Round */
+ F3(C, D, E, T, A, B, 40)
+ F3(B, C, D, E, T, A, 41)
+ F3(A, B, C, D, E, T, 42)
+ F3(T, A, B, C, D, E, 43)
+ F3(E, T, A, B, C, D, 44)
+ F3(D, E, T, A, B, C, 45)
+ F3(C, D, E, T, A, B, 46)
+ F3(B, C, D, E, T, A, 47)
+ F3(A, B, C, D, E, T, 48)
+ F3(T, A, B, C, D, E, 49)
+ F3(E, T, A, B, C, D, 50)
+ F3(D, E, T, A, B, C, 51)
+ F3(C, D, E, T, A, B, 52)
+ F3(B, C, D, E, T, A, 53)
+ F3(A, B, C, D, E, T, 54)
+ F3(T, A, B, C, D, E, 55)
+ F3(E, T, A, B, C, D, 56)
+ F3(D, E, T, A, B, C, 57)
+ F3(C, D, E, T, A, B, 58)
+ F3(B, C, D, E, T, A, 59)
+
+ /* Fourth Round */
+ F4(A, B, C, D, E, T, 60)
+ F4(T, A, B, C, D, E, 61)
+ F4(E, T, A, B, C, D, 62)
+ F4(D, E, T, A, B, C, 63)
+ F4(C, D, E, T, A, B, 64)
+ F4(B, C, D, E, T, A, 65)
+ F4(A, B, C, D, E, T, 66)
+ F4(T, A, B, C, D, E, 67)
+ F4(E, T, A, B, C, D, 68)
+ F4(D, E, T, A, B, C, 69)
+ F4(C, D, E, T, A, B, 70)
+ F4(B, C, D, E, T, A, 71)
+ F4(A, B, C, D, E, T, 72)
+ F4(T, A, B, C, D, E, 73)
+ F4(E, T, A, B, C, D, 74)
+ F4(D, E, T, A, B, C, 75)
+ F4(C, D, E, T, A, B, 76)
+ F4(B, C, D, E, T, A, 77)
+ F4(A, B, C, D, E, T, 78)
+ F4(T, A, B, C, D, E, 79)
+
+ ADD(ARRAY4(DIGEST_ARR, 0), D)
+ ADD(ARRAY4(DIGEST_ARR, 1), T)
+ ADD(ARRAY4(DIGEST_ARR, 2), A)
+ ADD(ARRAY4(DIGEST_ARR, 3), B)
+ ADD(ARRAY4(DIGEST_ARR, 4), C)
+
+END_FUNCTION(botan_sha160_asm_amd64)
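[Editorial note: the interleaved .LOOP_EXPANSION above computes four W words per iteration of the standard SHA-1 schedule. The fourth word of each group needs W[t-3], which is produced in the same iteration; the asm therefore starts that accumulator at zero (ZEROIZE(A)), XORs in the other three taps, and folds the freshly rotated D in before A's final rotate. A plain reference:]

#include <stdint.h>

static inline uint32_t rotl1(uint32_t x)
   { return (x << 1) | (x >> 31); }

// Expand the 16 loaded message words to the full 80-word schedule
void sha1_expand_ref(uint32_t W[80])
   {
   for(int t = 16; t != 80; ++t)
      W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
   }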
diff --git a/modules/asm/asm_ia32/asm_macr.h b/modules/asm/asm_ia32/asm_macr.h
new file mode 100644
index 000000000..392b05d5b
--- /dev/null
+++ b/modules/asm/asm_ia32/asm_macr.h
@@ -0,0 +1,131 @@
+/*************************************************
+* Assembly Macros Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_EXT_IA32_ASM_MACROS_H__
+#define BOTAN_EXT_IA32_ASM_MACROS_H__
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+/*************************************************
+* General/Global Macros *
+*************************************************/
+#define ALIGN .p2align 4,,15
+
+#define START_LISTING(FILENAME) \
+ .file #FILENAME; \
+ .text; \
+ .p2align 4,,15;
+
+/*************************************************
+* Function Definitions *
+*************************************************/
+#define START_FUNCTION(func_name) \
+ .align 8; \
+ ALIGN; \
+ .global func_name; \
+ .type func_name,@function; \
+func_name:
+
+#define END_FUNCTION(func_name) \
+ ret
+
+/*************************************************
+* Loop Control *
+*************************************************/
+#define START_LOOP(LABEL) \
+ ALIGN; \
+ LABEL##_LOOP:
+
+#define LOOP_UNTIL_EQ(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jne LABEL##_LOOP
+
+#define LOOP_UNTIL_LT(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jge LABEL##_LOOP
+
+/*************************************************
+* Conditional Jumps                              *
+*************************************************/
+#define JUMP_IF_ZERO(REG, LABEL) \
+ cmpl IMM(0), REG; \
+ jz LABEL
+
+#define JUMP_IF_LT(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jl LABEL
+
+/*************************************************
+* Register Names *
+*************************************************/
+#define EAX %eax
+#define EBX %ebx
+#define ECX %ecx
+#define EDX %edx
+#define EBP %ebp
+#define EDI %edi
+#define ESI %esi
+#define ESP %esp
+
+/*************************************************
+* Memory Access Operations *
+*************************************************/
+#define ARRAY1(REG, NUM) (NUM)(REG)
+#define ARRAY4(REG, NUM) 4*(NUM)(REG)
+#define ARRAY4_INDIRECT(BASE, OFFSET, NUM) 4*(NUM)(BASE,OFFSET,4)
+#define ARG(NUM) 4*(PUSHED) + ARRAY4(ESP, NUM)
+
+#define ASSIGN(TO, FROM) movl FROM, TO
+#define ASSIGN_BYTE(TO, FROM) movzbl FROM, TO
+
+#define PUSH(REG) pushl REG
+#define POP(REG) popl REG
+
+#define SPILL_REGS() \
+ PUSH(EBP) ; \
+ PUSH(EDI) ; \
+ PUSH(ESI) ; \
+ PUSH(EBX)
+
+#define RESTORE_REGS() \
+ POP(EBX) ; \
+ POP(ESI) ; \
+ POP(EDI) ; \
+ POP(EBP)
+
+/*************************************************
+* ALU Operations *
+*************************************************/
+#define IMM(VAL) $VAL
+
+#define ADD(TO, FROM) addl FROM, TO
+#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM))
+#define ADD_W_CARRY(TO1, TO2, FROM) addl FROM, TO1; adcl IMM(0), TO2;
+#define SUB_IMM(TO, NUM) subl IMM(NUM), TO
+#define ADD2_IMM(TO, FROM, NUM) leal NUM(FROM), TO
+#define ADD3_IMM(TO, FROM, NUM) leal NUM(TO,FROM,1), TO
+#define MUL(REG) mull REG
+
+#define SHL_IMM(REG, SHIFT) shll IMM(SHIFT), REG
+#define SHR_IMM(REG, SHIFT) shrl IMM(SHIFT), REG
+#define SHL2_3(TO, FROM) leal 0(,FROM,8), TO
+
+#define XOR(TO, FROM) xorl FROM, TO
+#define AND(TO, FROM) andl FROM, TO
+#define OR(TO, FROM) orl FROM, TO
+#define NOT(REG) notl REG
+#define ZEROIZE(REG) XOR(REG, REG)
+
+#define ROTL_IMM(REG, NUM) roll IMM(NUM), REG
+#define ROTR_IMM(REG, NUM) rorl IMM(NUM), REG
+#define BSWAP(REG) bswapl REG
+
+#endif
diff --git a/modules/asm/asm_ia32/md4.cpp b/modules/asm/asm_ia32/md4.cpp
new file mode 100644
index 000000000..e3dc79012
--- /dev/null
+++ b/modules/asm/asm_ia32/md4.cpp
@@ -0,0 +1,43 @@
+/*************************************************
+* MD4 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/md4.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void md4_core(u32bit[4], const byte[64], u32bit[16]);
+
+/*************************************************
+* MD4 Compression Function *
+*************************************************/
+void MD4::hash(const byte input[])
+ {
+ md4_core(digest, input, M);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void MD4::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(3 - (j % 4), digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void MD4::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ M.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ }
+
+}
diff --git a/modules/asm/asm_ia32/md4core.S b/modules/asm/asm_ia32/md4core.S
new file mode 100644
index 000000000..662e9924a
--- /dev/null
+++ b/modules/asm/asm_ia32/md4core.S
@@ -0,0 +1,135 @@
+/*************************************************
+* MD4 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(md4core.S)
+
+START_FUNCTION(md4_core)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(2)) /* input block */
+ ASSIGN(EDI, ARG(3)) /* expanded words */
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ADD_IMM(EBP, 16)
+
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ASSIGN(EBP, ARG(1))
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+#define MSG EDI
+#define T1 ESI
+#define T2 EBP
+
+#define FF(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, C) ; \
+ XOR(T2, D) ; \
+ AND(T2, B) ; \
+ XOR(T2, D) ; \
+ ADD(A, T1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+#define GG(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ OR(T2, C) ; \
+ AND(T2, D) ; \
+ ADD3_IMM(A, T1, 0x5A827999) ; \
+ ASSIGN(T1, B) ; \
+ AND(T1, C) ; \
+ OR(T2, T1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+#define HH(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, 0x6ED9EBA1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+ FF(EAX,EBX,ECX,EDX, 0, 3);
+ FF(EDX,EAX,EBX,ECX, 1, 7);
+ FF(ECX,EDX,EAX,EBX, 2,11);
+ FF(EBX,ECX,EDX,EAX, 3,19);
+ FF(EAX,EBX,ECX,EDX, 4, 3);
+ FF(EDX,EAX,EBX,ECX, 5, 7);
+ FF(ECX,EDX,EAX,EBX, 6,11);
+ FF(EBX,ECX,EDX,EAX, 7,19);
+ FF(EAX,EBX,ECX,EDX, 8, 3);
+ FF(EDX,EAX,EBX,ECX, 9, 7);
+ FF(ECX,EDX,EAX,EBX,10,11);
+ FF(EBX,ECX,EDX,EAX,11,19);
+ FF(EAX,EBX,ECX,EDX,12, 3);
+ FF(EDX,EAX,EBX,ECX,13, 7);
+ FF(ECX,EDX,EAX,EBX,14,11);
+ FF(EBX,ECX,EDX,EAX,15,19);
+
+ GG(EAX,EBX,ECX,EDX, 0, 3);
+ GG(EDX,EAX,EBX,ECX, 4, 5);
+ GG(ECX,EDX,EAX,EBX, 8, 9);
+ GG(EBX,ECX,EDX,EAX,12,13);
+ GG(EAX,EBX,ECX,EDX, 1, 3);
+ GG(EDX,EAX,EBX,ECX, 5, 5);
+ GG(ECX,EDX,EAX,EBX, 9, 9);
+ GG(EBX,ECX,EDX,EAX,13,13);
+ GG(EAX,EBX,ECX,EDX, 2, 3);
+ GG(EDX,EAX,EBX,ECX, 6, 5);
+ GG(ECX,EDX,EAX,EBX,10, 9);
+ GG(EBX,ECX,EDX,EAX,14,13);
+ GG(EAX,EBX,ECX,EDX, 3, 3);
+ GG(EDX,EAX,EBX,ECX, 7, 5);
+ GG(ECX,EDX,EAX,EBX,11, 9);
+ GG(EBX,ECX,EDX,EAX,15,13);
+
+ HH(EAX,EBX,ECX,EDX, 0, 3);
+ HH(EDX,EAX,EBX,ECX, 8, 9);
+ HH(ECX,EDX,EAX,EBX, 4,11);
+ HH(EBX,ECX,EDX,EAX,12,15);
+ HH(EAX,EBX,ECX,EDX, 2, 3);
+ HH(EDX,EAX,EBX,ECX,10, 9);
+ HH(ECX,EDX,EAX,EBX, 6,11);
+ HH(EBX,ECX,EDX,EAX,14,15);
+ HH(EAX,EBX,ECX,EDX, 1, 3);
+ HH(EDX,EAX,EBX,ECX, 9, 9);
+ HH(ECX,EDX,EAX,EBX, 5,11);
+ HH(EBX,ECX,EDX,EAX,13,15);
+ HH(EAX,EBX,ECX,EDX, 3, 3);
+ HH(EDX,EAX,EBX,ECX,11, 9);
+ HH(ECX,EDX,EAX,EBX, 7,11);
+ HH(EBX,ECX,EDX,EAX,15,15);
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EAX)
+ ADD(ARRAY4(EBP, 1), EBX)
+ ADD(ARRAY4(EBP, 2), ECX)
+ ADD(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+END_FUNCTION(md4_core)
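[Editorial note: for reference, the boolean functions the FF/GG/HH macros above implement (RFC 1320). Each round computes A = rotl(A + f(B,C,D) + M[N] + K, S); FF uses the select identity ((C ^ D) & B) ^ D == (B & C) | (~B & D) to save an operation.]

#include <stdint.h>

static inline uint32_t md4_F(uint32_t x, uint32_t y, uint32_t z)
   { return (x & y) | (~x & z); }           // round 1, K = 0
static inline uint32_t md4_G(uint32_t x, uint32_t y, uint32_t z)
   { return (x & y) | (x & z) | (y & z); }  // round 2, K = 0x5A827999
static inline uint32_t md4_H(uint32_t x, uint32_t y, uint32_t z)
   { return x ^ y ^ z; }                    // round 3, K = 0x6ED9EBA1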
diff --git a/modules/asm/asm_ia32/md5.cpp b/modules/asm/asm_ia32/md5.cpp
new file mode 100644
index 000000000..cfe48e7e9
--- /dev/null
+++ b/modules/asm/asm_ia32/md5.cpp
@@ -0,0 +1,43 @@
+/*************************************************
+* MD5 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/md5.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void md5_core(u32bit[4], const byte[64], u32bit[16]);
+
+/*************************************************
+* MD5 Compression Function *
+*************************************************/
+void MD5::hash(const byte input[])
+ {
+ md5_core(digest, input, M);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void MD5::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(3 - (j % 4), digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void MD5::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ M.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ }
+
+}
diff --git a/modules/asm/asm_ia32/md5core.S b/modules/asm/asm_ia32/md5core.S
new file mode 100644
index 000000000..8ebe469f3
--- /dev/null
+++ b/modules/asm/asm_ia32/md5core.S
@@ -0,0 +1,164 @@
+/*************************************************
+* MD5 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(md5core.S)
+
+START_FUNCTION(md5_core)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(2)) /* input block */
+ ASSIGN(EDI, ARG(3)) /* expanded words */
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ADD_IMM(EBP, 16)
+
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ASSIGN(EBP, ARG(1))
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+#define MSG EDI
+#define T1 ESI
+#define T2 EBP
+
+#define FF(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, C) ; \
+ XOR(T2, D) ; \
+ AND(T2, B) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define GG(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ AND(T2, D) ; \
+ XOR(T2, C) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define HH(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define II(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, D) ; \
+ NOT(T2) ; \
+ OR(T2, B) ; \
+ XOR(T2, C) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+ FF(EAX,EBX,ECX,EDX, 0, 7,0xD76AA478);
+ FF(EDX,EAX,EBX,ECX, 1,12,0xE8C7B756);
+ FF(ECX,EDX,EAX,EBX, 2,17,0x242070DB);
+ FF(EBX,ECX,EDX,EAX, 3,22,0xC1BDCEEE);
+ FF(EAX,EBX,ECX,EDX, 4, 7,0xF57C0FAF);
+ FF(EDX,EAX,EBX,ECX, 5,12,0x4787C62A);
+ FF(ECX,EDX,EAX,EBX, 6,17,0xA8304613);
+ FF(EBX,ECX,EDX,EAX, 7,22,0xFD469501);
+ FF(EAX,EBX,ECX,EDX, 8, 7,0x698098D8);
+ FF(EDX,EAX,EBX,ECX, 9,12,0x8B44F7AF);
+ FF(ECX,EDX,EAX,EBX,10,17,0xFFFF5BB1);
+ FF(EBX,ECX,EDX,EAX,11,22,0x895CD7BE);
+ FF(EAX,EBX,ECX,EDX,12, 7,0x6B901122);
+ FF(EDX,EAX,EBX,ECX,13,12,0xFD987193);
+ FF(ECX,EDX,EAX,EBX,14,17,0xA679438E);
+ FF(EBX,ECX,EDX,EAX,15,22,0x49B40821);
+
+ GG(EAX,EBX,ECX,EDX, 1, 5,0xF61E2562);
+ GG(EDX,EAX,EBX,ECX, 6, 9,0xC040B340);
+ GG(ECX,EDX,EAX,EBX,11,14,0x265E5A51);
+ GG(EBX,ECX,EDX,EAX, 0,20,0xE9B6C7AA);
+ GG(EAX,EBX,ECX,EDX, 5, 5,0xD62F105D);
+ GG(EDX,EAX,EBX,ECX,10, 9,0x02441453);
+ GG(ECX,EDX,EAX,EBX,15,14,0xD8A1E681);
+ GG(EBX,ECX,EDX,EAX, 4,20,0xE7D3FBC8);
+ GG(EAX,EBX,ECX,EDX, 9, 5,0x21E1CDE6);
+ GG(EDX,EAX,EBX,ECX,14, 9,0xC33707D6);
+ GG(ECX,EDX,EAX,EBX, 3,14,0xF4D50D87);
+ GG(EBX,ECX,EDX,EAX, 8,20,0x455A14ED);
+ GG(EAX,EBX,ECX,EDX,13, 5,0xA9E3E905);
+ GG(EDX,EAX,EBX,ECX, 2, 9,0xFCEFA3F8);
+ GG(ECX,EDX,EAX,EBX, 7,14,0x676F02D9);
+ GG(EBX,ECX,EDX,EAX,12,20,0x8D2A4C8A);
+
+ HH(EAX,EBX,ECX,EDX, 5, 4,0xFFFA3942);
+ HH(EDX,EAX,EBX,ECX, 8,11,0x8771F681);
+ HH(ECX,EDX,EAX,EBX,11,16,0x6D9D6122);
+ HH(EBX,ECX,EDX,EAX,14,23,0xFDE5380C);
+ HH(EAX,EBX,ECX,EDX, 1, 4,0xA4BEEA44);
+ HH(EDX,EAX,EBX,ECX, 4,11,0x4BDECFA9);
+ HH(ECX,EDX,EAX,EBX, 7,16,0xF6BB4B60);
+ HH(EBX,ECX,EDX,EAX,10,23,0xBEBFBC70);
+ HH(EAX,EBX,ECX,EDX,13, 4,0x289B7EC6);
+ HH(EDX,EAX,EBX,ECX, 0,11,0xEAA127FA);
+ HH(ECX,EDX,EAX,EBX, 3,16,0xD4EF3085);
+ HH(EBX,ECX,EDX,EAX, 6,23,0x04881D05);
+ HH(EAX,EBX,ECX,EDX, 9, 4,0xD9D4D039);
+ HH(EDX,EAX,EBX,ECX,12,11,0xE6DB99E5);
+ HH(ECX,EDX,EAX,EBX,15,16,0x1FA27CF8);
+ HH(EBX,ECX,EDX,EAX, 2,23,0xC4AC5665);
+
+ II(EAX,EBX,ECX,EDX, 0, 6,0xF4292244);
+ II(EDX,EAX,EBX,ECX, 7,10,0x432AFF97);
+ II(ECX,EDX,EAX,EBX,14,15,0xAB9423A7);
+ II(EBX,ECX,EDX,EAX, 5,21,0xFC93A039);
+ II(EAX,EBX,ECX,EDX,12, 6,0x655B59C3);
+ II(EDX,EAX,EBX,ECX, 3,10,0x8F0CCC92);
+ II(ECX,EDX,EAX,EBX,10,15,0xFFEFF47D);
+ II(EBX,ECX,EDX,EAX, 1,21,0x85845DD1);
+ II(EAX,EBX,ECX,EDX, 8, 6,0x6FA87E4F);
+ II(EDX,EAX,EBX,ECX,15,10,0xFE2CE6E0);
+ II(ECX,EDX,EAX,EBX, 6,15,0xA3014314);
+ II(EBX,ECX,EDX,EAX,13,21,0x4E0811A1);
+ II(EAX,EBX,ECX,EDX, 4, 6,0xF7537E82);
+ II(EDX,EAX,EBX,ECX,11,10,0xBD3AF235);
+ II(ECX,EDX,EAX,EBX, 2,15,0x2AD7D2BB);
+ II(EBX,ECX,EDX,EAX, 9,21,0xEB86D391);
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EAX)
+ ADD(ARRAY4(EBP, 1), EBX)
+ ADD(ARRAY4(EBP, 2), ECX)
+ ADD(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+END_FUNCTION(md5_core)
diff --git a/modules/asm/asm_ia32/modinfo.txt b/modules/asm/asm_ia32/modinfo.txt
new file mode 100644
index 000000000..12c8cd96d
--- /dev/null
+++ b/modules/asm/asm_ia32/modinfo.txt
@@ -0,0 +1,43 @@
+realname "x86 Assembler"
+
+#mp_bits 32
+
+load_on asm_ok
+
+<replace>
+md4.cpp
+md5.cpp
+sha160.cpp
+serpent.cpp
+</replace>
+
+<ignore>
+#mp_mulop.cpp
+</ignore>
+
+<add>
+asm_macr.h
+md4core.S
+md5core.S
+sha1_asm.S
+serp_asm.S
+#mp_mulop.S
+</add>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
diff --git a/modules/asm/asm_ia32/mp_mulop.S b/modules/asm/asm_ia32/mp_mulop.S
new file mode 100644
index 000000000..a5f0d3b27
--- /dev/null
+++ b/modules/asm/asm_ia32/mp_mulop.S
@@ -0,0 +1,62 @@
+/*************************************************
+* Multiply/Add Algorithm Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop.S)
+
+START_FUNCTION(bigint_mul_add_words)
+ SPILL_REGS()
+#define PUSHED 4
+
+#define LOOP_CTR ESI
+ ASSIGN(LOOP_CTR, ARG(3)) /* x_size */
+ ZEROIZE(EDI)
+
+ ASSIGN(ECX, ARG(1)) /* z[] */
+ ASSIGN(EBX, ARG(2)) /* x[] */
+ ASSIGN(EBP, ARG(4)) /* y */
+
+#define MULADD_OP(N) \
+ ASSIGN(EAX, ARRAY4(EBX, N)) ; \
+ MUL(EBP) ; \
+ ADD_W_CARRY(EAX, EDX, EDI) ; \
+ ASSIGN(EDI, EDX) ; \
+ ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ;
+
+ JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+ JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP)
+
+START_LOOP(.MULADD8)
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(EBX, 32)
+ ADD_IMM(ECX, 32)
+LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8)
+
+ JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+
+START_LOOP(.MULADD1)
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(EBX, 4)
+ ADD_IMM(ECX, 4)
+LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1)
+
+.MUL_ADD_DONE:
+
+ ASSIGN(EAX, EDI)
+#undef PUSHED
+ RESTORE_REGS()
+END_FUNCTION(bigint_mul_add_words)
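[Editorial note: the unrolled loops above reduce to a simple word-by-word multiply-accumulate. A reference for the function's semantics, with word/dword sized for the 32-bit build; the returned carry corresponds to the value left in EDI and copied to EAX.]

#include <stdint.h>

typedef uint32_t word;
typedef uint64_t dword;

// z[i] += x[i] * y across x_size words; returns the final carry
word bigint_mul_add_words_ref(word z[], const word x[],
                              uint32_t x_size, word y)
   {
   word carry = 0;
   for(uint32_t i = 0; i != x_size; ++i)
      {
      dword t = (dword)x[i] * y + z[i] + carry;
      z[i]  = (word)t;
      carry = (word)(t >> 32);
      }
   return carry;
   }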
diff --git a/modules/asm/asm_ia32/serp_asm.S b/modules/asm/asm_ia32/serp_asm.S
new file mode 100644
index 000000000..c8915382d
--- /dev/null
+++ b/modules/asm/asm_ia32/serp_asm.S
@@ -0,0 +1,667 @@
+/*************************************************
+* Serpent Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(serp_asm.S)
+
+#define SBOX_E1(A, B, C, D, T) \
+ XOR(D, A) ; \
+ ASSIGN(T, B) ; \
+ AND(B, D) ; \
+ XOR(T, C) ; \
+ XOR(B, A) ; \
+ OR(A, D) ; \
+ XOR(A, T) ; \
+ XOR(T, D) ; \
+ XOR(D, C) ; \
+ OR(C, B) ; \
+ XOR(C, T) ; \
+ NOT(T) ; \
+ OR(T, B) ; \
+ XOR(B, D) ; \
+ XOR(B, T) ; \
+ OR(D, A) ; \
+ XOR(B, D) ; \
+ XOR(T, D) ; \
+ ASSIGN(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E2(A, B, C, D, T) \
+ NOT(A) ; \
+ NOT(C) ; \
+ ASSIGN(T, A) ; \
+ AND(A, B) ; \
+ XOR(C, A) ; \
+ OR(A, D) ; \
+ XOR(D, C) ; \
+ XOR(B, A) ; \
+ XOR(A, T) ; \
+ OR(T, B) ; \
+ XOR(B, D) ; \
+ OR(C, A) ; \
+ AND(C, T) ; \
+ XOR(A, B) ; \
+ AND(B, C) ; \
+ XOR(B, A) ; \
+ AND(A, C) ; \
+ XOR(T, A) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E3(A, B, C, D, T) \
+ ASSIGN(T, A) ; \
+ AND(A, C) ; \
+ XOR(A, D) ; \
+ XOR(C, B) ; \
+ XOR(C, A) ; \
+ OR(D, T) ; \
+ XOR(D, B) ; \
+ XOR(T, C) ; \
+ ASSIGN(B, D) ; \
+ OR(D, T) ; \
+ XOR(D, A) ; \
+ AND(A, B) ; \
+ XOR(T, A) ; \
+ XOR(B, D) ; \
+ XOR(B, T) ; \
+ NOT(T) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E4(A, B, C, D, T) \
+ ASSIGN(T, A) ; \
+ OR(A, D) ; \
+ XOR(D, B) ; \
+ AND(B, T) ; \
+ XOR(T, C) ; \
+ XOR(C, D) ; \
+ AND(D, A) ; \
+ OR(T, B) ; \
+ XOR(D, T) ; \
+ XOR(A, B) ; \
+ AND(T, A) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ OR(B, A) ; \
+ XOR(B, C) ; \
+ XOR(A, D) ; \
+ ASSIGN(C, B) ; \
+ OR(B, D) ; \
+ XOR(B, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E5(A, B, C, D, T) \
+ XOR(B, D) ; \
+ NOT(D) ; \
+ XOR(C, D) ; \
+ XOR(D, A) ; \
+ ASSIGN(T, B) ; \
+ AND(B, D) ; \
+ XOR(B, C) ; \
+ XOR(T, D) ; \
+ XOR(A, T) ; \
+ AND(C, T) ; \
+ XOR(C, A) ; \
+ AND(A, B) ; \
+ XOR(D, A) ; \
+ OR(T, B) ; \
+ XOR(T, A) ; \
+ OR(A, D) ; \
+ XOR(A, C) ; \
+ AND(C, D) ; \
+ NOT(A) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E6(A, B, C, D, T) \
+ XOR(A, B) ; \
+ XOR(B, D) ; \
+ NOT(D) ; \
+ ASSIGN(T, B) ; \
+ AND(B, A) ; \
+ XOR(C, D) ; \
+ XOR(B, C) ; \
+ OR(C, T) ; \
+ XOR(T, D) ; \
+ AND(D, B) ; \
+ XOR(D, A) ; \
+ XOR(T, B) ; \
+ XOR(T, C) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ NOT(C) ; \
+ XOR(A, T) ; \
+ OR(T, D) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E7(A, B, C, D, T) \
+ NOT(C) ; \
+ ASSIGN(T, D) ; \
+ AND(D, A) ; \
+ XOR(A, T) ; \
+ XOR(D, C) ; \
+ OR(C, T) ; \
+ XOR(B, D) ; \
+ XOR(C, A) ; \
+ OR(A, B) ; \
+ XOR(C, B) ; \
+ XOR(T, A) ; \
+ OR(A, D) ; \
+ XOR(A, C) ; \
+ XOR(T, D) ; \
+ XOR(T, A) ; \
+ NOT(D) ; \
+ AND(C, T) ; \
+ XOR(C, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_E8(A, B, C, D, T) \
+ ASSIGN(T, B) ; \
+ OR(B, C) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ XOR(C, B) ; \
+ OR(D, T) ; \
+ AND(D, A) ; \
+ XOR(T, C) ; \
+ XOR(D, B) ; \
+ OR(B, T) ; \
+ XOR(B, A) ; \
+ OR(A, T) ; \
+ XOR(A, C) ; \
+ XOR(B, T) ; \
+ XOR(C, B) ; \
+ AND(B, A) ; \
+ XOR(B, T) ; \
+ NOT(C) ; \
+ OR(C, A) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, A) ; \
+ ASSIGN(A, T) ;
+
+#define SBOX_D1(A, B, C, D, T) \
+ NOT(C) ; \
+ ASSIGN(T, B) ; \
+ OR(B, A) ; \
+ NOT(T) ; \
+ XOR(B, C) ; \
+ OR(C, T) ; \
+ XOR(B, D) ; \
+ XOR(A, T) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ XOR(T, A) ; \
+ OR(A, B) ; \
+ XOR(A, C) ; \
+ XOR(D, T) ; \
+ XOR(C, B) ; \
+ XOR(D, A) ; \
+ XOR(D, B) ; \
+ AND(C, D) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_D2(A, B, C, D, T) \
+ ASSIGN(T, B) ; \
+ XOR(B, D) ; \
+ AND(D, B) ; \
+ XOR(T, C) ; \
+ XOR(D, A) ; \
+ OR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, T) ; \
+ OR(A, C) ; \
+ XOR(B, D) ; \
+ XOR(A, B) ; \
+ OR(B, D) ; \
+ XOR(B, A) ; \
+ NOT(T) ; \
+ XOR(T, B) ; \
+ OR(B, A) ; \
+ XOR(B, A) ; \
+ OR(B, T) ; \
+ XOR(D, B) ; \
+ ASSIGN(B, A) ; \
+ ASSIGN(A, T) ; \
+ ASSIGN(T, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D3(A, B, C, D, T) \
+ XOR(C, D) ; \
+ XOR(D, A) ; \
+ ASSIGN(T, D) ; \
+ AND(D, C) ; \
+ XOR(D, B) ; \
+ OR(B, C) ; \
+ XOR(B, T) ; \
+ AND(T, D) ; \
+ XOR(C, D) ; \
+ AND(T, A) ; \
+ XOR(T, C) ; \
+ AND(C, B) ; \
+ OR(C, A) ; \
+ NOT(D) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ AND(A, B) ; \
+ XOR(D, T) ; \
+ XOR(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_D4(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ XOR(C, B) ; \
+ XOR(A, C) ; \
+ AND(T, C) ; \
+ XOR(T, A) ; \
+ AND(A, B) ; \
+ XOR(B, D) ; \
+ OR(D, T) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(B, T) ; \
+ AND(D, C) ; \
+ XOR(D, B) ; \
+ XOR(B, A) ; \
+ OR(B, C) ; \
+ XOR(A, D) ; \
+ XOR(B, T) ; \
+ XOR(A, B) ; \
+ ASSIGN(T, A) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_D5(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ AND(C, D) ; \
+ XOR(C, B) ; \
+ OR(B, D) ; \
+ AND(B, A) ; \
+ XOR(T, C) ; \
+ XOR(T, B) ; \
+ AND(B, C) ; \
+ NOT(A) ; \
+ XOR(D, T) ; \
+ XOR(B, D) ; \
+ AND(D, A) ; \
+ XOR(D, C) ; \
+ XOR(A, B) ; \
+ AND(C, A) ; \
+ XOR(D, A) ; \
+ XOR(C, T) ; \
+ OR(C, D) ; \
+ XOR(D, A) ; \
+ XOR(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_D6(A, B, C, D, T) \
+ NOT(B) ; \
+ ASSIGN(T, D) ; \
+ XOR(C, B) ; \
+ OR(D, A) ; \
+ XOR(D, C) ; \
+ OR(C, B) ; \
+ AND(C, A) ; \
+ XOR(T, D) ; \
+ XOR(C, T) ; \
+ OR(T, A) ; \
+ XOR(T, B) ; \
+ AND(B, C) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ AND(D, T) ; \
+ XOR(T, B) ; \
+ XOR(D, T) ; \
+ NOT(T) ; \
+ XOR(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ; \
+ ASSIGN(T, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D7(A, B, C, D, T) \
+ XOR(A, C) ; \
+ ASSIGN(T, C) ; \
+ AND(C, A) ; \
+ XOR(T, D) ; \
+ NOT(C) ; \
+ XOR(D, B) ; \
+ XOR(C, D) ; \
+ OR(T, A) ; \
+ XOR(A, C) ; \
+ XOR(D, T) ; \
+ XOR(T, B) ; \
+ AND(B, D) ; \
+ XOR(B, A) ; \
+ XOR(A, D) ; \
+ OR(A, C) ; \
+ XOR(D, B) ; \
+ XOR(T, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D8(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ OR(T, D) ; \
+ NOT(C) ; \
+ XOR(D, B) ; \
+ OR(B, A) ; \
+ XOR(A, C) ; \
+ AND(C, T) ; \
+ AND(D, T) ; \
+ XOR(B, C) ; \
+ XOR(C, A) ; \
+ OR(A, C) ; \
+ XOR(T, B) ; \
+ XOR(A, D) ; \
+ XOR(D, T) ; \
+ OR(T, A) ; \
+ XOR(D, C) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, A) ; \
+ ASSIGN(A, D) ; \
+ ASSIGN(D, T) ;
+
+#define TRANSFORM(A, B, C, D, T) \
+ ROTL_IMM(A, 13) ; \
+ ROTL_IMM(C, 3) ; \
+ SHL2_3(T, A) ; \
+ XOR(B, A) ; \
+ XOR(D, C) ; \
+ XOR(B, C) ; \
+ XOR(D, T) ; \
+ ROTL_IMM(B, 1) ; \
+ ROTL_IMM(D, 7) ; \
+ ASSIGN(T, B) ; \
+ SHL_IMM(T, 7) ; \
+ XOR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(C, T) ; \
+ ROTL_IMM(A, 5) ; \
+ ROTL_IMM(C, 22) ;
+
+#define I_TRANSFORM(A, B, C, D, T) \
+ ROTR_IMM(C, 22) ; \
+ ROTR_IMM(A, 5) ; \
+ ASSIGN(T, B) ; \
+ SHL_IMM(T, 7) ; \
+ XOR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(C, T) ; \
+ ROTR_IMM(D, 7) ; \
+ ROTR_IMM(B, 1) ; \
+ SHL2_3(T, A) ; \
+ XOR(B, C) ; \
+ XOR(D, C) ; \
+ XOR(B, A) ; \
+ XOR(D, T) ; \
+ ROTR_IMM(C, 3) ; \
+ ROTR_IMM(A, 13) ;
+
+#define KEY_XOR(A, B, C, D, N) \
+ XOR(A, ARRAY4(EDI, (4*N ))) ; \
+ XOR(B, ARRAY4(EDI, (4*N+1))) ; \
+ XOR(C, ARRAY4(EDI, (4*N+2))) ; \
+ XOR(D, ARRAY4(EDI, (4*N+3))) ;
+
+/*************************************************
+* Serpent Encryption *
+*************************************************/
+START_FUNCTION(serpent_encrypt)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(1)) /* input block */
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ASSIGN(EDI, ARG(3)) /* round keys */
+ ZEROIZE(EBP)
+
+#define E_ROUND(A, B, C, D, T, N, SBOX) \
+ KEY_XOR(A, B, C, D, N) \
+ SBOX(A, B, C, D, T) \
+ TRANSFORM(A, B, C, D, T)
+
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 0, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_E7)
+
+ KEY_XOR(EAX, EBX, ECX, EDX, 31)
+ SBOX_E8(EAX, EBX, ECX, EDX, EBP)
+ KEY_XOR(EAX, EBX, ECX, EDX, 32)
+
+ ASSIGN(EBP, ARG(2)) /* output block */
+ ASSIGN(ARRAY4(EBP, 0), EAX)
+ ASSIGN(ARRAY4(EBP, 1), EBX)
+ ASSIGN(ARRAY4(EBP, 2), ECX)
+ ASSIGN(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_encrypt)
+
+/*************************************************
+* Serpent Decryption *
+*************************************************/
+START_FUNCTION(serpent_decrypt)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(1)) /* input block */
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ASSIGN(EDI, ARG(3)) /* round keys */
+
+ ZEROIZE(EBP)
+
+#define D_ROUND(A, B, C, D, T, N, SBOX) \
+ I_TRANSFORM(A, B, C, D, T) \
+ SBOX(A, B, C, D, T) \
+   KEY_XOR(A, B, C, D, N)
+
+ KEY_XOR(EAX, EBX, ECX, EDX, 32)
+ SBOX_D8(EAX, EBX, ECX, EDX, EBP)
+ KEY_XOR(EAX, EBX, ECX, EDX, 31)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 0, SBOX_D1)
+
+ ASSIGN(EBP, ARG(2)) /* output block */
+ ASSIGN(ARRAY4(EBP, 0), EAX)
+ ASSIGN(ARRAY4(EBP, 1), EBX)
+ ASSIGN(ARRAY4(EBP, 2), ECX)
+ ASSIGN(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_decrypt)
+
+/*************************************************
+* Serpent Key Schedule *
+*************************************************/
+START_FUNCTION(serpent_key_schedule)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EDI, ARG(1)) /* round keys */
+ ASSIGN(ESI, IMM(8))
+ ADD_IMM(EDI, 32)
+
+START_LOOP(.EXPANSION)
+ ASSIGN(EAX, ARRAY4(EDI, -1))
+ ASSIGN(EBX, ARRAY4(EDI, -3))
+ ASSIGN(ECX, ARRAY4(EDI, -5))
+ ASSIGN(EDX, ARRAY4(EDI, -8))
+
+ ASSIGN(EBP, ESI)
+ SUB_IMM(EBP, 8)
+ XOR(EBP, IMM(0x9E3779B9))
+ XOR(EAX, EBX)
+ XOR(ECX, EDX)
+ XOR(EAX, EBP)
+ XOR(EAX, ECX)
+
+ ROTL_IMM(EAX, 11)
+
+ ASSIGN(ARRAY4(EDI, 0), EAX)
+
+ ADD_IMM(ESI, 1)
+ ADD_IMM(EDI, 4)
+LOOP_UNTIL_EQ(ESI, 140, .EXPANSION)
+
+ ASSIGN(EDI, ARG(1)) /* round keys */
+
+#define LOAD_AND_SBOX(MSG, SBOX) \
+ ASSIGN(EAX, ARRAY4(EDI, (4*MSG+ 8))) ; \
+ ASSIGN(EBX, ARRAY4(EDI, (4*MSG+ 9))) ; \
+ ASSIGN(ECX, ARRAY4(EDI, (4*MSG+10))) ; \
+ ASSIGN(EDX, ARRAY4(EDI, (4*MSG+11))) ; \
+ SBOX(EAX, EBX, ECX, EDX, EBP) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+ 8)), EAX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+ 9)), EBX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+10)), ECX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+11)), EDX)
+
+ LOAD_AND_SBOX( 0, SBOX_E4)
+ LOAD_AND_SBOX( 1, SBOX_E3)
+ LOAD_AND_SBOX( 2, SBOX_E2)
+ LOAD_AND_SBOX( 3, SBOX_E1)
+
+ LOAD_AND_SBOX( 4, SBOX_E8)
+ LOAD_AND_SBOX( 5, SBOX_E7)
+ LOAD_AND_SBOX( 6, SBOX_E6)
+ LOAD_AND_SBOX( 7, SBOX_E5)
+ LOAD_AND_SBOX( 8, SBOX_E4)
+ LOAD_AND_SBOX( 9, SBOX_E3)
+ LOAD_AND_SBOX(10, SBOX_E2)
+ LOAD_AND_SBOX(11, SBOX_E1)
+
+ LOAD_AND_SBOX(12, SBOX_E8)
+ LOAD_AND_SBOX(13, SBOX_E7)
+ LOAD_AND_SBOX(14, SBOX_E6)
+ LOAD_AND_SBOX(15, SBOX_E5)
+ LOAD_AND_SBOX(16, SBOX_E4)
+ LOAD_AND_SBOX(17, SBOX_E3)
+ LOAD_AND_SBOX(18, SBOX_E2)
+ LOAD_AND_SBOX(19, SBOX_E1)
+
+ LOAD_AND_SBOX(20, SBOX_E8)
+ LOAD_AND_SBOX(21, SBOX_E7)
+ LOAD_AND_SBOX(22, SBOX_E6)
+ LOAD_AND_SBOX(23, SBOX_E5)
+ LOAD_AND_SBOX(24, SBOX_E4)
+ LOAD_AND_SBOX(25, SBOX_E3)
+ LOAD_AND_SBOX(26, SBOX_E2)
+ LOAD_AND_SBOX(27, SBOX_E1)
+
+ LOAD_AND_SBOX(28, SBOX_E8)
+ LOAD_AND_SBOX(29, SBOX_E7)
+ LOAD_AND_SBOX(30, SBOX_E6)
+ LOAD_AND_SBOX(31, SBOX_E5)
+ LOAD_AND_SBOX(32, SBOX_E4)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_key_schedule)
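[Editorial note: the .EXPANSION loop above is Serpent's prekey recurrence. A reference sketch: w[0..7] holds the padded 256-bit key, and the counter XORed into each word is ESI-8 in the asm. The S-box pass (E4, E3, E2, E1, E8, ... as in LOAD_AND_SBOX) then turns the prekeys into round keys.]

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
   { return (x << n) | (x >> (32 - n)); }

// w[8..139] become the 132 prekey words consumed by the S-box pass
void serpent_prekeys_ref(uint32_t w[140])
   {
   const uint32_t PHI = 0x9E3779B9;
   for(uint32_t i = 8; i != 140; ++i)
      w[i] = rotl32(w[i-8] ^ w[i-5] ^ w[i-3] ^ w[i-1] ^ PHI ^ (i - 8), 11);
   }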
diff --git a/modules/asm/asm_ia32/serpent.cpp b/modules/asm/asm_ia32/serpent.cpp
new file mode 100644
index 000000000..aacb72b0f
--- /dev/null
+++ b/modules/asm/asm_ia32/serpent.cpp
@@ -0,0 +1,49 @@
+/*************************************************
+* Serpent Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/serpent.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" {
+
+void serpent_encrypt(const byte[16], byte[16], const u32bit[132]);
+void serpent_decrypt(const byte[16], byte[16], const u32bit[132]);
+void serpent_key_schedule(u32bit[140]);
+
+}
+
+/*************************************************
+* Serpent Encryption *
+*************************************************/
+void Serpent::enc(const byte in[], byte out[]) const
+ {
+ serpent_encrypt(in, out, round_key);
+ }
+
+/*************************************************
+* Serpent Decryption *
+*************************************************/
+void Serpent::dec(const byte in[], byte out[]) const
+ {
+ serpent_decrypt(in, out, round_key);
+ }
+
+/*************************************************
+* Serpent Key Schedule *
+*************************************************/
+void Serpent::key(const byte key[], u32bit length)
+ {
+ SecureBuffer<u32bit, 140> W;
+ for(u32bit j = 0; j != length / 4; ++j)
+ W[j] = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]);
+ W[length / 4] |= u32bit(1) << ((length%4)*8);
+
+ serpent_key_schedule(W);
+ round_key.copy(W + 8, 132);
+ }
+
+}
diff --git a/modules/asm/asm_ia32/sha160.cpp b/modules/asm/asm_ia32/sha160.cpp
new file mode 100644
index 000000000..7725541d5
--- /dev/null
+++ b/modules/asm/asm_ia32/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void botan_sha160_asm_ia32(u32bit[5], const byte[64], u32bit[81]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha160_asm_ia32(digest, input, W);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ W.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(81)
+ {
+ clear();
+ }
+
+}
diff --git a/modules/asm/asm_ia32/sha1_asm.S b/modules/asm/asm_ia32/sha1_asm.S
new file mode 100644
index 000000000..85bc9dc2c
--- /dev/null
+++ b/modules/asm/asm_ia32/sha1_asm.S
@@ -0,0 +1,242 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(sha1_asm.S)
+
+START_FUNCTION(botan_sha160_asm_ia32)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EDI, ARG(2))
+ ASSIGN(EBP, ARG(3))
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EDI, 0))
+ ASSIGN(EBX, ARRAY4(EDI, 1))
+ ASSIGN(ECX, ARRAY4(EDI, 2))
+ ASSIGN(EDX, ARRAY4(EDI, 3))
+
+ ADD_IMM(EDI, 16)
+
+ BSWAP(EAX)
+ BSWAP(EBX)
+ BSWAP(ECX)
+ BSWAP(EDX)
+
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ADD2_IMM(EDI, EBP, 64)
+
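+/* Message expansion, four words per iteration:
+   W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
+   W[t+3] depends on W[t], so EDX (which becomes W[t]) is rotated
+   first and XORed into EAX before EAX's own rotate */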
+START_LOOP(.EXPANSION)
+ ADD_IMM(ESI, 4)
+
+ ZEROIZE(EAX)
+ ASSIGN(EBX, ARRAY4(EDI, -1))
+ ASSIGN(ECX, ARRAY4(EDI, -2))
+ ASSIGN(EDX, ARRAY4(EDI, -3))
+
+ XOR(EAX, ARRAY4(EDI, -5))
+ XOR(EBX, ARRAY4(EDI, -6))
+ XOR(ECX, ARRAY4(EDI, -7))
+ XOR(EDX, ARRAY4(EDI, -8))
+
+ XOR(EAX, ARRAY4(EDI, -11))
+ XOR(EBX, ARRAY4(EDI, -12))
+ XOR(ECX, ARRAY4(EDI, -13))
+ XOR(EDX, ARRAY4(EDI, -14))
+
+ XOR(EAX, ARRAY4(EDI, -13))
+ XOR(EBX, ARRAY4(EDI, -14))
+ XOR(ECX, ARRAY4(EDI, -15))
+ XOR(EDX, ARRAY4(EDI, -16))
+
+ ROTL_IMM(EDX, 1)
+ ROTL_IMM(ECX, 1)
+ ROTL_IMM(EBX, 1)
+ XOR(EAX, EDX)
+ ROTL_IMM(EAX, 1)
+
+ ASSIGN(ARRAY4(EDI, 0), EDX)
+ ASSIGN(ARRAY4(EDI, 1), ECX)
+ ASSIGN(ARRAY4(EDI, 2), EBX)
+ ASSIGN(ARRAY4(EDI, 3), EAX)
+
+ ADD_IMM(EDI, 16)
+LOOP_UNTIL_EQ(ESI, 80, .EXPANSION)
+
+#define MAGIC1 0x5A827999
+#define MAGIC2 0x6ED9EBA1
+#define MAGIC3 0x8F1BBCDC
+#define MAGIC4 0xCA62C1D6
+
+#define MSG ESP
+#define T2 EBP
+
+#define F1(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, C) ; \
+ XOR(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC1) ; \
+ AND(E, B) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2_4(A, B, C, D, E, F, N, MAGIC) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ XOR(E, C) ; \
+ ADD3_IMM(F, T2, MAGIC) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F3(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ OR(E, C) ; \
+ AND(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC3) ; \
+ ASSIGN(T2, B) ; \
+ AND(T2, C) ; \
+ OR(E, T2) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2(A, B, C, D, E, F, MSG) \
+ F2_4(A, B, C, D, E, F, MSG, MAGIC2)
+
+#define F4(A, B, C, D, E, F, MSG) \
+ F2_4(A, B, C, D, E, F, MSG, MAGIC4)
+
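+/* The round macros never move the five state words between registers;
+   instead the register roles rotate through the argument lists below */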
+ ASSIGN(EAX, ARG(1))
+ ASSIGN(EDI, ARRAY4(EAX, 0))
+ ASSIGN(EBX, ARRAY4(EAX, 1))
+ ASSIGN(ECX, ARRAY4(EAX, 2))
+ ASSIGN(EDX, ARRAY4(EAX, 3))
+ ASSIGN(ESI, ARRAY4(EAX, 4))
+
+ ASSIGN(ARRAY4(EBP, 80), ESP)
+ ASSIGN(ESP, EBP)
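+ /* ESP is stashed in W[80] (hence the 81-word W buffer) and reused
+ as the message pointer MSG until the rounds complete */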
+
+ /* First Round */
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 0)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 1)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 2)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 3)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 4)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 5)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 6)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 7)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 8)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 9)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 10)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 11)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 12)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 13)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 14)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 15)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 16)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 17)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 18)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 19)
+
+ /* Second Round */
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 20)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 21)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 22)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 23)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 24)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 25)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 26)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 27)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 28)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 29)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 30)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 31)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 32)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 33)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 34)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 35)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 36)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 37)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 38)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 39)
+
+ /* Third Round */
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 40)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 41)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 42)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 43)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 44)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 45)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 46)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 47)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 48)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 49)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 50)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 51)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 52)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 53)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 54)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 55)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 56)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 57)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 58)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 59)
+
+ /* Fourth Round */
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 60)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 61)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 62)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 63)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 64)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 65)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 66)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 67)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 68)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 69)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 70)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 71)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 72)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 73)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 74)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 75)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 76)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 77)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 78)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 79)
+
+ ASSIGN(ESP, ARRAY4(ESP, 80))
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EDX)
+ ADD(ARRAY4(EBP, 1), EDI)
+ ADD(ARRAY4(EBP, 2), EAX)
+ ADD(ARRAY4(EBP, 3), EBX)
+ ADD(ARRAY4(EBP, 4), ECX)
+
+ RESTORE_REGS()
+END_FUNCTION(botan_sha160_asm_ia32)
diff --git a/modules/asm/mp_amd64/bswap.h b/modules/asm/mp_amd64/bswap.h
new file mode 100644
index 000000000..3c77b460c
--- /dev/null
+++ b/modules/asm/mp_amd64/bswap.h
@@ -0,0 +1,36 @@
+/*************************************************
+* Byte Swapping Operations Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_BSWAP_H__
+#define BOTAN_BSWAP_H__
+
+#include <botan/types.h>
+#include <botan/rotate.h>
+
+namespace Botan {
+
+/*************************************************
+* Byte Swapping Functions *
+*************************************************/
+inline u16bit reverse_bytes(u16bit input)
+ {
+ return rotate_left(input, 8);
+ }
+
+inline u32bit reverse_bytes(u32bit input)
+ {
+ asm("bswapl %0" : "=r" (input) : "0" (input));
+ return input;
+ }
+
+inline u64bit reverse_bytes(u64bit input)
+ {
+ asm("bswapq %0" : "=r" (input) : "0" (input));
+ return input;
+ }
+
+}
+
+#endif
diff --git a/modules/asm/mp_amd64/modinfo.txt b/modules/asm/mp_amd64/modinfo.txt
new file mode 100644
index 000000000..a042a3976
--- /dev/null
+++ b/modules/asm/mp_amd64/modinfo.txt
@@ -0,0 +1,21 @@
+realname "x86-64 MPI Assembler Core"
+
+mp_bits 64
+
+load_on asm_ok
+
+<replace>
+bswap.h
+mp_asm.h
+mp_asmi.h
+#mp_mulop.cpp
+</replace>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/modules/asm/mp_amd64/mp_asm.h b/modules/asm/mp_amd64/mp_asm.h
new file mode 100644
index 000000000..eca7bae6c
--- /dev/null
+++ b/modules/asm/mp_amd64/mp_asm.h
@@ -0,0 +1,67 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2008 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_H__
+#define BOTAN_MP_ASM_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 64)
+ #error The mp_amd64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for amd64 Assembly *
+*************************************************/
+#define ASM(x) x "\n\t"
+
+/*************************************************
+* Word Multiply *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+ {
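+ /* computes a*b + *c, returning the low word; the high word is left in *c */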
+ asm(
+ ASM("mulq %[b]")
+ ASM("addq %[c],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+ : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+ return a;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ asm(
+ ASM("mulq %[b]")
+
+ ASM("addq %[c],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ ASM("addq %[d],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+ : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+ return a;
+ }
+
+#undef ASM
+
+}
+
+}
+
+#endif
diff --git a/modules/asm/mp_amd64/mp_asmi.h b/modules/asm/mp_amd64/mp_asmi.h
new file mode 100644
index 000000000..16632a38d
--- /dev/null
+++ b/modules/asm/mp_amd64/mp_asmi.h
@@ -0,0 +1,233 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2007 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for amd64 Assembly *
+*************************************************/
+#ifndef ASM
+ #define ASM(x) x "\n\t"
+#endif
+
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]") \
+ ASM("movq %[carry], 8*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+ ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("addq 8*" #INDEX "(%[z]),%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+ ASM("movq %%rax, 8*" #INDEX " (%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) \
+ MACRO(ARG, 0) \
+ MACRO(ARG, 1) \
+ MACRO(ARG, 2) \
+ MACRO(ARG, 3) \
+ MACRO(ARG, 4) \
+ MACRO(ARG, 5) \
+ MACRO(ARG, 6) \
+ MACRO(ARG, 7)
+
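+/* ADD_OR_SUBTRACT bridges a word-sized carry and the CPU carry flag:
+   rorq shifts the carry's low bit into CF ahead of the adc/sbb chain,
+   and sbbq/negq afterwards turn CF back into a 0-or-1 word */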
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorq %[carry]") \
+ CORE_CODE \
+ ASM("sbbq %[carry],%[carry]") \
+ ASM("negq %[carry]")
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument *
+*************************************************/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mulq %[y]")
+
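+ /* after mulq, %[x] (rax) holds the low half of x*y and %[y] (rdx)
+ the high half; the adds below accumulate them into w2:w1:w0 */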
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mulq %[y]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+#undef ASM
+#undef DO_8_TIMES
+#undef ADD_OR_SUBTRACT
+#undef ADDSUB2_OP
+#undef ADDSUB3_OP
+#undef LINMUL_OP
+#undef MULADD_OP
+
+}
+
+}
+#endif
diff --git a/modules/asm/mp_amd64/mp_mulop.cpp b/modules/asm/mp_amd64/mp_mulop.cpp
new file mode 100644
index 000000000..d1aa51489
--- /dev/null
+++ b/modules/asm/mp_amd64/mp_mulop.cpp
@@ -0,0 +1,94 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/mp_asm.h>
+#include <botan/mp_asmi.h>
+#include <botan/mp_core.h>
+#include <botan/mem_ops.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Simple O(N^2) Multiplication *
+*************************************************/
+void bigint_simple_mul(word z[], const word x[], u32bit x_size,
+ const word y[], u32bit y_size)
+ {
+ const u32bit blocks = x_size - (x_size % 8);
+
+ clear_mem(z, x_size + y_size);
+
+ for(u32bit i = 0; i != y_size; ++i)
+ {
+ word carry = 0;
+
+ for(u32bit j = 0; j != blocks; j += 8)
+ carry = word8_madd3(z + i + j, x + j, y[i], carry);
+
+ for(u32bit j = blocks; j != x_size; ++j)
+ z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry);
+
+ z[x_size+i] = carry;
+ }
+ }
+
+/*************************************************
+* Simple O(N^2) Squaring *
+*
+* This is exactly the same algorithm as bigint_simple_mul, but
+* because C/C++ compilers are poor at alias analysis it is good
+* to have a version where the compiler knows that x == y.
+*************************************************/
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+ {
+ const u32bit blocks = x_size - (x_size % 8);
+
+ clear_mem(z, 2*x_size);
+
+ for(u32bit i = 0; i != x_size; ++i)
+ {
+ const word x_i = x[i];
+ word carry = 0;
+
+ for(u32bit j = 0; j != blocks; j += 8)
+ carry = word8_madd3(z + i + j, x + j, x_i, carry);
+
+ for(u32bit j = blocks; j != x_size; ++j)
+ z[i+j] = word_madd3(x[j], x_i, z[i+j], &carry);
+
+ z[x_size+i] = carry;
+ }
+ }
+
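+/*
+An aside on a possible refinement (an illustrative sketch only, not part
+of this module): squaring needs only about half the word multiplies of a
+general multiply, because each cross product x[i]*x[j] with i != j
+contributes twice. A portable version of that idea, reusing word_madd3
+from mp_asm.h, might look like the following; the function name
+bigint_sqr_halved is hypothetical.
+*/
+void bigint_sqr_halved(word z[], const word x[], u32bit x_size)
+ {
+ clear_mem(z, 2*x_size);
+
+ /* accumulate each off-diagonal product x[i]*x[j], i < j, once */
+ for(u32bit i = 0; i != x_size; ++i)
+ {
+ word carry = 0;
+ for(u32bit j = i + 1; j != x_size; ++j)
+ z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+ z[x_size+i] = carry;
+ }
+
+ /* double the cross products by shifting the whole value left one bit */
+ word top = 0;
+ for(u32bit i = 0; i != 2*x_size; ++i)
+ {
+ const word w = z[i];
+ z[i] = (w << 1) | top;
+ top = w >> (BOTAN_MP_WORD_BITS - 1);
+ }
+
+ /* add in the diagonal terms x[i]*x[i]; word_madd3 propagates the
+ carry from each odd word into the next even position */
+ word carry = 0;
+ for(u32bit i = 0; i != x_size; ++i)
+ {
+ z[2*i] = word_madd3(x[i], x[i], z[2*i], &carry);
+ const word sum = z[2*i+1] + carry;
+ carry = (sum < carry);
+ z[2*i+1] = sum;
+ }
+ }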
+}
+
+}
diff --git a/modules/asm/mp_asm64/modinfo.txt b/modules/asm/mp_asm64/modinfo.txt
new file mode 100644
index 000000000..a9e5d53da
--- /dev/null
+++ b/modules/asm/mp_asm64/modinfo.txt
@@ -0,0 +1,25 @@
+realname "64-bit RISC MPI Assembler Core"
+
+mp_bits 64
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+</replace>
+
+<arch>
+alpha
+ia64
+mips64
+ppc64
+sparc64
+</arch>
+
+# The inline asm only works with gcc, but it looks like (at least on
+# UltraSPARC) using 64-bit words and the synthesized multiply is a 5 to 25%
+# win, so it's probably worth using elsewhere.
+<cc>
+gcc
+sunwspro
+</cc>
diff --git a/modules/asm/mp_asm64/mp_asm.h b/modules/asm/mp_asm64/mp_asm.h
new file mode 100644
index 000000000..e455b3616
--- /dev/null
+++ b/modules/asm/mp_asm64/mp_asm.h
@@ -0,0 +1,110 @@
+/*************************************************
+* MPI Multiply-Add Core Header File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_MP_MADD_H__
+#define BOTAN_MP_MADD_H__
+
+#include <botan/mp_types.h>
+
+namespace Botan {
+
+#if (BOTAN_MP_WORD_BITS != 64)
+ #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+#if defined(BOTAN_TARGET_ARCH_IS_ALPHA)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b)); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b)); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc"); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("dmultu %2,%3" : "=h" (z0), "=l" (z1) : "r" (a), "r" (b)); \
+} while(0);
+
+#else
+
+// Do a 64x64->128 multiply using four 64x64->64 multiplies
+// plus some adds and shifts. Last resort for CPUs like UltraSPARC,
+// with 64-bit registers/ALU, but no 64x64->128 multiply.
+inline void bigint_2word_mul(word a, word b, word* z1, word* z0)
+ {
+ const u32bit MP_HWORD_BITS = MP_WORD_BITS / 2;
+ const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1;
+
+ const word a_hi = (a >> MP_HWORD_BITS);
+ const word a_lo = (a & MP_HWORD_MASK);
+ const word b_hi = (b >> MP_HWORD_BITS);
+ const word b_lo = (b & MP_HWORD_MASK);
+
+ word x0 = a_hi * b_hi;
+ word x1 = a_lo * b_hi;
+ word x2 = a_hi * b_lo;
+ word x3 = a_lo * b_lo;
+
+ x2 += x3 >> (MP_HWORD_BITS);
+ x2 += x1;
+ if(x2 < x1)
+ x0 += ((word)1 << MP_HWORD_BITS);
+
+ *z0 = x0 + (x2 >> MP_HWORD_BITS);
+ *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK);
+ }
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0)
+
+#endif
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+ {
+ word z0 = 0, z1 = 0;
+
+ BOTAN_WORD_MUL(a, b, z1, z0);
+
+ z1 += *c; if(z1 < *c) z0++;
+
+ *c = z0;
+ return z1;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ word z0 = 0, z1 = 0;
+
+ BOTAN_WORD_MUL(a, b, z1, z0);
+
+ z1 += c; if(z1 < c) z0++;
+ z1 += *d; if(z1 < *d) z0++;
+
+ *d = z0;
+ return z1;
+ }
+
+}
+
+#endif
diff --git a/modules/asm/mp_ia32/modinfo.txt b/modules/asm/mp_ia32/modinfo.txt
new file mode 100644
index 000000000..cf4959250
--- /dev/null
+++ b/modules/asm/mp_ia32/modinfo.txt
@@ -0,0 +1,19 @@
+realname "x86 MPI Assembler Core"
+
+mp_bits 32
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/modules/asm/mp_ia32/mp_asm.h b/modules/asm/mp_ia32/mp_asm.h
new file mode 100644
index 000000000..b45140321
--- /dev/null
+++ b/modules/asm/mp_ia32/mp_asm.h
@@ -0,0 +1,65 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2008 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_H__
+#define BOTAN_MP_ASM_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 32)
+ #error The mp_ia32 module requires that BOTAN_MP_WORD_BITS == 32
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for x86 Assembly *
+*************************************************/
+#define ASM(x) x "\n\t"
+
+/*************************************************
+* Word Multiply *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+ {
+ asm(
+ ASM("mull %[b]")
+ ASM("addl %[c],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+ : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+ return a;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ asm(
+ ASM("mull %[b]")
+
+ ASM("addl %[c],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ ASM("addl %[d],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+ : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+ return a;
+ }
+
+}
+
+}
+
+#endif
diff --git a/modules/asm/mp_ia32/mp_asmi.h b/modules/asm/mp_ia32/mp_asmi.h
new file mode 100644
index 000000000..9de0c11e3
--- /dev/null
+++ b/modules/asm/mp_ia32/mp_asmi.h
@@ -0,0 +1,225 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2007 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for x86 Assembly *
+*************************************************/
+#ifndef ASM
+ #define ASM(x) x "\n\t"
+#endif
+
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \
+ ASM("movl %[carry], 4*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+ ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("addl 4*" #INDEX "(%[z]),%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+ ASM("movl %%eax, 4*" #INDEX " (%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) \
+ MACRO(ARG, 0) \
+ MACRO(ARG, 1) \
+ MACRO(ARG, 2) \
+ MACRO(ARG, 3) \
+ MACRO(ARG, 4) \
+ MACRO(ARG, 5) \
+ MACRO(ARG, 6) \
+ MACRO(ARG, 7)
+
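+/* As in the amd64 module: rorl moves the carry word's low bit into CF
+   for the adc/sbb chain, and sbbl/negl turn CF back into a 0-or-1 word */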
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorl %[carry]") \
+ CORE_CODE \
+ ASM("sbbl %[carry],%[carry]") \
+ ASM("negl %[carry]")
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument *
+*************************************************/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mull %[y]")
+
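+ /* after mull, %[x] (eax) holds the low half of x*y and %[y] (edx)
+ the high half; the adds below accumulate them into w2:w1:w0 */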
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mull %[y]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+}
+
+}
+
+#endif
diff --git a/modules/asm/mp_ia32_msvc/modinfo.txt b/modules/asm/mp_ia32_msvc/modinfo.txt
new file mode 100644
index 000000000..36d9d0290
--- /dev/null
+++ b/modules/asm/mp_ia32_msvc/modinfo.txt
@@ -0,0 +1,17 @@
+realname "x86 MPI Assembler Core (MSVC)"
+
+mp_bits 32
+
+#load_on asm_ok
+
+<replace>
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+msvc
+</cc>
diff --git a/modules/asm/mp_ia32_msvc/mp_asmi.h b/modules/asm/mp_ia32_msvc/mp_asmi.h
new file mode 100644
index 000000000..5eaa46eb4
--- /dev/null
+++ b/modules/asm/mp_ia32_msvc/mp_asmi.h
@@ -0,0 +1,547 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2006 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include "mp_asm.h"
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ word z = x + y;
+ word c1 = (z < x);
+ z += *carry;
+ *carry = c1 | (z < *carry);
+ return z;
+ }
+
+/*************************************************
+* Four Word Block Carry Addition *
+*************************************************/
+inline word word4_addcarry(word x[4], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff carry==1
+ adc dword ptr [edx],0
+ adc dword ptr [edx+4],0
+ adc dword ptr [edx+8],0
+ adc dword ptr [edx+12],0
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ word t0 = x - y;
+ word c1 = (t0 > x);
+ word z = t0 - *carry;
+ *carry = c1 | (z > t0);
+ return z;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Four Word Block Subtraction, Three Argument *
+*************************************************/
+__forceinline word word8_sub3(word z[8], const word x[8],
+ const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+ sub eax,[carry] //force CF=1 iff carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Four Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ __asm
+ {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi],eax //store result
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+4],eax //store result
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+8],eax //store result
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+12],eax //store result
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+16],eax //store result
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+20],eax //store result
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [esi+24],eax //store result
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [esi+28],eax //store result
+
+ mov eax,edx //return carry
+ }
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+__forceinline word word8_muladd(word z[8], const word x[8],
+ word y, word carry)
+ {
+ __asm
+ {
+ mov esi,[x]
+ mov ebx,[y]
+ mov edi,[z]
+ mov eax,[esi] //load a
+ mul ebx //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ add eax,[edi] //sum lo z
+ adc edx,0 //sum hi z
+ mov ecx,edx //carry for next block = hi z
+ mov [edi],eax //save lo z
+
+ mov eax,[esi+4]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+4]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+4],eax
+
+ mov eax,[esi+8]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+8]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+8],eax
+
+ mov eax,[esi+12]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+12]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+12],eax
+
+ mov eax,[esi+16]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+16]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+16],eax
+
+ mov eax,[esi+20]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+20]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+20],eax
+
+ mov eax,[esi+24]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+24]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+24],eax
+
+ mov eax,[esi+28]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+28]
+ adc edx,0
+ mov [edi+28],eax
+ mov eax,edx
+ }
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+__forceinline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ __asm
+ {
+#if 0
+ //this MMX variant measured slower; kept for reference
+ mov edx,[z]
+ mov eax,[x]
+ movd mm7,[y]
+
+ movd mm0,[eax]
+ movd mm1,[eax+4]
+ movd mm2,[eax+8]
+ pmuludq mm0,mm7
+ pmuludq mm1,mm7
+ pmuludq mm2,mm7
+
+ movd mm6,[carry]
+ paddq mm0,mm6
+ movd [edx],mm0
+
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+4],mm1
+
+ movd mm3,[eax+12]
+ psrlq mm1,32
+ paddq mm2,mm1
+ movd [edx+8],mm2
+
+ pmuludq mm3,mm7
+ movd mm4,[eax+16]
+ psrlq mm2,32
+ paddq mm3,mm2
+ movd [edx+12],mm3
+
+ pmuludq mm4,mm7
+ movd mm5,[eax+20]
+ psrlq mm3,32
+ paddq mm4,mm3
+ movd [edx+16],mm4
+
+ pmuludq mm5,mm7
+ movd mm0,[eax+24]
+ psrlq mm4,32
+ paddq mm5,mm4
+ movd [edx+20],mm5
+
+ pmuludq mm0,mm7
+ movd mm1,[eax+28]
+ psrlq mm5,32
+ paddq mm0,mm5
+ movd [edx+24],mm0
+
+ pmuludq mm1,mm7
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+28],mm1
+
+ psrlq mm1,32
+ movd eax,mm1
+ emms
+#else
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi],eax //store result
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+4],eax //store result
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+8],eax //store result
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+12],eax //store result
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+16],eax //store result
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+20],eax //store result
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+ mov [edi+24],eax //store result
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov [edi+28],eax //store result
+ mov eax,edx //return carry
+#endif
+ }
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ z[0] = word_madd3(x[0], y, z[0], &carry);
+ z[1] = word_madd3(x[1], y, z[1], &carry);
+ z[2] = word_madd3(x[2], y, z[2], &carry);
+ z[3] = word_madd3(x[3], y, z[3], &carry);
+ z[4] = word_madd3(x[4], y, z[4], &carry);
+ z[5] = word_madd3(x[5], y, z[5], &carry);
+ z[6] = word_madd3(x[6], y, z[6], &carry);
+ z[7] = word_madd3(x[7], y, z[7], &carry);
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
+ {
+ dword z = (dword)a * b + (*w0);
+ *w0 = (word)z; //lo
+
+ word t1 = (word)(z >> BOTAN_MP_WORD_BITS); //hi
+ *w1 += t1; //add high word
+ *w2 += (*w1 < t1) ? 1 : 0; //w2+=carry
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
+ {
+ dword z = (dword)a * b;
+ word t0 = (word)z;
+ word t1 = (word)(z >> BOTAN_MP_WORD_BITS);
+
+ *w0 += t0;
+ *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+ *w2 += (*w1 < t1) ? 1 : 0;
+
+ *w0 += t0;
+ *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+ *w2 += (*w1 < t1) ? 1 : 0;
+ }
+
+}
+
+}
+
+#endif
diff --git a/modules/asm/sha1_sse2/modinfo.txt b/modules/asm/sha1_sse2/modinfo.txt
new file mode 100644
index 000000000..e1805260c
--- /dev/null
+++ b/modules/asm/sha1_sse2/modinfo.txt
@@ -0,0 +1,22 @@
+realname "SSE2 implementation of SHA-1"
+
+load_on request
+
+<replace>
+sha160.cpp
+sha160.h
+</replace>
+
+<add>
+sha1core.cpp
+</add>
+
+<arch>
+pentium4
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/modules/asm/sha1_sse2/sha160.cpp b/modules/asm/sha1_sse2/sha160.cpp
new file mode 100644
index 000000000..dfb5fdfe5
--- /dev/null
+++ b/modules/asm/sha1_sse2/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+#include <botan/bit_ops.h>
+
+namespace Botan {
+
+extern "C" void botan_sha1_sse(u32bit[5], const byte[64]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha1_sse(digest, input);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true)
+ {
+ clear();
+ }
+
+}
diff --git a/modules/asm/sha1_sse2/sha160.h b/modules/asm/sha1_sse2/sha160.h
new file mode 100644
index 000000000..c6f8482cf
--- /dev/null
+++ b/modules/asm/sha1_sse2/sha160.h
@@ -0,0 +1,32 @@
+/*************************************************
+* SHA-160 Header File *
+* (C) 1999-2007 The Botan Project *
+*************************************************/
+
+#ifndef BOTAN_SHA_160_H__
+#define BOTAN_SHA_160_H__
+
+#include <botan/mdx_hash.h>
+
+namespace Botan {
+
+/*************************************************
+* SHA-160 *
+*************************************************/
+class SHA_160 : public MDx_HashFunction
+ {
+ public:
+ void clear() throw();
+ std::string name() const { return "SHA-160"; }
+ HashFunction* clone() const { return new SHA_160; }
+ SHA_160();
+ private:
+ void hash(const byte[]);
+ void copy_out(byte[]);
+
+ SecureBuffer<u32bit, 5> digest;
+ };
+
+}
+
+#endif
diff --git a/modules/asm/sha1_sse2/sha1core.cpp b/modules/asm/sha1_sse2/sha1core.cpp
new file mode 100644
index 000000000..23dbfc5e2
--- /dev/null
+++ b/modules/asm/sha1_sse2/sha1core.cpp
@@ -0,0 +1,327 @@
+/* this code is public domain.
+ *
+ * dean gaudet <[email protected]>
+ *
+ * this code was inspired by this paper:
+ *
+ * SHA: A Design for Parallel Architectures?
+ * Antoon Bosselaers, René Govaerts and Joos Vandewalle
+ * <http://www.esat.kuleuven.ac.be/~cosicart/pdf/AB-9700.pdf>
+ *
+ * more information available on this implementation here:
+ *
+ * http://arctic.org/~dean/crypto/sha1.html
+ *
+ * version: 2
+ */
+
+/*
+ * Lightly modified for Botan, tested under GCC 4.1.1 and ICC 9.1
+ * on a Linux/Core2 system.
+ *
+ */
+#include <botan/types.h>
+#include <xmmintrin.h>
+
+namespace Botan {
+
+typedef union {
+ u32bit u32[4];
+ __m128i u128;
+ } v4si __attribute__((aligned(16)));
+
+static const v4si K00_19 = { { 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 } };
+static const v4si K20_39 = { { 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 } };
+static const v4si K40_59 = { { 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc } };
+static const v4si K60_79 = { { 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 } };
+
+#define UNALIGNED 1
+#if UNALIGNED
+#define load(p) _mm_loadu_si128(p)
+#else
+#define load(p) (*p)
+#endif
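+/* UNALIGNED=1 loads with movdqu; if the input were known to be 16-byte
+   aligned, the plain dereference (an aligned movdqa load) would also work */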
+
+
+/*
+the first 16 words only need byte swapping
+
+prepared points to 4x u32bit, 16-byte aligned
+
+W points to the 4 dwords which need preparing --
+and is overwritten with the swapped bytes
+*/
+#define prep00_15(prep, W) do { \
+ __m128i r1, r2; \
+ \
+ r1 = (W); \
+ if (1) { \
+ r1 = _mm_shufflehi_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1)); \
+ r1 = _mm_shufflelo_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1)); \
+ r2 = _mm_slli_epi16(r1, 8); \
+ r1 = _mm_srli_epi16(r1, 8); \
+ r1 = _mm_or_si128(r1, r2); \
+ (W) = r1; \
+ } \
+ (prep).u128 = _mm_add_epi32(K00_19.u128, r1); \
+ } while(0)
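+/* The shufflehi/shufflelo pair swaps the 16-bit halves of each dword,
+   and the byte shifts plus OR then swap bytes within each half: together
+   a per-lane bswap, done this way because SSE2 has no single-byte shuffle */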
+
+
+
+/*
+for each multiple of 4, t, we want to calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol(W[t] ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+
+we'll actually calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol( 0 ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+W[t+3] ^= rol(W[t+0], 1);
+
+the parameters are:
+
+W0 = &W[t-16];
+W1 = &W[t-12];
+W2 = &W[t- 8];
+W3 = &W[t- 4];
+
+and on output:
+prepared = W0 + K
+W0 = W[t]..W[t+3]
+*/
+
+/* note that there is a step here where i want to do a rol by 1, which
+* normally would look like this:
+*
+* r1 = psrld r0,$31
+* r0 = pslld r0,$1
+* r0 = por r0,r1
+*
+* but instead i do this:
+*
+* r1 = pcmpltd r0,zero
+* r0 = paddd r0,r0
+* r0 = psub r0,r1
+*
+* because pcmpltd and paddd are available in both MMX units on
+* Efficeon, Pentium-M, and Opteron, but shifts are available in
+* only one unit.
+*/
+#define prep(prep, XW0, XW1, XW2, XW3, K) do { \
+ __m128i r0, r1, r2, r3; \
+ \
+ /* load W[t-4] 16-byte aligned, and shift */ \
+ r3 = _mm_srli_si128((XW3), 4); \
+ r0 = (XW0); \
+ /* get high 64-bits of XW0 into low 64-bits */ \
+ r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1,0,3,2)); \
+ /* load high 64-bits of r1 */ \
+ r1 = _mm_unpacklo_epi64(r1, (XW1)); \
+ r2 = (XW2); \
+ \
+ r0 = _mm_xor_si128(r1, r0); \
+ r2 = _mm_xor_si128(r3, r2); \
+ r0 = _mm_xor_si128(r2, r0); \
+ /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */ \
+ \
+ r2 = _mm_slli_si128(r0, 12); \
+ r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128()); \
+ r0 = _mm_add_epi32(r0, r0); /* shift left by 1 */ \
+ r0 = _mm_sub_epi32(r0, r1); /* r0 has W[t]..W[t+2] */ \
+ \
+ r3 = _mm_srli_epi32(r2, 30); \
+ r2 = _mm_slli_epi32(r2, 2); \
+ \
+ r0 = _mm_xor_si128(r0, r3); \
+ r0 = _mm_xor_si128(r0, r2); /* r0 now has W[t+3] */ \
+ \
+ (XW0) = r0; \
+ (prep).u128 = _mm_add_epi32(r0, (K).u128); \
+ } while(0)
+
+
+static inline u32bit rol(u32bit src, u32bit amt)
+ {
+ /* gcc and icc appear to turn this into a rotate */
+ return (src << amt) | (src >> (32 - amt));
+ }
+
+
+static inline u32bit f00_19(u32bit x, u32bit y, u32bit z)
+ {
+ /* FIPS 180-2 says this: (x & y) ^ (~x & z)
+ * but we can calculate it in fewer steps.
+ */
+ return ((y ^ z) & x) ^ z;
+ }
+
+
+static inline u32bit f20_39(u32bit x, u32bit y, u32bit z)
+ {
+ return (x ^ z) ^ y;
+ }
+
+
+static inline u32bit f40_59(u32bit x, u32bit y, u32bit z)
+ {
+ /* FIPS 180-2 says this: (x & y) ^ (x & z) ^ (y & z)
+ * but we can calculate it in fewer steps.
+ */
+ return (x & z) | ((x | z) & y);
+ }
+
+
+static inline u32bit f60_79(u32bit x, u32bit y, u32bit z)
+ {
+ return f20_39(x, y, z);
+ }
+
+#define step(nn_mm, xa, xb, xc, xd, xe, xt, input) do { \
+ (xt) = (input) + f##nn_mm((xb), (xc), (xd)); \
+ (xb) = rol((xb), 30); \
+ (xt) += ((xe) + rol((xa), 5)); \
+ } while(0)
+
+extern "C" void botan_sha1_sse(u32bit* H,
+ const u32bit* inputu)
+ {
+ const __m128i * input = (const __m128i *)inputu;
+ __m128i W0, W1, W2, W3;
+ v4si prep0, prep1, prep2;
+ u32bit a, b, c, d, e, t;
+
+ a = H[0];
+ b = H[1];
+ c = H[2];
+ d = H[3];
+ e = H[4];
+
+ /* i've tried arranging the SSE2 code to be 4, 8, 12, and 16
+ * steps ahead of the integer code. 12 steps ahead seems
+ * to produce the best performance. -dean
+ */
+ W0 = load(&input[0]);
+ prep00_15(prep0, W0); /* prepare for 00 through 03 */
+ W1 = load(&input[1]);
+ prep00_15(prep1, W1); /* prepare for 04 through 07 */
+ W2 = load(&input[2]);
+ prep00_15(prep2, W2); /* prepare for 08 through 11 */
+
+ W3 = load(&input[3]);
+ step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 00 */
+ step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 01 */
+ step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 02 */
+ step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 03 */
+ prep00_15(prep0, W3);
+ step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 04 */
+ step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 05 */
+ step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 06 */
+ step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 07 */
+ prep(prep1, W0, W1, W2, W3, K00_19); /* prepare for 16 through 19 */
+ step(00_19, e, t, a, b, c, d, prep2.u32[0]); /* 08 */
+ step(00_19, d, e, t, a, b, c, prep2.u32[1]); /* 09 */
+ step(00_19, c, d, e, t, a, b, prep2.u32[2]); /* 10 */
+ step(00_19, b, c, d, e, t, a, prep2.u32[3]); /* 11 */
+ prep(prep2, W1, W2, W3, W0, K20_39); /* prepare for 20 through 23 */
+ step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 12 */
+ step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 13 */
+ step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 14 */
+ step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 15 */
+ prep(prep0, W2, W3, W0, W1, K20_39);
+ step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 16 */
+ step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 17 */
+ step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 18 */
+ step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 19 */
+
+ prep(prep1, W3, W0, W1, W2, K20_39);
+ step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 20 */
+ step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 21 */
+ step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 22 */
+ step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 23 */
+ prep(prep2, W0, W1, W2, W3, K20_39);
+ step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 24 */
+ step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 25 */
+ step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 26 */
+ step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 27 */
+ prep(prep0, W1, W2, W3, W0, K20_39);
+ step(20_39, c, d, e, t, a, b, prep1.u32[0]); /* 28 */
+ step(20_39, b, c, d, e, t, a, prep1.u32[1]); /* 29 */
+ step(20_39, a, b, c, d, e, t, prep1.u32[2]); /* 30 */
+ step(20_39, t, a, b, c, d, e, prep1.u32[3]); /* 31 */
+ prep(prep1, W2, W3, W0, W1, K40_59);
+ step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 32 */
+ step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 33 */
+ step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 34 */
+ step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 35 */
+ prep(prep2, W3, W0, W1, W2, K40_59);
+ step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 36 */
+ step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 37 */
+ step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 38 */
+ step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 39 */
+
+ prep(prep0, W0, W1, W2, W3, K40_59);
+ step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 40 */
+ step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 41 */
+ step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 42 */
+ step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 43 */
+ prep(prep1, W1, W2, W3, W0, K40_59);
+ step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 44 */
+ step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 45 */
+ step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 46 */
+ step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 47 */
+ prep(prep2, W2, W3, W0, W1, K40_59);
+ step(40_59, a, b, c, d, e, t, prep0.u32[0]); /* 48 */
+ step(40_59, t, a, b, c, d, e, prep0.u32[1]); /* 49 */
+ step(40_59, e, t, a, b, c, d, prep0.u32[2]); /* 50 */
+ step(40_59, d, e, t, a, b, c, prep0.u32[3]); /* 51 */
+ prep(prep0, W3, W0, W1, W2, K60_79);
+ step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 52 */
+ step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 53 */
+ step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 54 */
+ step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 55 */
+ prep(prep1, W0, W1, W2, W3, K60_79);
+ step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 56 */
+ step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 57 */
+ step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 58 */
+ step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 59 */
+
+ prep(prep2, W1, W2, W3, W0, K60_79);
+ step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 60 */
+ step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 61 */
+ step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 62 */
+ step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 63 */
+ prep(prep0, W2, W3, W0, W1, K60_79);
+ step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 64 */
+ step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 65 */
+ step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 66 */
+ step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 67 */
+ prep(prep1, W3, W0, W1, W2, K60_79);
+ step(60_79, e, t, a, b, c, d, prep2.u32[0]); /* 68 */
+ step(60_79, d, e, t, a, b, c, prep2.u32[1]); /* 69 */
+ step(60_79, c, d, e, t, a, b, prep2.u32[2]); /* 70 */
+ step(60_79, b, c, d, e, t, a, prep2.u32[3]); /* 71 */
+
+ step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 72 */
+ step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 73 */
+ step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 74 */
+ step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 75 */
+ /* no more input to prepare */
+ step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 76 */
+ step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 77 */
+ step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 78 */
+ step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 79 */
+ /* e, t, a, b, c, d */
+ H[0] += e;
+ H[1] += t;
+ H[2] += a;
+ H[3] += b;
+ H[4] += c;
+ }
+
+}