Diffstat (limited to 'src/asm')
-rw-r--r--  src/asm/asm_amd64/asm_macr.h        |  125
-rw-r--r--  src/asm/asm_amd64/modinfo.txt       |   39
-rw-r--r--  src/asm/asm_amd64/mp_monty.S        |  397
-rw-r--r--  src/asm/asm_amd64/mp_mulop_amd64.S  |  128
-rw-r--r--  src/asm/asm_amd64/sha160.cpp        |   52
-rw-r--r--  src/asm/asm_amd64/sha1_asm.S        |  258
-rw-r--r--  src/asm/asm_ia32/asm_macr.h         |  131
-rw-r--r--  src/asm/asm_ia32/md4.cpp            |   43
-rw-r--r--  src/asm/asm_ia32/md4core.S          |  135
-rw-r--r--  src/asm/asm_ia32/md5.cpp            |   43
-rw-r--r--  src/asm/asm_ia32/md5core.S          |  164
-rw-r--r--  src/asm/asm_ia32/modinfo.txt        |   43
-rw-r--r--  src/asm/asm_ia32/mp_mulop.S         |   62
-rw-r--r--  src/asm/asm_ia32/serp_asm.S         |  667
-rw-r--r--  src/asm/asm_ia32/serpent.cpp        |   49
-rw-r--r--  src/asm/asm_ia32/sha160.cpp         |   52
-rw-r--r--  src/asm/asm_ia32/sha1_asm.S         |  242
-rw-r--r--  src/asm/mp_amd64/bswap.h            |   36
-rw-r--r--  src/asm/mp_amd64/modinfo.txt        |   21
-rw-r--r--  src/asm/mp_amd64/mp_asm.h           |   67
-rw-r--r--  src/asm/mp_amd64/mp_asmi.h          |  233
-rw-r--r--  src/asm/mp_amd64/mp_mulop.cpp       |   94
-rw-r--r--  src/asm/mp_asm64/modinfo.txt        |   25
-rw-r--r--  src/asm/mp_asm64/mp_asm.h           |  110
-rw-r--r--  src/asm/mp_ia32/modinfo.txt         |   19
-rw-r--r--  src/asm/mp_ia32/mp_asm.h            |   65
-rw-r--r--  src/asm/mp_ia32/mp_asmi.h           |  225
-rw-r--r--  src/asm/mp_ia32_msvc/modinfo.txt    |   17
-rw-r--r--  src/asm/mp_ia32_msvc/mp_asmi.h      |  547
-rw-r--r--  src/asm/sha1_sse2/modinfo.txt       |   22
-rw-r--r--  src/asm/sha1_sse2/sha160.cpp        |   52
-rw-r--r--  src/asm/sha1_sse2/sha160.h          |   32
-rw-r--r--  src/asm/sha1_sse2/sha1core.cpp      |  327
33 files changed, 4522 insertions(+), 0 deletions(-)
diff --git a/src/asm/asm_amd64/asm_macr.h b/src/asm/asm_amd64/asm_macr.h
new file mode 100644
index 000000000..3cdd42dc6
--- /dev/null
+++ b/src/asm/asm_amd64/asm_macr.h
@@ -0,0 +1,125 @@
+/*************************************************
+* Assembly Macros Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_AMD64_ASM_MACROS_H__
+#define BOTAN_AMD64_ASM_MACROS_H__
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+/*************************************************
+* General/Global Macros *
+*************************************************/
+#define ALIGN .p2align 4,,15
+
+#define START_LISTING(FILENAME) \
+ .file #FILENAME; \
+ .text; \
+ ALIGN;
+
+/*************************************************
+* Function Definitions *
+*************************************************/
+#define START_FUNCTION(func_name) \
+ ALIGN; \
+ .global func_name; \
+ .type func_name,@function; \
+func_name:
+
+#define END_FUNCTION(func_name) \
+ ret
+
+/*************************************************
+* Conditional Jumps *
+*************************************************/
+#define JUMP_IF_ZERO(REG, LABEL) \
+ cmp IMM(0), REG; \
+ jz LABEL
+
+#define JUMP_IF_LT(REG, NUM, LABEL) \
+ cmp IMM(NUM), REG; \
+ jl LABEL
+
+/*************************************************
+* Register Names *
+*************************************************/
+#define R0 %rax
+#define R1 %rbx
+#define R2 %rcx
+#define R2_32 %ecx
+#define R3 %rdx
+#define R3_32 %edx
+#define R4 %rsp
+#define R5 %rbp
+#define R6 %rsi
+#define R6_32 %esi
+#define R7 %rdi
+#define R8 %r8
+#define R9 %r9
+#define R9_32 %r9d
+#define R10 %r10
+#define R11 %r11
+#define R12 %r12
+#define R13 %r13
+#define R14 %r14
+#define R15 %r15
+
+#define ARG_1 R7
+#define ARG_2 R6
+#define ARG_2_32 R6_32
+#define ARG_3 R3
+#define ARG_3_32 R3_32
+#define ARG_4 R2
+#define ARG_4_32 R2_32
+#define ARG_5 R8
+#define ARG_6 R9
+#define ARG_6_32 R9_32
+
+#define TEMP_1 R10
+#define TEMP_2 R11
+#define TEMP_3 ARG_6
+#define TEMP_4 ARG_5
+#define TEMP_5 ARG_4
+#define TEMP_5_32 ARG_4_32
+#define TEMP_6 ARG_3
+#define TEMP_7 ARG_2
+#define TEMP_8 ARG_1
+#define TEMP_9 R0
+
+/*************************************************
+* Memory Access Operations *
+*************************************************/
+#define ARRAY8(REG, NUM) 8*(NUM)(REG)
+#define ARRAY4(REG, NUM) 4*(NUM)(REG)
+
+#define ASSIGN(TO, FROM) mov FROM, TO
+
+/*************************************************
+* ALU Operations *
+*************************************************/
+#define IMM(VAL) $VAL
+
+#define ADD(TO, FROM) add FROM, TO
+#define ADD_LAST_CARRY(REG) adc IMM(0), REG
+#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM))
+#define ADD_W_CARRY(TO1, TO2, FROM) add FROM, TO1; adc IMM(0), TO2;
+#define SUB_IMM(TO, NUM) sub IMM(NUM), TO
+#define MUL(REG) mul REG
+
+#define XOR(TO, FROM) xor FROM, TO
+#define AND(TO, FROM) and FROM, TO
+#define OR(TO, FROM) or FROM, TO
+#define NOT(REG) not REG
+#define ZEROIZE(REG) XOR(REG, REG)
+
+#define RETURN_VALUE_IS(V) ASSIGN(%rax, V)
+
+#define ROTL_IMM(REG, NUM) rol IMM(NUM), REG
+#define ROTR_IMM(REG, NUM) ror IMM(NUM), REG
+#define ADD3_IMM(TO, FROM, NUM) lea NUM(TO,FROM,1), TO
+
+#endif
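
Reviewer note (not part of the patch): the ARG_1..ARG_6 defines above encode the
System V AMD64 calling convention (%rdi, %rsi, %rdx, %rcx, %r8, %r9). A minimal
C++ model of what ADD_W_CARRY(TO1, TO2, FROM) -- "add FROM, TO1; adc $0, TO2" --
computes:

    #include <stdint.h>

    // 64-bit add whose carry-out increments a second word
    inline void add_w_carry(uint64_t& to1, uint64_t& to2, uint64_t from)
       {
       uint64_t sum = to1 + from;    // add FROM, TO1
       to2 += (sum < from) ? 1 : 0;  // adc $0, TO2: carry out of the add
       to1 = sum;
       }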
diff --git a/src/asm/asm_amd64/modinfo.txt b/src/asm/asm_amd64/modinfo.txt
new file mode 100644
index 000000000..30aa5a413
--- /dev/null
+++ b/src/asm/asm_amd64/modinfo.txt
@@ -0,0 +1,39 @@
+realname "x86-64 Assembler"
+
+mp_bits 64
+
+load_on request
+
+<replace>
+sha160.cpp
+</replace>
+
+<ignore>
+#mp_mulop.cpp
+#mp_monty.cpp
+</ignore>
+
+<add>
+asm_macr.h
+#mp_mulop_amd64.S
+#mp_monty.S
+sha1_asm.S
+</add>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
diff --git a/src/asm/asm_amd64/mp_monty.S b/src/asm/asm_amd64/mp_monty.S
new file mode 100644
index 000000000..3dd4040bc
--- /dev/null
+++ b/src/asm/asm_amd64/mp_monty.S
@@ -0,0 +1,397 @@
+/*************************************************
+* Montgomery Reduction Source File *
+* (C) 2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_monty.S)
+
+START_FUNCTION(bigint_monty_redc)
+ pushq %r15 #
+ pushq %r14 #
+ pushq %r13 #
+ pushq %r12 #
+ pushq %rbp #
+ pushq %rbx #
+
+ movq %rdi, %r14 # z
+ movq %rdx, %r12 # x
+ movl %esi, %ebp # z_size
+
+ xorl %esi, %esi # j.76
+ movq %r8, -16(%rsp) # u, u
+ movl %ecx, %ebx # x_size, x_size
+ movl %ecx, %r8d # x_size, blocks_of_8
+ andl $-8, %r8d #, blocks_of_8
+ testl %ecx, %ecx # x_size
+ je .L3 #,
+ mov %ecx, %eax # x_size, pretmp.71
+ leal 1(%rbx), %r15d #, k.73
+ salq $3, %rax #,
+ xorl %r13d, %r13d # j
+ movq %rax, -8(%rsp) #, pretmp.21
+ .p2align 4,,10
+ .p2align 3
+.L11:
+ mov %r13d, %eax # j, j
+ movq -16(%rsp), %rdi # u, y
+ leaq (%r14,%rax,8), %r11 #, z_j
+ xorl %r9d, %r9d # i
+ imulq (%r11), %rdi #* z_j, y
+ xorl %r10d, %r10d # carry
+ testl %r8d, %r8d # blocks_of_8
+ je .L7 #,
+ .p2align 4,,10
+ .p2align 3
+.LOOP_MUL_ADD:
+ mov %r9d, %ecx # i, i
+ addl $8, %r9d #, i
+ salq $3, %rcx #, D.2315
+ leaq (%r11,%rcx), %rsi #, tmp130
+ leaq (%r12,%rcx), %rcx #, tmp131
+
+ movq 8*0(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*0(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*0 (%rsi)
+
+ movq 8*1(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*1(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*1 (%rsi)
+
+ movq 8*2(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*2(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*2 (%rsi)
+
+ movq 8*3(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*3(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*3 (%rsi)
+
+ movq 8*4(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*4(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*4 (%rsi)
+
+ movq 8*5(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*5(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*5 (%rsi)
+
+ movq 8*6(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*6(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*6 (%rsi)
+
+ movq 8*7(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*7(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*7 (%rsi)
+
+ cmpl %r9d, %r8d # i, blocks_of_8
+ jne .LOOP_MUL_ADD #,
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L8 #,
+.L7:
+ movl %r8d, %esi # blocks_of_8, i
+ .p2align 4,,10
+ .p2align 3
+.L5:
+ mov %esi, %eax # i, i
+ movq %rdi, %rcx # y, b
+ leaq (%r11, %rax,8), %r9 #, D.2325
+ incl %esi # i
+ movq (%r12, %rax,8), %rax #* x, tmp133
+
+ mulq %rcx # b
+ addq (%r9), %rax #* D.2325, a
+ adcq $0,%rdx #
+ addq %r10, %rax # carry, a
+ adcq $0,%rdx #
+
+ cmpl %esi, %ebx # i, x_size
+ movq %rdx, %r10 #, carry
+ movq %rax, (%r9) # a,* D.2325
+ jne .L5 #,
+.L8:
+ movq -8(%rsp), %rdx # pretmp.21,
+ leaq (%r11,%rdx), %rax #, D.2332
+ movq (%rax), %rcx #* D.2332, D.2333
+ leaq (%r10,%rcx), %rdx #, z_sum
+ movq %rdx, (%rax) # z_sum,* D.2332
+ cmpq %rdx, %rcx # z_sum, D.2333
+ jbe .L9 #,
+ cmpl %ebp, %r15d # z_size, k.73
+ je .L9 #,
+ movl %r15d, %ecx # k.73, k
+ jmp .L10 #
+ .p2align 4,,10
+ .p2align 3
+.L31:
+ incl %ecx # k
+ cmpl %ecx, %ebp # k, z_size
+ .p2align 4,,4
+ .p2align 3
+ je .L9 #,
+.L10:
+ mov %ecx, %edx # k, k
+ leaq (%r11,%rdx,8), %rdx #, D.2342
+ movq (%rdx), %rax #* D.2342, tmp136
+ incq %rax # D.2344
+ movq %rax, (%rdx) # D.2344,* D.2342
+ testq %rax, %rax # D.2344
+ je .L31 #,
+.L9:
+ incl %r13d # j
+ decl %ebp # z_size
+ cmpl %r13d, %ebx # j, x_size
+ jne .L11 #,
+ movl %ebx, %esi # x_size, j.76
+.L3:
+ leal (%rbx,%rbx), %eax #, tmp137
+ mov %eax, %eax
+ leaq (%r14, %rax,8), %rdi #, D.2349
+ cmpq $0, (%rdi) #,* D.2349
+ jne .L12 #,
+ testl %ebx, %ebx # x_size
+ je .L12 #,
+ leal -1(%rbx), %ecx #, j
+ leal (%rsi,%rcx), %edx #, tmp141
+ mov %ecx, %eax # j, j
+ movq (%r14,%rdx,8), %rbp #* z,
+ cmpq %rbp, (%r12, %rax,8) #,* x
+ jb .L12 #,
+ ja .L_EXIT #,
+ leal -2(%rsi,%rbx), %edx #, ivtmp.45
+ jmp .L14 #
+ .p2align 4,,10
+ .p2align 3
+.L15:
+ mov %edx, %eax # ivtmp.45, ivtmp.45
+ decl %ecx # j
+ movq (%r14, %rax,8), %rsi #* z, D.2360
+ mov %ecx, %eax # j, j
+ movq (%r12, %rax,8), %rax #* x, temp.68
+ cmpq %rax, %rsi
+ ja .L12 #,
+ decl %edx # ivtmp.45
+ cmpq %rax, %rsi
+ jb .L_EXIT #,
+.L14:
+ testl %ecx, %ecx # j
+ jne .L15 #,
+.L12:
+ xorl %ecx, %ecx # j
+ xorl %r10d, %r10d # carry
+ mov %ebx, %esi # x_size, pretmp.19
+ testl %r8d, %r8d # blocks_of_8
+ je .L17 #,
+ .p2align 4,,10
+ .p2align 3
+.L22:
+ mov %ecx, %edx # j, D.2375
+ addl $8, %ecx #, j
+ leaq (%rdx,%rsi), %rax #, tmp146
+ leaq (%r12,%rdx,8), %rdx #, tmp150
+ leaq (%r14, %rax,8), %rax #, tmp148
+
+ rorq %r10 # carry
+
+ movq 8*0(%rdx), %r10
+ sbbq %r10, 8*0(%rax)
+
+ movq 8*1(%rdx), %r10
+ sbbq %r10, 8*1(%rax)
+
+ movq 8*2(%rdx), %r10
+ sbbq %r10, 8*2(%rax)
+
+ movq 8*3(%rdx), %r10
+ sbbq %r10, 8*3(%rax)
+
+ movq 8*4(%rdx), %r10
+ sbbq %r10, 8*4(%rax)
+
+ movq 8*5(%rdx), %r10
+ sbbq %r10, 8*5(%rax)
+
+ movq 8*6(%rdx), %r10
+ sbbq %r10, 8*6(%rax)
+
+ movq 8*7(%rdx), %r10
+ sbbq %r10, 8*7(%rax)
+
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %ecx, %r8d # j, blocks_of_8
+ jne .L22 #,
+.L17:
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L19 #,
+ leal (%r8,%rbx), %r9d #, ivtmp.33
+ movl %r8d, %esi # blocks_of_8, j
+ .p2align 4,,10
+ .p2align 3
+.L20:
+ mov %r9d, %eax # ivtmp.33, ivtmp.33
+ mov %esi, %ecx # j, j
+ leaq (%r14, %rax,8), %rax #, D.2387
+ incl %esi # j
+ movq (%rax), %rdx #* D.2387, tmp153
+ incl %r9d # ivtmp.33
+
+ rorq %r10 # carry
+ sbbq (%r12,%rcx,8),%rdx #* x, x
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %esi, %ebx # j, x_size
+ movq %rdx, (%rax) # x,* D.2387
+ jne .L20 #,
+.L19:
+ testq %r10, %r10 # carry
+ je .L_EXIT #,
+ decq (%rdi) #* D.2349
+.L_EXIT:
+ popq %rbx #
+ popq %rbp #
+ popq %r12 #
+ popq %r13 #
+ popq %r14 #
+ popq %r15 #
+END_FUNCTION(bigint_monty_redc)
+
+
+#if 0
+#define Z_ARR ARG_1 // rdi
+#define Z_SIZE ARG_2_32 // esi
+// X_ARR is ARG_3 == rdx, moved b/c needed for multiply
+#define X_SIZE ARG_4_32 // ecx
+#define U ARG_5 // r8
+
+/*
+ We need all arguments for a while (we can reuse U eventually)
+ So only temp registers are
+ TEMP_1 %r10
+ TEMP_2 %r11
+ TEMP_3 = ARG_6 = %r9
+ void return, so also
+ R0 %rax (aka TEMP_9)
+ is free (but needed for multiply)
+
+ Can push:
+   %rbx (callee saved)
+   %rbp (frame pointer, callee saved)
+ %r12-%r15 (callee saved)
+
+ Can push base/frame pointers since this is a leaf function
+ and does not reference any data.
+*/
+
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+#define LOOP_CTR_I %r12
+#define LOOP_CTR_J %r13
+
+#define CARRY TEMP_1
+#define Z_WORD TEMP_2
+#define X_ARR TEMP_3
+#define MUL_LO %rax
+#define MUL_HI %rdx
+
+ ASSIGN(X_ARR, ARG_3)
+
+ /*
+ ZEROIZE(CARRY)
+
+ ASSIGN(LOOP_CTR, X_SIZE)
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+ JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+ALIGN
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(Z_ARR, 64)
+ ADD_IMM(X_ARR, 64)
+ cmp IMM(8), LOOP_CTR
+ jge .LOOP_MULADD8
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(Z_ARR, 8)
+ ADD_IMM(X_ARR, 8)
+
+ cmp IMM(0), LOOP_CTR
+ jne .LOOP_MULADD1
+*/
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+#endif
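
Reviewer note (not part of the patch): the function body above is
compiler-generated output (note the GCC temporaries such as "# j.76") with the
inner multiply-accumulate loop unrolled eight ways; the #if 0 block above keeps
an unfinished hand-written version. A hedged C++ sketch of the word-level
Montgomery reduction being implemented -- the argument layout is taken from the
register comments (z in %rdi, z_size in %esi, x in %rdx, x_size in %ecx, u in %r8):

    #include <stdint.h>
    typedef uint64_t word;
    typedef unsigned __int128 dword;   // gcc/icc extension, as this module requires
    typedef uint32_t u32bit;

    void monty_redc_sketch(word z[], u32bit z_size,
                           const word x[], u32bit x_size, word u)
       {
       for(u32bit j = 0; j != x_size; ++j)
          {
          word* z_j = z + j;
          const word y = z_j[0] * u;            // imulq (%r11), %rdi
          word carry = 0;
          for(u32bit i = 0; i != x_size; ++i)   // .LOOP_MUL_ADD / .L5
             {
             dword t = (dword)x[i] * y + z_j[i] + carry;
             z_j[i] = (word)t;
             carry  = (word)(t >> 64);
             }
          const word prev = z_j[x_size];        // .L8: add the final carry,
          z_j[x_size] = prev + carry;           // then ripple any overflow
          if(z_j[x_size] < prev)                // upward (.L10/.L31)
             for(u32bit k = x_size + 1; k < z_size - j; ++k)
                if(++z_j[k] != 0)
                   break;
          }
       // .L12 onward: if the result is >= x, subtract x once
       }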
diff --git a/src/asm/asm_amd64/mp_mulop_amd64.S b/src/asm/asm_amd64/mp_mulop_amd64.S
new file mode 100644
index 000000000..e5bba23fb
--- /dev/null
+++ b/src/asm/asm_amd64/mp_mulop_amd64.S
@@ -0,0 +1,128 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop_amd64.S)
+
+#if 0
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+ {
+ const u32bit blocks = x_size - (x_size % 8);
+
+ clear_mem(z, 2*x_size);
+
+ for(u32bit i = 0; i != x_size; ++i)
+ {
+ word carry = 0;
+
+ /*
+ for(u32bit j = 0; j != blocks; j += 8)
+ carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+ for(u32bit j = blocks; j != x_size; ++j)
+ z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+ */
+
+
+      for(u32bit j = 0; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+#endif
+
+START_FUNCTION(bigint_simple_sqr)
+
+#define Z_ARR ARG_1
+#define X_ARR ARG_2
+//#define X_SIZE ARG_3_32
+
+#define CARRY TEMP_1
+#define Z_WORD TEMP_2
+#define LOOP_CTR TEMP_3
+#define LOOP_J TEMP_4
+#define X_SIZE TEMP_5_32
+#define MUL_LO %rax
+#define MUL_HI %rdx
+
+// ARG_3 (x_size) arrives in %edx, which mulq clobbers, so copy it out first
+	ASSIGN(X_SIZE, ARG_3_32)
+
+	ZEROIZE(CARRY)
+	ZEROIZE(LOOP_CTR)
+
+/* FIXME: unfinished -- .LOOP_ZEROIZE_Z must clear z[0..2*x_size) here
+   (the clear_mem() call in the C sketch above) before the multiply-add
+   passes below; the module stays commented out in modinfo.txt until then */
+.LOOP_ZEROIZE_Z:
+	cmp LOOP_CTR, X_SIZE
+
+	JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+	JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(Z_ARR, 64)
+ ADD_IMM(X_ARR, 64)
+ cmp IMM(8), LOOP_CTR
+ jge .LOOP_MULADD8
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(Z_ARR, 8)
+ ADD_IMM(X_ARR, 8)
+
+ cmp IMM(0), LOOP_CTR
+ jne .LOOP_MULADD1
+
+.L_MULADD_DONE:
+ RETURN_VALUE_IS(CARRY)
+END_FUNCTION(bigint_simple_sqr)
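
Reviewer note (not part of the patch): this file is work in progress and is
commented out in modinfo.txt (#mp_mulop_amd64.S), so it never reaches the build.
For reference, the word_madd3 primitive that the C sketch in the #if 0 block
leans on -- its body is the dword expression pasted there:

    #include <stdint.h>
    typedef uint64_t word;
    typedef unsigned __int128 dword;

    // multiply-and-accumulate: returns the low word of a*b + c + *d,
    // leaving the high word (the carry out) in *d
    inline word word_madd3(word a, word b, word c, word* d)
       {
       dword z = (dword)a * b + c + *d;
       *d = (word)(z >> 64);   // BOTAN_MP_WORD_BITS == 64 on amd64
       return (word)z;
       }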
diff --git a/src/asm/asm_amd64/sha160.cpp b/src/asm/asm_amd64/sha160.cpp
new file mode 100644
index 000000000..cfac02f45
--- /dev/null
+++ b/src/asm/asm_amd64/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void botan_sha160_asm_amd64(u32bit[5], const byte[64], u32bit[80]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha160_asm_amd64(digest, input, W);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ W.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(80)
+ {
+ clear();
+ }
+
+}
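
Reviewer note (not part of the patch): copy_out serializes the five state words
big-endian; get_byte(n, x) from loadstor.h selects byte n counting from the most
significant end, so in plain C++ the loop is doing:

    #include <stdint.h>
    typedef uint8_t byte;
    typedef uint32_t u32bit;

    // byte 0 = most significant byte of x
    inline byte get_byte_sketch(u32bit byte_num, u32bit x)
       {
       return (byte)(x >> (8 * (3 - byte_num)));
       }
    // so output[0..3] = digest[0] big-endian, output[4..7] = digest[1], ...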
diff --git a/src/asm/asm_amd64/sha1_asm.S b/src/asm/asm_amd64/sha1_asm.S
new file mode 100644
index 000000000..ecf4a18ce
--- /dev/null
+++ b/src/asm/asm_amd64/sha1_asm.S
@@ -0,0 +1,258 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(sha1_asm.S)
+
+START_FUNCTION(botan_sha160_asm_amd64)
+
+#define DIGEST_ARR %rdi
+#define INPUT %rsi
+#define W %rdx
+#define LOOP_CTR %eax
+
+#define A %r8d
+#define B %r9d
+#define C %r10d
+#define D %r11d
+#define E %ecx
+
+ ZEROIZE(LOOP_CTR)
+
+ALIGN;
+.LOOP_LOAD_INPUT:
+ addl $8, %eax
+
+ movq ARRAY8(INPUT, 0), %r8
+ movq ARRAY8(INPUT, 1), %r9
+ movq ARRAY8(INPUT, 2), %r10
+ movq ARRAY8(INPUT, 3), %r11
+
+ bswap %r8
+ bswap %r9
+ bswap %r10
+ bswap %r11
+
+ rolq $32, %r8
+ rolq $32, %r9
+ rolq $32, %r10
+ rolq $32, %r11
+
+ movq %r8, ARRAY8(W, 0)
+ movq %r9, ARRAY8(W, 1)
+ movq %r10, ARRAY8(W, 2)
+ movq %r11, ARRAY8(W, 3)
+
+ addq $32, W
+ addq $32, INPUT
+
+ cmp IMM(16), LOOP_CTR
+ jne .LOOP_LOAD_INPUT
+
+/*
+#define A %r8d
+#define B %r9d
+#define C %r10d
+#define D %r11d
+#define E %ecx
+*/
+
+ALIGN;
+.LOOP_EXPANSION:
+ addl $4, LOOP_CTR
+
+ ZEROIZE(A)
+ ASSIGN(B, ARRAY4(W, -1))
+ ASSIGN(C, ARRAY4(W, -2))
+ ASSIGN(D, ARRAY4(W, -3))
+
+ XOR(A, ARRAY4(W, -5))
+ XOR(B, ARRAY4(W, -6))
+ XOR(C, ARRAY4(W, -7))
+ XOR(D, ARRAY4(W, -8))
+
+ XOR(A, ARRAY4(W, -11))
+ XOR(B, ARRAY4(W, -12))
+ XOR(C, ARRAY4(W, -13))
+ XOR(D, ARRAY4(W, -14))
+
+ XOR(A, ARRAY4(W, -13))
+ XOR(B, ARRAY4(W, -14))
+ XOR(C, ARRAY4(W, -15))
+ XOR(D, ARRAY4(W, -16))
+
+ ROTL_IMM(D, 1)
+ ROTL_IMM(C, 1)
+ ROTL_IMM(B, 1)
+ XOR(A, D)
+ ROTL_IMM(A, 1)
+
+ ASSIGN(ARRAY4(W, 0), D)
+ ASSIGN(ARRAY4(W, 1), C)
+ ASSIGN(ARRAY4(W, 2), B)
+ ASSIGN(ARRAY4(W, 3), A)
+
+ addq $16, W
+ cmp IMM(80), LOOP_CTR
+ jne .LOOP_EXPANSION
+
+ subq $320, W
+
+#define MAGIC1 0x5A827999
+#define MAGIC2 0x6ED9EBA1
+#define MAGIC3 0x8F1BBCDC
+#define MAGIC4 0xCA62C1D6
+
+#define T %esi
+#define T2 %eax
+
+#define F1(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, C) ; \
+ XOR(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC1) ; \
+ AND(E, B) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2_4(A, B, C, D, E, F, N, MAGIC) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ XOR(E, C) ; \
+ ADD3_IMM(F, T2, MAGIC) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F3(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(W, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ OR(E, C) ; \
+ AND(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC3) ; \
+ ASSIGN(T2, B) ; \
+ AND(T2, C) ; \
+ OR(E, T2) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2(A, B, C, D, E, F, W) \
+ F2_4(A, B, C, D, E, F, W, MAGIC2)
+
+#define F4(A, B, C, D, E, F, W) \
+ F2_4(A, B, C, D, E, F, W, MAGIC4)
+
+ ASSIGN(T, ARRAY4(DIGEST_ARR, 0))
+ ASSIGN(B, ARRAY4(DIGEST_ARR, 1))
+ ASSIGN(C, ARRAY4(DIGEST_ARR, 2))
+ ASSIGN(D, ARRAY4(DIGEST_ARR, 3))
+ ASSIGN(E, ARRAY4(DIGEST_ARR, 4))
+
+ /* First Round */
+ F1(A, B, C, D, E, T, 0)
+ F1(T, A, B, C, D, E, 1)
+ F1(E, T, A, B, C, D, 2)
+ F1(D, E, T, A, B, C, 3)
+ F1(C, D, E, T, A, B, 4)
+ F1(B, C, D, E, T, A, 5)
+ F1(A, B, C, D, E, T, 6)
+ F1(T, A, B, C, D, E, 7)
+ F1(E, T, A, B, C, D, 8)
+ F1(D, E, T, A, B, C, 9)
+ F1(C, D, E, T, A, B, 10)
+ F1(B, C, D, E, T, A, 11)
+ F1(A, B, C, D, E, T, 12)
+ F1(T, A, B, C, D, E, 13)
+ F1(E, T, A, B, C, D, 14)
+ F1(D, E, T, A, B, C, 15)
+ F1(C, D, E, T, A, B, 16)
+ F1(B, C, D, E, T, A, 17)
+ F1(A, B, C, D, E, T, 18)
+ F1(T, A, B, C, D, E, 19)
+
+ /* Second Round */
+ F2(E, T, A, B, C, D, 20)
+ F2(D, E, T, A, B, C, 21)
+ F2(C, D, E, T, A, B, 22)
+ F2(B, C, D, E, T, A, 23)
+ F2(A, B, C, D, E, T, 24)
+ F2(T, A, B, C, D, E, 25)
+ F2(E, T, A, B, C, D, 26)
+ F2(D, E, T, A, B, C, 27)
+ F2(C, D, E, T, A, B, 28)
+ F2(B, C, D, E, T, A, 29)
+ F2(A, B, C, D, E, T, 30)
+ F2(T, A, B, C, D, E, 31)
+ F2(E, T, A, B, C, D, 32)
+ F2(D, E, T, A, B, C, 33)
+ F2(C, D, E, T, A, B, 34)
+ F2(B, C, D, E, T, A, 35)
+ F2(A, B, C, D, E, T, 36)
+ F2(T, A, B, C, D, E, 37)
+ F2(E, T, A, B, C, D, 38)
+ F2(D, E, T, A, B, C, 39)
+
+ /* Third Round */
+ F3(C, D, E, T, A, B, 40)
+ F3(B, C, D, E, T, A, 41)
+ F3(A, B, C, D, E, T, 42)
+ F3(T, A, B, C, D, E, 43)
+ F3(E, T, A, B, C, D, 44)
+ F3(D, E, T, A, B, C, 45)
+ F3(C, D, E, T, A, B, 46)
+ F3(B, C, D, E, T, A, 47)
+ F3(A, B, C, D, E, T, 48)
+ F3(T, A, B, C, D, E, 49)
+ F3(E, T, A, B, C, D, 50)
+ F3(D, E, T, A, B, C, 51)
+ F3(C, D, E, T, A, B, 52)
+ F3(B, C, D, E, T, A, 53)
+ F3(A, B, C, D, E, T, 54)
+ F3(T, A, B, C, D, E, 55)
+ F3(E, T, A, B, C, D, 56)
+ F3(D, E, T, A, B, C, 57)
+ F3(C, D, E, T, A, B, 58)
+ F3(B, C, D, E, T, A, 59)
+
+ /* Fourth Round */
+ F4(A, B, C, D, E, T, 60)
+ F4(T, A, B, C, D, E, 61)
+ F4(E, T, A, B, C, D, 62)
+ F4(D, E, T, A, B, C, 63)
+ F4(C, D, E, T, A, B, 64)
+ F4(B, C, D, E, T, A, 65)
+ F4(A, B, C, D, E, T, 66)
+ F4(T, A, B, C, D, E, 67)
+ F4(E, T, A, B, C, D, 68)
+ F4(D, E, T, A, B, C, 69)
+ F4(C, D, E, T, A, B, 70)
+ F4(B, C, D, E, T, A, 71)
+ F4(A, B, C, D, E, T, 72)
+ F4(T, A, B, C, D, E, 73)
+ F4(E, T, A, B, C, D, 74)
+ F4(D, E, T, A, B, C, 75)
+ F4(C, D, E, T, A, B, 76)
+ F4(B, C, D, E, T, A, 77)
+ F4(A, B, C, D, E, T, 78)
+ F4(T, A, B, C, D, E, 79)
+
+ ADD(ARRAY4(DIGEST_ARR, 0), D)
+ ADD(ARRAY4(DIGEST_ARR, 1), T)
+ ADD(ARRAY4(DIGEST_ARR, 2), A)
+ ADD(ARRAY4(DIGEST_ARR, 3), B)
+ ADD(ARRAY4(DIGEST_ARR, 4), C)
+
+END_FUNCTION(botan_sha160_asm_amd64)
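
Reviewer note (not part of the patch): .LOOP_LOAD_INPUT byte-swaps two message
words per 64-bit register (bswap reverses all eight bytes, so the following
rolq $32 puts the two 32-bit words back in order), and .LOOP_EXPANSION then
fills W[16..79] four words per iteration. The recurrence is the standard SHA-1
schedule:

    #include <stdint.h>
    typedef uint32_t u32bit;

    inline u32bit rotl1(u32bit x) { return (x << 1) | (x >> 31); }

    void sha1_expand_sketch(u32bit W[80])
       {
       for(u32bit i = 16; i != 80; ++i)
          W[i] = rotl1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]);
       }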
diff --git a/src/asm/asm_ia32/asm_macr.h b/src/asm/asm_ia32/asm_macr.h
new file mode 100644
index 000000000..6d5dbb59d
--- /dev/null
+++ b/src/asm/asm_ia32/asm_macr.h
@@ -0,0 +1,131 @@
+/*************************************************
+* Assembly Macros Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_IA32_ASM_MACROS_H__
+#define BOTAN_IA32_ASM_MACROS_H__
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+/*************************************************
+* General/Global Macros *
+*************************************************/
+#define ALIGN .p2align 4,,15
+
+#define START_LISTING(FILENAME) \
+ .file #FILENAME; \
+ .text; \
+ .p2align 4,,15;
+
+/*************************************************
+* Function Definitions *
+*************************************************/
+#define START_FUNCTION(func_name) \
+ .align 8; \
+ ALIGN; \
+ .global func_name; \
+ .type func_name,@function; \
+func_name:
+
+#define END_FUNCTION(func_name) \
+ ret
+
+/*************************************************
+* Loop Control *
+*************************************************/
+#define START_LOOP(LABEL) \
+ ALIGN; \
+ LABEL##_LOOP:
+
+#define LOOP_UNTIL_EQ(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jne LABEL##_LOOP
+
+#define LOOP_UNTIL_LT(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jge LABEL##_LOOP
+
+/*************************************************
+* Conditional Jumps *
+*************************************************/
+#define JUMP_IF_ZERO(REG, LABEL) \
+ cmpl IMM(0), REG; \
+ jz LABEL
+
+#define JUMP_IF_LT(REG, NUM, LABEL) \
+ cmpl IMM(NUM), REG; \
+ jl LABEL
+
+/*************************************************
+* Register Names *
+*************************************************/
+#define EAX %eax
+#define EBX %ebx
+#define ECX %ecx
+#define EDX %edx
+#define EBP %ebp
+#define EDI %edi
+#define ESI %esi
+#define ESP %esp
+
+/*************************************************
+* Memory Access Operations *
+*************************************************/
+#define ARRAY1(REG, NUM) (NUM)(REG)
+#define ARRAY4(REG, NUM) 4*(NUM)(REG)
+#define ARRAY4_INDIRECT(BASE, OFFSET, NUM) 4*(NUM)(BASE,OFFSET,4)
+#define ARG(NUM) 4*(PUSHED) + ARRAY4(ESP, NUM)
+
+#define ASSIGN(TO, FROM) movl FROM, TO
+#define ASSIGN_BYTE(TO, FROM) movzbl FROM, TO
+
+#define PUSH(REG) pushl REG
+#define POP(REG) popl REG
+
+#define SPILL_REGS() \
+ PUSH(EBP) ; \
+ PUSH(EDI) ; \
+ PUSH(ESI) ; \
+ PUSH(EBX)
+
+#define RESTORE_REGS() \
+ POP(EBX) ; \
+ POP(ESI) ; \
+ POP(EDI) ; \
+ POP(EBP)
+
+/*************************************************
+* ALU Operations *
+*************************************************/
+#define IMM(VAL) $VAL
+
+#define ADD(TO, FROM) addl FROM, TO
+#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM))
+#define ADD_W_CARRY(TO1, TO2, FROM) addl FROM, TO1; adcl IMM(0), TO2;
+#define SUB_IMM(TO, NUM) subl IMM(NUM), TO
+#define ADD2_IMM(TO, FROM, NUM) leal NUM(FROM), TO
+#define ADD3_IMM(TO, FROM, NUM) leal NUM(TO,FROM,1), TO
+#define MUL(REG) mull REG
+
+#define SHL_IMM(REG, SHIFT) shll IMM(SHIFT), REG
+#define SHR_IMM(REG, SHIFT) shrl IMM(SHIFT), REG
+#define SHL2_3(TO, FROM) leal 0(,FROM,8), TO
+
+#define XOR(TO, FROM) xorl FROM, TO
+#define AND(TO, FROM) andl FROM, TO
+#define OR(TO, FROM) orl FROM, TO
+#define NOT(REG) notl REG
+#define ZEROIZE(REG) XOR(REG, REG)
+
+#define ROTL_IMM(REG, NUM) roll IMM(NUM), REG
+#define ROTR_IMM(REG, NUM) rorl IMM(NUM), REG
+#define BSWAP(REG) bswapl REG
+
+#endif
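
Reviewer note (not part of the patch): unlike the amd64 header, arguments here
come off the stack, so ARG(NUM) bakes in the PUSHED constant each file defines
after SPILL_REGS(). The offset works out as below -- four saved registers, then
the return address, then the cdecl arguments:

    // with PUSHED == 4: ARG(1) -> 20(%esp), ARG(2) -> 24(%esp), ...
    constexpr unsigned arg_offset(unsigned pushed, unsigned n)
       {
       return 4*pushed + 4*n;   // saved regs, then return address, then args
       }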
diff --git a/src/asm/asm_ia32/md4.cpp b/src/asm/asm_ia32/md4.cpp
new file mode 100644
index 000000000..e3dc79012
--- /dev/null
+++ b/src/asm/asm_ia32/md4.cpp
@@ -0,0 +1,43 @@
+/*************************************************
+* MD4 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/md4.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void md4_core(u32bit[4], const byte[64], u32bit[16]);
+
+/*************************************************
+* MD4 Compression Function *
+*************************************************/
+void MD4::hash(const byte input[])
+ {
+ md4_core(digest, input, M);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void MD4::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(3 - (j % 4), digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void MD4::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ M.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ }
+
+}
diff --git a/src/asm/asm_ia32/md4core.S b/src/asm/asm_ia32/md4core.S
new file mode 100644
index 000000000..662e9924a
--- /dev/null
+++ b/src/asm/asm_ia32/md4core.S
@@ -0,0 +1,135 @@
+/*************************************************
+* MD4 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(md4core.S)
+
+START_FUNCTION(md4_core)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(2)) /* input block */
+ ASSIGN(EDI, ARG(3)) /* expanded words */
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ADD_IMM(EBP, 16)
+
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ASSIGN(EBP, ARG(1))
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+#define MSG EDI
+#define T1 ESI
+#define T2 EBP
+
+#define FF(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, C) ; \
+ XOR(T2, D) ; \
+ AND(T2, B) ; \
+ XOR(T2, D) ; \
+ ADD(A, T1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+#define GG(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ OR(T2, C) ; \
+ AND(T2, D) ; \
+ ADD3_IMM(A, T1, 0x5A827999) ; \
+ ASSIGN(T1, B) ; \
+ AND(T1, C) ; \
+ OR(T2, T1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+#define HH(A, B, C, D, N, S) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, 0x6ED9EBA1) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ;
+
+ FF(EAX,EBX,ECX,EDX, 0, 3);
+ FF(EDX,EAX,EBX,ECX, 1, 7);
+ FF(ECX,EDX,EAX,EBX, 2,11);
+ FF(EBX,ECX,EDX,EAX, 3,19);
+ FF(EAX,EBX,ECX,EDX, 4, 3);
+ FF(EDX,EAX,EBX,ECX, 5, 7);
+ FF(ECX,EDX,EAX,EBX, 6,11);
+ FF(EBX,ECX,EDX,EAX, 7,19);
+ FF(EAX,EBX,ECX,EDX, 8, 3);
+ FF(EDX,EAX,EBX,ECX, 9, 7);
+ FF(ECX,EDX,EAX,EBX,10,11);
+ FF(EBX,ECX,EDX,EAX,11,19);
+ FF(EAX,EBX,ECX,EDX,12, 3);
+ FF(EDX,EAX,EBX,ECX,13, 7);
+ FF(ECX,EDX,EAX,EBX,14,11);
+ FF(EBX,ECX,EDX,EAX,15,19);
+
+ GG(EAX,EBX,ECX,EDX, 0, 3);
+ GG(EDX,EAX,EBX,ECX, 4, 5);
+ GG(ECX,EDX,EAX,EBX, 8, 9);
+ GG(EBX,ECX,EDX,EAX,12,13);
+ GG(EAX,EBX,ECX,EDX, 1, 3);
+ GG(EDX,EAX,EBX,ECX, 5, 5);
+ GG(ECX,EDX,EAX,EBX, 9, 9);
+ GG(EBX,ECX,EDX,EAX,13,13);
+ GG(EAX,EBX,ECX,EDX, 2, 3);
+ GG(EDX,EAX,EBX,ECX, 6, 5);
+ GG(ECX,EDX,EAX,EBX,10, 9);
+ GG(EBX,ECX,EDX,EAX,14,13);
+ GG(EAX,EBX,ECX,EDX, 3, 3);
+ GG(EDX,EAX,EBX,ECX, 7, 5);
+ GG(ECX,EDX,EAX,EBX,11, 9);
+ GG(EBX,ECX,EDX,EAX,15,13);
+
+ HH(EAX,EBX,ECX,EDX, 0, 3);
+ HH(EDX,EAX,EBX,ECX, 8, 9);
+ HH(ECX,EDX,EAX,EBX, 4,11);
+ HH(EBX,ECX,EDX,EAX,12,15);
+ HH(EAX,EBX,ECX,EDX, 2, 3);
+ HH(EDX,EAX,EBX,ECX,10, 9);
+ HH(ECX,EDX,EAX,EBX, 6,11);
+ HH(EBX,ECX,EDX,EAX,14,15);
+ HH(EAX,EBX,ECX,EDX, 1, 3);
+ HH(EDX,EAX,EBX,ECX, 9, 9);
+ HH(ECX,EDX,EAX,EBX, 5,11);
+ HH(EBX,ECX,EDX,EAX,13,15);
+ HH(EAX,EBX,ECX,EDX, 3, 3);
+ HH(EDX,EAX,EBX,ECX,11, 9);
+ HH(ECX,EDX,EAX,EBX, 7,11);
+ HH(EBX,ECX,EDX,EAX,15,15);
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EAX)
+ ADD(ARRAY4(EBP, 1), EBX)
+ ADD(ARRAY4(EBP, 2), ECX)
+ ADD(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+END_FUNCTION(md4_core)
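
Reviewer note (not part of the patch): FF computes MD4's selection function
F(B,C,D) = (B & C) | (~B & D) in the branch-free form D ^ (B & (C ^ D)) -- that
is what the T2 shuffle does. One step, in scalar C++:

    #include <stdint.h>
    typedef uint32_t u32bit;

    inline u32bit rotl(u32bit x, int s) { return (x << s) | (x >> (32 - s)); }

    inline void md4_ff(u32bit& A, u32bit B, u32bit C, u32bit D, u32bit M, int S)
       {
       A += (D ^ (B & (C ^ D))) + M;   // F(B,C,D) plus the message word
       A = rotl(A, S);
       }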
diff --git a/src/asm/asm_ia32/md5.cpp b/src/asm/asm_ia32/md5.cpp
new file mode 100644
index 000000000..cfe48e7e9
--- /dev/null
+++ b/src/asm/asm_ia32/md5.cpp
@@ -0,0 +1,43 @@
+/*************************************************
+* MD5 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/md5.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void md5_core(u32bit[4], const byte[64], u32bit[16]);
+
+/*************************************************
+* MD5 Compression Function *
+*************************************************/
+void MD5::hash(const byte input[])
+ {
+ md5_core(digest, input, M);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void MD5::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(3 - (j % 4), digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void MD5::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ M.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ }
+
+}
diff --git a/src/asm/asm_ia32/md5core.S b/src/asm/asm_ia32/md5core.S
new file mode 100644
index 000000000..8ebe469f3
--- /dev/null
+++ b/src/asm/asm_ia32/md5core.S
@@ -0,0 +1,164 @@
+/*************************************************
+* MD5 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(md5core.S)
+
+START_FUNCTION(md5_core)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(2)) /* input block */
+ ASSIGN(EDI, ARG(3)) /* expanded words */
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ADD_IMM(EBP, 16)
+
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ASSIGN(EBP, ARG(1))
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+#define MSG EDI
+#define T1 ESI
+#define T2 EBP
+
+#define FF(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, C) ; \
+ XOR(T2, D) ; \
+ AND(T2, B) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define GG(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ AND(T2, D) ; \
+ XOR(T2, C) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define HH(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, B) ; \
+ XOR(T2, C) ; \
+ XOR(T2, D) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+#define II(A, B, C, D, N, S, MAGIC) \
+ ASSIGN(T1, ARRAY4(MSG, N)) ; \
+ ASSIGN(T2, D) ; \
+ NOT(T2) ; \
+ OR(T2, B) ; \
+ XOR(T2, C) ; \
+ ADD3_IMM(A, T1, MAGIC) ; \
+ ADD(A, T2) ; \
+ ROTL_IMM(A, S) ; \
+ ADD(A, B) ;
+
+ FF(EAX,EBX,ECX,EDX, 0, 7,0xD76AA478);
+ FF(EDX,EAX,EBX,ECX, 1,12,0xE8C7B756);
+ FF(ECX,EDX,EAX,EBX, 2,17,0x242070DB);
+ FF(EBX,ECX,EDX,EAX, 3,22,0xC1BDCEEE);
+ FF(EAX,EBX,ECX,EDX, 4, 7,0xF57C0FAF);
+ FF(EDX,EAX,EBX,ECX, 5,12,0x4787C62A);
+ FF(ECX,EDX,EAX,EBX, 6,17,0xA8304613);
+ FF(EBX,ECX,EDX,EAX, 7,22,0xFD469501);
+ FF(EAX,EBX,ECX,EDX, 8, 7,0x698098D8);
+ FF(EDX,EAX,EBX,ECX, 9,12,0x8B44F7AF);
+ FF(ECX,EDX,EAX,EBX,10,17,0xFFFF5BB1);
+ FF(EBX,ECX,EDX,EAX,11,22,0x895CD7BE);
+ FF(EAX,EBX,ECX,EDX,12, 7,0x6B901122);
+ FF(EDX,EAX,EBX,ECX,13,12,0xFD987193);
+ FF(ECX,EDX,EAX,EBX,14,17,0xA679438E);
+ FF(EBX,ECX,EDX,EAX,15,22,0x49B40821);
+
+ GG(EAX,EBX,ECX,EDX, 1, 5,0xF61E2562);
+ GG(EDX,EAX,EBX,ECX, 6, 9,0xC040B340);
+ GG(ECX,EDX,EAX,EBX,11,14,0x265E5A51);
+ GG(EBX,ECX,EDX,EAX, 0,20,0xE9B6C7AA);
+ GG(EAX,EBX,ECX,EDX, 5, 5,0xD62F105D);
+ GG(EDX,EAX,EBX,ECX,10, 9,0x02441453);
+ GG(ECX,EDX,EAX,EBX,15,14,0xD8A1E681);
+ GG(EBX,ECX,EDX,EAX, 4,20,0xE7D3FBC8);
+ GG(EAX,EBX,ECX,EDX, 9, 5,0x21E1CDE6);
+ GG(EDX,EAX,EBX,ECX,14, 9,0xC33707D6);
+ GG(ECX,EDX,EAX,EBX, 3,14,0xF4D50D87);
+ GG(EBX,ECX,EDX,EAX, 8,20,0x455A14ED);
+ GG(EAX,EBX,ECX,EDX,13, 5,0xA9E3E905);
+ GG(EDX,EAX,EBX,ECX, 2, 9,0xFCEFA3F8);
+ GG(ECX,EDX,EAX,EBX, 7,14,0x676F02D9);
+ GG(EBX,ECX,EDX,EAX,12,20,0x8D2A4C8A);
+
+ HH(EAX,EBX,ECX,EDX, 5, 4,0xFFFA3942);
+ HH(EDX,EAX,EBX,ECX, 8,11,0x8771F681);
+ HH(ECX,EDX,EAX,EBX,11,16,0x6D9D6122);
+ HH(EBX,ECX,EDX,EAX,14,23,0xFDE5380C);
+ HH(EAX,EBX,ECX,EDX, 1, 4,0xA4BEEA44);
+ HH(EDX,EAX,EBX,ECX, 4,11,0x4BDECFA9);
+ HH(ECX,EDX,EAX,EBX, 7,16,0xF6BB4B60);
+ HH(EBX,ECX,EDX,EAX,10,23,0xBEBFBC70);
+ HH(EAX,EBX,ECX,EDX,13, 4,0x289B7EC6);
+ HH(EDX,EAX,EBX,ECX, 0,11,0xEAA127FA);
+ HH(ECX,EDX,EAX,EBX, 3,16,0xD4EF3085);
+ HH(EBX,ECX,EDX,EAX, 6,23,0x04881D05);
+ HH(EAX,EBX,ECX,EDX, 9, 4,0xD9D4D039);
+ HH(EDX,EAX,EBX,ECX,12,11,0xE6DB99E5);
+ HH(ECX,EDX,EAX,EBX,15,16,0x1FA27CF8);
+ HH(EBX,ECX,EDX,EAX, 2,23,0xC4AC5665);
+
+ II(EAX,EBX,ECX,EDX, 0, 6,0xF4292244);
+ II(EDX,EAX,EBX,ECX, 7,10,0x432AFF97);
+ II(ECX,EDX,EAX,EBX,14,15,0xAB9423A7);
+ II(EBX,ECX,EDX,EAX, 5,21,0xFC93A039);
+ II(EAX,EBX,ECX,EDX,12, 6,0x655B59C3);
+ II(EDX,EAX,EBX,ECX, 3,10,0x8F0CCC92);
+ II(ECX,EDX,EAX,EBX,10,15,0xFFEFF47D);
+ II(EBX,ECX,EDX,EAX, 1,21,0x85845DD1);
+ II(EAX,EBX,ECX,EDX, 8, 6,0x6FA87E4F);
+ II(EDX,EAX,EBX,ECX,15,10,0xFE2CE6E0);
+ II(ECX,EDX,EAX,EBX, 6,15,0xA3014314);
+ II(EBX,ECX,EDX,EAX,13,21,0x4E0811A1);
+ II(EAX,EBX,ECX,EDX, 4, 6,0xF7537E82);
+ II(EDX,EAX,EBX,ECX,11,10,0xBD3AF235);
+ II(ECX,EDX,EAX,EBX, 2,15,0x2AD7D2BB);
+ II(EBX,ECX,EDX,EAX, 9,21,0xEB86D391);
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EAX)
+ ADD(ARRAY4(EBP, 1), EBX)
+ ADD(ARRAY4(EBP, 2), ECX)
+ ADD(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+END_FUNCTION(md5_core)
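
Reviewer note (not part of the patch): the MD5 steps mirror the MD4 ones but add
a per-step constant and, at the end, B (the trailing ADD(A, B) in each macro).
The FF step in scalar C++; GG/HH/II differ only in the boolean function:

    #include <stdint.h>
    typedef uint32_t u32bit;

    inline u32bit rotl(u32bit x, int s) { return (x << s) | (x >> (32 - s)); }

    inline void md5_ff(u32bit& A, u32bit B, u32bit C, u32bit D,
                       u32bit M, int S, u32bit K)
       {
       A += (D ^ (B & (C ^ D))) + M + K;
       A = rotl(A, S) + B;
       }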
diff --git a/src/asm/asm_ia32/modinfo.txt b/src/asm/asm_ia32/modinfo.txt
new file mode 100644
index 000000000..12c8cd96d
--- /dev/null
+++ b/src/asm/asm_ia32/modinfo.txt
@@ -0,0 +1,43 @@
+realname "x86 Assembler"
+
+#mp_bits 32
+
+load_on asm_ok
+
+<replace>
+md4.cpp
+md5.cpp
+sha160.cpp
+serpent.cpp
+</replace>
+
+<ignore>
+#mp_mulop.cpp
+</ignore>
+
+<add>
+asm_macr.h
+md4core.S
+md5core.S
+sha1_asm.S
+serp_asm.S
+#mp_mulop.S
+</add>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
diff --git a/src/asm/asm_ia32/mp_mulop.S b/src/asm/asm_ia32/mp_mulop.S
new file mode 100644
index 000000000..a5f0d3b27
--- /dev/null
+++ b/src/asm/asm_ia32/mp_mulop.S
@@ -0,0 +1,62 @@
+/*************************************************
+* Multiply/Add Algorithm Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop.S)
+
+START_FUNCTION(bigint_mul_add_words)
+ SPILL_REGS()
+#define PUSHED 4
+
+#define LOOP_CTR ESI
+ ASSIGN(LOOP_CTR, ARG(3)) /* x_size */
+ ZEROIZE(EDI)
+
+ ASSIGN(ECX, ARG(1)) /* z[] */
+ ASSIGN(EBX, ARG(2)) /* x[] */
+ ASSIGN(EBP, ARG(4)) /* y */
+
+#define MULADD_OP(N) \
+ ASSIGN(EAX, ARRAY4(EBX, N)) ; \
+ MUL(EBP) ; \
+ ADD_W_CARRY(EAX, EDX, EDI) ; \
+ ASSIGN(EDI, EDX) ; \
+ ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ;
+
+ JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+ JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP)
+
+START_LOOP(.MULADD8)
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(EBX, 32)
+ ADD_IMM(ECX, 32)
+LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8)
+
+ JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE)
+
+START_LOOP(.MULADD1)
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(EBX, 4)
+ ADD_IMM(ECX, 4)
+LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1)
+
+.MUL_ADD_DONE:
+
+ ASSIGN(EAX, EDI)
+#undef PUSHED
+ RESTORE_REGS()
+END_FUNCTION(bigint_mul_add_words)
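
Reviewer note (not part of the patch): bigint_mul_add_words computes
z[0..x_size) += x[0..x_size) * y, carrying in EDI throughout and returning the
final carry in EAX. The loop being unrolled, in C++:

    #include <stdint.h>
    typedef uint32_t u32bit;
    typedef uint32_t word;      // 32-bit words on ia32
    typedef uint64_t dword;

    word mul_add_words_sketch(word z[], const word x[], u32bit x_size, word y)
       {
       word carry = 0;
       for(u32bit i = 0; i != x_size; ++i)
          {
          dword t = (dword)x[i] * y + z[i] + carry;
          z[i]  = (word)t;
          carry = (word)(t >> 32);
          }
       return carry;
       }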
diff --git a/src/asm/asm_ia32/serp_asm.S b/src/asm/asm_ia32/serp_asm.S
new file mode 100644
index 000000000..c8915382d
--- /dev/null
+++ b/src/asm/asm_ia32/serp_asm.S
@@ -0,0 +1,667 @@
+/*************************************************
+* Serpent Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(serp_asm.S)
+
+#define SBOX_E1(A, B, C, D, T) \
+ XOR(D, A) ; \
+ ASSIGN(T, B) ; \
+ AND(B, D) ; \
+ XOR(T, C) ; \
+ XOR(B, A) ; \
+ OR(A, D) ; \
+ XOR(A, T) ; \
+ XOR(T, D) ; \
+ XOR(D, C) ; \
+ OR(C, B) ; \
+ XOR(C, T) ; \
+ NOT(T) ; \
+ OR(T, B) ; \
+ XOR(B, D) ; \
+ XOR(B, T) ; \
+ OR(D, A) ; \
+ XOR(B, D) ; \
+ XOR(T, D) ; \
+ ASSIGN(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E2(A, B, C, D, T) \
+ NOT(A) ; \
+ NOT(C) ; \
+ ASSIGN(T, A) ; \
+ AND(A, B) ; \
+ XOR(C, A) ; \
+ OR(A, D) ; \
+ XOR(D, C) ; \
+ XOR(B, A) ; \
+ XOR(A, T) ; \
+ OR(T, B) ; \
+ XOR(B, D) ; \
+ OR(C, A) ; \
+ AND(C, T) ; \
+ XOR(A, B) ; \
+ AND(B, C) ; \
+ XOR(B, A) ; \
+ AND(A, C) ; \
+ XOR(T, A) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E3(A, B, C, D, T) \
+ ASSIGN(T, A) ; \
+ AND(A, C) ; \
+ XOR(A, D) ; \
+ XOR(C, B) ; \
+ XOR(C, A) ; \
+ OR(D, T) ; \
+ XOR(D, B) ; \
+ XOR(T, C) ; \
+ ASSIGN(B, D) ; \
+ OR(D, T) ; \
+ XOR(D, A) ; \
+ AND(A, B) ; \
+ XOR(T, A) ; \
+ XOR(B, D) ; \
+ XOR(B, T) ; \
+ NOT(T) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E4(A, B, C, D, T) \
+ ASSIGN(T, A) ; \
+ OR(A, D) ; \
+ XOR(D, B) ; \
+ AND(B, T) ; \
+ XOR(T, C) ; \
+ XOR(C, D) ; \
+ AND(D, A) ; \
+ OR(T, B) ; \
+ XOR(D, T) ; \
+ XOR(A, B) ; \
+ AND(T, A) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ OR(B, A) ; \
+ XOR(B, C) ; \
+ XOR(A, D) ; \
+ ASSIGN(C, B) ; \
+ OR(B, D) ; \
+ XOR(B, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E5(A, B, C, D, T) \
+ XOR(B, D) ; \
+ NOT(D) ; \
+ XOR(C, D) ; \
+ XOR(D, A) ; \
+ ASSIGN(T, B) ; \
+ AND(B, D) ; \
+ XOR(B, C) ; \
+ XOR(T, D) ; \
+ XOR(A, T) ; \
+ AND(C, T) ; \
+ XOR(C, A) ; \
+ AND(A, B) ; \
+ XOR(D, A) ; \
+ OR(T, B) ; \
+ XOR(T, A) ; \
+ OR(A, D) ; \
+ XOR(A, C) ; \
+ AND(C, D) ; \
+ NOT(A) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_E6(A, B, C, D, T) \
+ XOR(A, B) ; \
+ XOR(B, D) ; \
+ NOT(D) ; \
+ ASSIGN(T, B) ; \
+ AND(B, A) ; \
+ XOR(C, D) ; \
+ XOR(B, C) ; \
+ OR(C, T) ; \
+ XOR(T, D) ; \
+ AND(D, B) ; \
+ XOR(D, A) ; \
+ XOR(T, B) ; \
+ XOR(T, C) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ NOT(C) ; \
+ XOR(A, T) ; \
+ OR(T, D) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_E7(A, B, C, D, T) \
+ NOT(C) ; \
+ ASSIGN(T, D) ; \
+ AND(D, A) ; \
+ XOR(A, T) ; \
+ XOR(D, C) ; \
+ OR(C, T) ; \
+ XOR(B, D) ; \
+ XOR(C, A) ; \
+ OR(A, B) ; \
+ XOR(C, B) ; \
+ XOR(T, A) ; \
+ OR(A, D) ; \
+ XOR(A, C) ; \
+ XOR(T, D) ; \
+ XOR(T, A) ; \
+ NOT(D) ; \
+ AND(C, T) ; \
+ XOR(C, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_E8(A, B, C, D, T) \
+ ASSIGN(T, B) ; \
+ OR(B, C) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ XOR(C, B) ; \
+ OR(D, T) ; \
+ AND(D, A) ; \
+ XOR(T, C) ; \
+ XOR(D, B) ; \
+ OR(B, T) ; \
+ XOR(B, A) ; \
+ OR(A, T) ; \
+ XOR(A, C) ; \
+ XOR(B, T) ; \
+ XOR(C, B) ; \
+ AND(B, A) ; \
+ XOR(B, T) ; \
+ NOT(C) ; \
+ OR(C, A) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, A) ; \
+ ASSIGN(A, T) ;
+
+#define SBOX_D1(A, B, C, D, T) \
+ NOT(C) ; \
+ ASSIGN(T, B) ; \
+ OR(B, A) ; \
+ NOT(T) ; \
+ XOR(B, C) ; \
+ OR(C, T) ; \
+ XOR(B, D) ; \
+ XOR(A, T) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ XOR(T, A) ; \
+ OR(A, B) ; \
+ XOR(A, C) ; \
+ XOR(D, T) ; \
+ XOR(C, B) ; \
+ XOR(D, A) ; \
+ XOR(D, B) ; \
+ AND(C, D) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_D2(A, B, C, D, T) \
+ ASSIGN(T, B) ; \
+ XOR(B, D) ; \
+ AND(D, B) ; \
+ XOR(T, C) ; \
+ XOR(D, A) ; \
+ OR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, T) ; \
+ OR(A, C) ; \
+ XOR(B, D) ; \
+ XOR(A, B) ; \
+ OR(B, D) ; \
+ XOR(B, A) ; \
+ NOT(T) ; \
+ XOR(T, B) ; \
+ OR(B, A) ; \
+ XOR(B, A) ; \
+ OR(B, T) ; \
+ XOR(D, B) ; \
+ ASSIGN(B, A) ; \
+ ASSIGN(A, T) ; \
+ ASSIGN(T, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D3(A, B, C, D, T) \
+ XOR(C, D) ; \
+ XOR(D, A) ; \
+ ASSIGN(T, D) ; \
+ AND(D, C) ; \
+ XOR(D, B) ; \
+ OR(B, C) ; \
+ XOR(B, T) ; \
+ AND(T, D) ; \
+ XOR(C, D) ; \
+ AND(T, A) ; \
+ XOR(T, C) ; \
+ AND(C, B) ; \
+ OR(C, A) ; \
+ NOT(D) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ AND(A, B) ; \
+ XOR(D, T) ; \
+ XOR(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ;
+
+#define SBOX_D4(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ XOR(C, B) ; \
+ XOR(A, C) ; \
+ AND(T, C) ; \
+ XOR(T, A) ; \
+ AND(A, B) ; \
+ XOR(B, D) ; \
+ OR(D, T) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(B, T) ; \
+ AND(D, C) ; \
+ XOR(D, B) ; \
+ XOR(B, A) ; \
+ OR(B, C) ; \
+ XOR(A, D) ; \
+ XOR(B, T) ; \
+ XOR(A, B) ; \
+ ASSIGN(T, A) ; \
+ ASSIGN(A, C) ; \
+ ASSIGN(C, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_D5(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ AND(C, D) ; \
+ XOR(C, B) ; \
+ OR(B, D) ; \
+ AND(B, A) ; \
+ XOR(T, C) ; \
+ XOR(T, B) ; \
+ AND(B, C) ; \
+ NOT(A) ; \
+ XOR(D, T) ; \
+ XOR(B, D) ; \
+ AND(D, A) ; \
+ XOR(D, C) ; \
+ XOR(A, B) ; \
+ AND(C, A) ; \
+ XOR(D, A) ; \
+ XOR(C, T) ; \
+ OR(C, D) ; \
+ XOR(D, A) ; \
+ XOR(C, B) ; \
+ ASSIGN(B, D) ; \
+ ASSIGN(D, T) ;
+
+#define SBOX_D6(A, B, C, D, T) \
+ NOT(B) ; \
+ ASSIGN(T, D) ; \
+ XOR(C, B) ; \
+ OR(D, A) ; \
+ XOR(D, C) ; \
+ OR(C, B) ; \
+ AND(C, A) ; \
+ XOR(T, D) ; \
+ XOR(C, T) ; \
+ OR(T, A) ; \
+ XOR(T, B) ; \
+ AND(B, C) ; \
+ XOR(B, D) ; \
+ XOR(T, C) ; \
+ AND(D, T) ; \
+ XOR(T, B) ; \
+ XOR(D, T) ; \
+ NOT(T) ; \
+ XOR(D, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, T) ; \
+ ASSIGN(T, D) ; \
+ ASSIGN(D, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D7(A, B, C, D, T) \
+ XOR(A, C) ; \
+ ASSIGN(T, C) ; \
+ AND(C, A) ; \
+ XOR(T, D) ; \
+ NOT(C) ; \
+ XOR(D, B) ; \
+ XOR(C, D) ; \
+ OR(T, A) ; \
+ XOR(A, C) ; \
+ XOR(D, T) ; \
+ XOR(T, B) ; \
+ AND(B, D) ; \
+ XOR(B, A) ; \
+ XOR(A, D) ; \
+ OR(A, C) ; \
+ XOR(D, B) ; \
+ XOR(T, A) ; \
+ ASSIGN(A, B) ; \
+ ASSIGN(B, C) ; \
+ ASSIGN(C, T) ;
+
+#define SBOX_D8(A, B, C, D, T) \
+ ASSIGN(T, C) ; \
+ XOR(C, A) ; \
+ AND(A, D) ; \
+ OR(T, D) ; \
+ NOT(C) ; \
+ XOR(D, B) ; \
+ OR(B, A) ; \
+ XOR(A, C) ; \
+ AND(C, T) ; \
+ AND(D, T) ; \
+ XOR(B, C) ; \
+ XOR(C, A) ; \
+ OR(A, C) ; \
+ XOR(T, B) ; \
+ XOR(A, D) ; \
+ XOR(D, T) ; \
+ OR(T, A) ; \
+ XOR(D, C) ; \
+ XOR(T, C) ; \
+ ASSIGN(C, B) ; \
+ ASSIGN(B, A) ; \
+ ASSIGN(A, D) ; \
+ ASSIGN(D, T) ;
+
+#define TRANSFORM(A, B, C, D, T) \
+ ROTL_IMM(A, 13) ; \
+ ROTL_IMM(C, 3) ; \
+ SHL2_3(T, A) ; \
+ XOR(B, A) ; \
+ XOR(D, C) ; \
+ XOR(B, C) ; \
+ XOR(D, T) ; \
+ ROTL_IMM(B, 1) ; \
+ ROTL_IMM(D, 7) ; \
+ ASSIGN(T, B) ; \
+ SHL_IMM(T, 7) ; \
+ XOR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(C, T) ; \
+ ROTL_IMM(A, 5) ; \
+ ROTL_IMM(C, 22) ;
+
+#define I_TRANSFORM(A, B, C, D, T) \
+ ROTR_IMM(C, 22) ; \
+ ROTR_IMM(A, 5) ; \
+ ASSIGN(T, B) ; \
+ SHL_IMM(T, 7) ; \
+ XOR(A, B) ; \
+ XOR(C, D) ; \
+ XOR(A, D) ; \
+ XOR(C, T) ; \
+ ROTR_IMM(D, 7) ; \
+ ROTR_IMM(B, 1) ; \
+ SHL2_3(T, A) ; \
+ XOR(B, C) ; \
+ XOR(D, C) ; \
+ XOR(B, A) ; \
+ XOR(D, T) ; \
+ ROTR_IMM(C, 3) ; \
+ ROTR_IMM(A, 13) ;
+
+#define KEY_XOR(A, B, C, D, N) \
+ XOR(A, ARRAY4(EDI, (4*N ))) ; \
+ XOR(B, ARRAY4(EDI, (4*N+1))) ; \
+ XOR(C, ARRAY4(EDI, (4*N+2))) ; \
+ XOR(D, ARRAY4(EDI, (4*N+3))) ;
+
+/*************************************************
+* Serpent Encryption *
+*************************************************/
+START_FUNCTION(serpent_encrypt)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(1)) /* input block */
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ASSIGN(EDI, ARG(3)) /* round keys */
+ ZEROIZE(EBP)
+
+#define E_ROUND(A, B, C, D, T, N, SBOX) \
+ KEY_XOR(A, B, C, D, N) \
+ SBOX(A, B, C, D, T) \
+ TRANSFORM(A, B, C, D, T)
+
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 0, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_E7)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_E8)
+
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_E1)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_E2)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_E3)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_E4)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_E5)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_E6)
+ E_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_E7)
+
+ KEY_XOR(EAX, EBX, ECX, EDX, 31)
+ SBOX_E8(EAX, EBX, ECX, EDX, EBP)
+ KEY_XOR(EAX, EBX, ECX, EDX, 32)
+
+ ASSIGN(EBP, ARG(2)) /* output block */
+ ASSIGN(ARRAY4(EBP, 0), EAX)
+ ASSIGN(ARRAY4(EBP, 1), EBX)
+ ASSIGN(ARRAY4(EBP, 2), ECX)
+ ASSIGN(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_encrypt)
+
+/*************************************************
+* Serpent Decryption *
+*************************************************/
+START_FUNCTION(serpent_decrypt)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EBP, ARG(1)) /* input block */
+ ASSIGN(EAX, ARRAY4(EBP, 0))
+ ASSIGN(EBX, ARRAY4(EBP, 1))
+ ASSIGN(ECX, ARRAY4(EBP, 2))
+ ASSIGN(EDX, ARRAY4(EBP, 3))
+
+ ASSIGN(EDI, ARG(3)) /* round keys */
+
+ ZEROIZE(EBP)
+
+#define D_ROUND(A, B, C, D, T, N, SBOX) \
+ I_TRANSFORM(A, B, C, D, T) \
+ SBOX(A, B, C, D, T) \
+ KEY_XOR(A, B, C, D, N) \
+
+ KEY_XOR(EAX, EBX, ECX, EDX, 32)
+ SBOX_D8(EAX, EBX, ECX, EDX, EBP)
+ KEY_XOR(EAX, EBX, ECX, EDX, 31)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_D1)
+
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_D8)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_D7)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_D6)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_D5)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_D4)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_D3)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_D2)
+ D_ROUND(EAX, EBX, ECX, EDX, EBP, 0, SBOX_D1)
+
+ ASSIGN(EBP, ARG(2)) /* output block */
+ ASSIGN(ARRAY4(EBP, 0), EAX)
+ ASSIGN(ARRAY4(EBP, 1), EBX)
+ ASSIGN(ARRAY4(EBP, 2), ECX)
+ ASSIGN(ARRAY4(EBP, 3), EDX)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_decrypt)
+
+/*************************************************
+* Serpent Key Schedule *
+*************************************************/
+START_FUNCTION(serpent_key_schedule)
+ SPILL_REGS()
+#define PUSHED 4
+
+ ASSIGN(EDI, ARG(1)) /* round keys */
+ ASSIGN(ESI, IMM(8))
+ ADD_IMM(EDI, 32)
+
+START_LOOP(.EXPANSION)
+ ASSIGN(EAX, ARRAY4(EDI, -1))
+ ASSIGN(EBX, ARRAY4(EDI, -3))
+ ASSIGN(ECX, ARRAY4(EDI, -5))
+ ASSIGN(EDX, ARRAY4(EDI, -8))
+
+ ASSIGN(EBP, ESI)
+ SUB_IMM(EBP, 8)
+ XOR(EBP, IMM(0x9E3779B9))
+ XOR(EAX, EBX)
+ XOR(ECX, EDX)
+ XOR(EAX, EBP)
+ XOR(EAX, ECX)
+
+ ROTL_IMM(EAX, 11)
+
+ ASSIGN(ARRAY4(EDI, 0), EAX)
+
+ ADD_IMM(ESI, 1)
+ ADD_IMM(EDI, 4)
+LOOP_UNTIL_EQ(ESI, 140, .EXPANSION)
+
+ ASSIGN(EDI, ARG(1)) /* round keys */
+
+#define LOAD_AND_SBOX(MSG, SBOX) \
+ ASSIGN(EAX, ARRAY4(EDI, (4*MSG+ 8))) ; \
+ ASSIGN(EBX, ARRAY4(EDI, (4*MSG+ 9))) ; \
+ ASSIGN(ECX, ARRAY4(EDI, (4*MSG+10))) ; \
+ ASSIGN(EDX, ARRAY4(EDI, (4*MSG+11))) ; \
+ SBOX(EAX, EBX, ECX, EDX, EBP) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+ 8)), EAX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+ 9)), EBX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+10)), ECX) ; \
+ ASSIGN(ARRAY4(EDI, (4*MSG+11)), EDX)
+
+ LOAD_AND_SBOX( 0, SBOX_E4)
+ LOAD_AND_SBOX( 1, SBOX_E3)
+ LOAD_AND_SBOX( 2, SBOX_E2)
+ LOAD_AND_SBOX( 3, SBOX_E1)
+
+ LOAD_AND_SBOX( 4, SBOX_E8)
+ LOAD_AND_SBOX( 5, SBOX_E7)
+ LOAD_AND_SBOX( 6, SBOX_E6)
+ LOAD_AND_SBOX( 7, SBOX_E5)
+ LOAD_AND_SBOX( 8, SBOX_E4)
+ LOAD_AND_SBOX( 9, SBOX_E3)
+ LOAD_AND_SBOX(10, SBOX_E2)
+ LOAD_AND_SBOX(11, SBOX_E1)
+
+ LOAD_AND_SBOX(12, SBOX_E8)
+ LOAD_AND_SBOX(13, SBOX_E7)
+ LOAD_AND_SBOX(14, SBOX_E6)
+ LOAD_AND_SBOX(15, SBOX_E5)
+ LOAD_AND_SBOX(16, SBOX_E4)
+ LOAD_AND_SBOX(17, SBOX_E3)
+ LOAD_AND_SBOX(18, SBOX_E2)
+ LOAD_AND_SBOX(19, SBOX_E1)
+
+ LOAD_AND_SBOX(20, SBOX_E8)
+ LOAD_AND_SBOX(21, SBOX_E7)
+ LOAD_AND_SBOX(22, SBOX_E6)
+ LOAD_AND_SBOX(23, SBOX_E5)
+ LOAD_AND_SBOX(24, SBOX_E4)
+ LOAD_AND_SBOX(25, SBOX_E3)
+ LOAD_AND_SBOX(26, SBOX_E2)
+ LOAD_AND_SBOX(27, SBOX_E1)
+
+ LOAD_AND_SBOX(28, SBOX_E8)
+ LOAD_AND_SBOX(29, SBOX_E7)
+ LOAD_AND_SBOX(30, SBOX_E6)
+ LOAD_AND_SBOX(31, SBOX_E5)
+ LOAD_AND_SBOX(32, SBOX_E4)
+
+ RESTORE_REGS()
+#undef PUSHED
+END_FUNCTION(serpent_key_schedule)
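
Reviewer note (not part of the patch): the .EXPANSION loop is Serpent's prekey
recurrence w_i = (w_{i-8} ^ w_{i-5} ^ w_{i-3} ^ w_{i-1} ^ phi ^ i) <<< 11, with
the padded user key in W[0..7] and w_i stored at W[8+i] (hence ESI running from
8 to 139 and EBP holding ESI - 8, the spec's i). In scalar C++:

    #include <stdint.h>
    typedef uint32_t u32bit;

    inline u32bit rotl11(u32bit x) { return (x << 11) | (x >> 21); }

    void serpent_expand_sketch(u32bit W[140])
       {
       const u32bit PHI = 0x9E3779B9;
       for(u32bit i = 0; i != 132; ++i)
          W[i+8] = rotl11(W[i] ^ W[i+3] ^ W[i+5] ^ W[i+7] ^ PHI ^ i);
       }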
diff --git a/src/asm/asm_ia32/serpent.cpp b/src/asm/asm_ia32/serpent.cpp
new file mode 100644
index 000000000..aacb72b0f
--- /dev/null
+++ b/src/asm/asm_ia32/serpent.cpp
@@ -0,0 +1,49 @@
+/*************************************************
+* Serpent Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/serpent.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" {
+
+void serpent_encrypt(const byte[16], byte[16], const u32bit[132]);
+void serpent_decrypt(const byte[16], byte[16], const u32bit[132]);
+void serpent_key_schedule(u32bit[140]);
+
+}
+
+/*************************************************
+* Serpent Encryption *
+*************************************************/
+void Serpent::enc(const byte in[], byte out[]) const
+ {
+ serpent_encrypt(in, out, round_key);
+ }
+
+/*************************************************
+* Serpent Decryption *
+*************************************************/
+void Serpent::dec(const byte in[], byte out[]) const
+ {
+ serpent_decrypt(in, out, round_key);
+ }
+
+/*************************************************
+* Serpent Key Schedule *
+*************************************************/
+void Serpent::key(const byte key[], u32bit length)
+ {
+ SecureBuffer<u32bit, 140> W;
+ for(u32bit j = 0; j != length / 4; ++j)
+ W[j] = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]);
+ W[length / 4] |= u32bit(1) << ((length%4)*8);
+
+ serpent_key_schedule(W);
+ round_key.copy(W + 8, 132);
+ }
+
+}
diff --git a/src/asm/asm_ia32/sha160.cpp b/src/asm/asm_ia32/sha160.cpp
new file mode 100644
index 000000000..7725541d5
--- /dev/null
+++ b/src/asm/asm_ia32/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+extern "C" void botan_sha160_asm_ia32(u32bit[5], const byte[64], u32bit[81]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha160_asm_ia32(digest, input, W);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ W.clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
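+// W has 81 words rather than 80: the assembly routine saves the caller's
+// ESP in the extra slot W[80] while it uses ESP to address the schedule.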
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(81)
+ {
+ clear();
+ }
+
+}
diff --git a/src/asm/asm_ia32/sha1_asm.S b/src/asm/asm_ia32/sha1_asm.S
new file mode 100644
index 000000000..85bc9dc2c
--- /dev/null
+++ b/src/asm/asm_ia32/sha1_asm.S
@@ -0,0 +1,242 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(sha1_asm.S)
+
+START_FUNCTION(botan_sha160_asm_ia32)
+ SPILL_REGS()
+
+#define PUSHED 4
+
+ ASSIGN(EDI, ARG(2))
+ ASSIGN(EBP, ARG(3))
+
+ ZEROIZE(ESI)
+
+START_LOOP(.LOAD_INPUT)
+ ADD_IMM(ESI, 4)
+
+ ASSIGN(EAX, ARRAY4(EDI, 0))
+ ASSIGN(EBX, ARRAY4(EDI, 1))
+ ASSIGN(ECX, ARRAY4(EDI, 2))
+ ASSIGN(EDX, ARRAY4(EDI, 3))
+
+ ADD_IMM(EDI, 16)
+
+ BSWAP(EAX)
+ BSWAP(EBX)
+ BSWAP(ECX)
+ BSWAP(EDX)
+
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-4), EAX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-3), EBX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-2), ECX)
+ ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-1), EDX)
+LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT)
+
+ ADD2_IMM(EDI, EBP, 64)
+
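+	/* Expand the message schedule, four words per iteration:
+	   W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
+	   The fourth word of each group depends on the first, so the freshly
+	   computed W[t] is XORed in before the final rotate. */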
+START_LOOP(.EXPANSION)
+ ADD_IMM(ESI, 4)
+
+ ZEROIZE(EAX)
+ ASSIGN(EBX, ARRAY4(EDI, -1))
+ ASSIGN(ECX, ARRAY4(EDI, -2))
+ ASSIGN(EDX, ARRAY4(EDI, -3))
+
+ XOR(EAX, ARRAY4(EDI, -5))
+ XOR(EBX, ARRAY4(EDI, -6))
+ XOR(ECX, ARRAY4(EDI, -7))
+ XOR(EDX, ARRAY4(EDI, -8))
+
+ XOR(EAX, ARRAY4(EDI, -11))
+ XOR(EBX, ARRAY4(EDI, -12))
+ XOR(ECX, ARRAY4(EDI, -13))
+ XOR(EDX, ARRAY4(EDI, -14))
+
+ XOR(EAX, ARRAY4(EDI, -13))
+ XOR(EBX, ARRAY4(EDI, -14))
+ XOR(ECX, ARRAY4(EDI, -15))
+ XOR(EDX, ARRAY4(EDI, -16))
+
+ ROTL_IMM(EDX, 1)
+ ROTL_IMM(ECX, 1)
+ ROTL_IMM(EBX, 1)
+ XOR(EAX, EDX)
+ ROTL_IMM(EAX, 1)
+
+ ASSIGN(ARRAY4(EDI, 0), EDX)
+ ASSIGN(ARRAY4(EDI, 1), ECX)
+ ASSIGN(ARRAY4(EDI, 2), EBX)
+ ASSIGN(ARRAY4(EDI, 3), EAX)
+
+ ADD_IMM(EDI, 16)
+LOOP_UNTIL_EQ(ESI, 80, .EXPANSION)
+
+#define MAGIC1 0x5A827999
+#define MAGIC2 0x6ED9EBA1
+#define MAGIC3 0x8F1BBCDC
+#define MAGIC4 0xCA62C1D6
+
+#define MSG ESP
+#define T2 EBP
+
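+/* Each round macro computes E = f(B,C,D) + E + rotl(F,5) + W[N] + MAGIC,
+   rotates B right by 2, and copies the old F into A; the unrolled rounds
+   below rename the registers from call to call to get the usual SHA-1
+   rotation of working variables. F1 is the choice function ((C^D)&B)^D,
+   F2/F4 are parity (XOR), and F3 is majority. */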
+#define F1(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, C) ; \
+ XOR(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC1) ; \
+ AND(E, B) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2_4(A, B, C, D, E, F, N, MAGIC) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ XOR(E, C) ; \
+ ADD3_IMM(F, T2, MAGIC) ; \
+ XOR(E, D) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F3(A, B, C, D, E, F, N) \
+ ASSIGN(T2, ARRAY4(MSG, N)) ; \
+ ASSIGN(A, F) ; \
+ ROTL_IMM(F, 5) ; \
+ ADD(F, E) ; \
+ ASSIGN(E, B) ; \
+ OR(E, C) ; \
+ AND(E, D) ; \
+ ADD3_IMM(F, T2, MAGIC3) ; \
+ ASSIGN(T2, B) ; \
+ AND(T2, C) ; \
+ OR(E, T2) ; \
+ ROTR_IMM(B, 2) ; \
+ ADD(E, F) ;
+
+#define F2(A, B, C, D, E, F, N) \
+   F2_4(A, B, C, D, E, F, N, MAGIC2)
+
+#define F4(A, B, C, D, E, F, N) \
+   F2_4(A, B, C, D, E, F, N, MAGIC4)
+
+ ASSIGN(EAX, ARG(1))
+ ASSIGN(EDI, ARRAY4(EAX, 0))
+ ASSIGN(EBX, ARRAY4(EAX, 1))
+ ASSIGN(ECX, ARRAY4(EAX, 2))
+ ASSIGN(EDX, ARRAY4(EAX, 3))
+ ASSIGN(ESI, ARRAY4(EAX, 4))
+
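+	/* Stash the caller's ESP in the spare slot W[80] and point ESP at
+	   the schedule, so the round macros can address W through MSG while
+	   EBP stays free for the T2 scratch register. */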
+ ASSIGN(ARRAY4(EBP, 80), ESP)
+ ASSIGN(ESP, EBP)
+
+ /* First Round */
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 0)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 1)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 2)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 3)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 4)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 5)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 6)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 7)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 8)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 9)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 10)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 11)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 12)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 13)
+ F1(ESI, EDI, EAX, EBX, ECX, EDX, 14)
+ F1(EDX, ESI, EDI, EAX, EBX, ECX, 15)
+ F1(ECX, EDX, ESI, EDI, EAX, EBX, 16)
+ F1(EBX, ECX, EDX, ESI, EDI, EAX, 17)
+ F1(EAX, EBX, ECX, EDX, ESI, EDI, 18)
+ F1(EDI, EAX, EBX, ECX, EDX, ESI, 19)
+
+ /* Second Round */
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 20)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 21)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 22)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 23)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 24)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 25)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 26)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 27)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 28)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 29)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 30)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 31)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 32)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 33)
+ F2(ECX, EDX, ESI, EDI, EAX, EBX, 34)
+ F2(EBX, ECX, EDX, ESI, EDI, EAX, 35)
+ F2(EAX, EBX, ECX, EDX, ESI, EDI, 36)
+ F2(EDI, EAX, EBX, ECX, EDX, ESI, 37)
+ F2(ESI, EDI, EAX, EBX, ECX, EDX, 38)
+ F2(EDX, ESI, EDI, EAX, EBX, ECX, 39)
+
+ /* Third Round */
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 40)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 41)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 42)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 43)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 44)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 45)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 46)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 47)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 48)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 49)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 50)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 51)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 52)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 53)
+ F3(EAX, EBX, ECX, EDX, ESI, EDI, 54)
+ F3(EDI, EAX, EBX, ECX, EDX, ESI, 55)
+ F3(ESI, EDI, EAX, EBX, ECX, EDX, 56)
+ F3(EDX, ESI, EDI, EAX, EBX, ECX, 57)
+ F3(ECX, EDX, ESI, EDI, EAX, EBX, 58)
+ F3(EBX, ECX, EDX, ESI, EDI, EAX, 59)
+
+ /* Fourth Round */
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 60)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 61)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 62)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 63)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 64)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 65)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 66)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 67)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 68)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 69)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 70)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 71)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 72)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 73)
+ F4(ESI, EDI, EAX, EBX, ECX, EDX, 74)
+ F4(EDX, ESI, EDI, EAX, EBX, ECX, 75)
+ F4(ECX, EDX, ESI, EDI, EAX, EBX, 76)
+ F4(EBX, ECX, EDX, ESI, EDI, EAX, 77)
+ F4(EAX, EBX, ECX, EDX, ESI, EDI, 78)
+ F4(EDI, EAX, EBX, ECX, EDX, ESI, 79)
+
+ ASSIGN(ESP, ARRAY4(ESP, 80))
+
+ ASSIGN(EBP, ARG(1))
+ ADD(ARRAY4(EBP, 0), EDX)
+ ADD(ARRAY4(EBP, 1), EDI)
+ ADD(ARRAY4(EBP, 2), EAX)
+ ADD(ARRAY4(EBP, 3), EBX)
+ ADD(ARRAY4(EBP, 4), ECX)
+
+ RESTORE_REGS()
+END_FUNCTION(botan_sha160_asm_ia32)
diff --git a/src/asm/mp_amd64/bswap.h b/src/asm/mp_amd64/bswap.h
new file mode 100644
index 000000000..3c77b460c
--- /dev/null
+++ b/src/asm/mp_amd64/bswap.h
@@ -0,0 +1,36 @@
+/*************************************************
+* Byte Swapping Operations Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_BSWAP_H__
+#define BOTAN_BSWAP_H__
+
+#include <botan/types.h>
+#include <botan/rotate.h>
+
+namespace Botan {
+
+/*************************************************
+* Byte Swapping Functions *
+*************************************************/
+inline u16bit reverse_bytes(u16bit input)
+ {
+ return rotate_left(input, 8);
+ }
+
+inline u32bit reverse_bytes(u32bit input)
+ {
+ asm("bswapl %0" : "=r" (input) : "0" (input));
+ return input;
+ }
+
+inline u64bit reverse_bytes(u64bit input)
+ {
+ asm("bswapq %0" : "=r" (input) : "0" (input));
+ return input;
+ }
+
+}
+
+#endif
diff --git a/src/asm/mp_amd64/modinfo.txt b/src/asm/mp_amd64/modinfo.txt
new file mode 100644
index 000000000..a042a3976
--- /dev/null
+++ b/src/asm/mp_amd64/modinfo.txt
@@ -0,0 +1,21 @@
+realname "x86-64 MPI Assembler Core"
+
+mp_bits 64
+
+load_on asm_ok
+
+<replace>
+bswap.h
+mp_asm.h
+mp_asmi.h
+#mp_mulop.cpp
+</replace>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/src/asm/mp_amd64/mp_asm.h b/src/asm/mp_amd64/mp_asm.h
new file mode 100644
index 000000000..eca7bae6c
--- /dev/null
+++ b/src/asm/mp_amd64/mp_asm.h
@@ -0,0 +1,67 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2008 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_H__
+#define BOTAN_MP_ASM_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 64)
+ #error The mp_amd64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for amd64 Assembly *
+*************************************************/
+#define ASM(x) x "\n\t"
+
+/*************************************************
+* Word Multiply *
+*************************************************/
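+/* a is constrained to %rax, so mulq %[b] leaves the low half of a*b in
+   %rax and the high half in %rdx; the incoming carry is added into the
+   low half and the high half is returned through *c. */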
+inline word word_madd2(word a, word b, word* c)
+ {
+ asm(
+ ASM("mulq %[b]")
+ ASM("addq %[c],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+ : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+ return a;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ asm(
+ ASM("mulq %[b]")
+
+ ASM("addq %[c],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ ASM("addq %[d],%[a]")
+ ASM("adcq $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+ : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+ return a;
+ }
+
+#undef ASM
+
+}
+
+}
+
+#endif
diff --git a/src/asm/mp_amd64/mp_asmi.h b/src/asm/mp_amd64/mp_asmi.h
new file mode 100644
index 000000000..16632a38d
--- /dev/null
+++ b/src/asm/mp_amd64/mp_asmi.h
@@ -0,0 +1,233 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2007 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for amd64 Assembly *
+*************************************************/
+#ifndef ASM
+ #define ASM(x) x "\n\t"
+#endif
+
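+/* Each *_OP macro below expands to one step of an unrolled loop over a
+   word[8] block; INDEX scales the displacement (8*INDEX) so that eight
+   expansions cover the whole block. */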
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]") \
+ ASM("movq %[carry], 8*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+ ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movq 8*" #INDEX "(%[x]),%%rax") \
+ ASM("mulq %[y]") \
+ ASM("addq %[carry],%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("addq 8*" #INDEX "(%[z]),%%rax") \
+ ASM("adcq $0,%%rdx") \
+ ASM("movq %%rdx,%[carry]") \
+    ASM("movq %%rax, 8*" #INDEX "(%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) \
+ MACRO(ARG, 0) \
+ MACRO(ARG, 1) \
+ MACRO(ARG, 2) \
+ MACRO(ARG, 3) \
+ MACRO(ARG, 4) \
+ MACRO(ARG, 5) \
+ MACRO(ARG, 6) \
+ MACRO(ARG, 7)
+
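+/* Carry shuttling: rorq moves the low bit of the carry word into CF for
+   the adc/sbb chain in CORE_CODE; afterwards sbbq materializes CF as
+   0 or -1 and negq maps that back to 0 or 1. */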
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorq %[carry]") \
+ CORE_CODE \
+ ASM("sbbq %[carry],%[carry]") \
+ ASM("negq %[carry]")
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument *
+*************************************************/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%rax", "%rdx");
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
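+/* x is bound to %rax and y to %rdx, so mulq %[y] leaves the low half of
+   x*y in %[x] and the high half in %[y]; the adds then fold the full
+   product into the (w2,w1,w0) accumulator. */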
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mulq %[y]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mulq %[y]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ ASM("addq %[x],%[w0]")
+ ASM("adcq %[y],%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+
+#undef ASM
+#undef DO_8_TIMES
+#undef ADD_OR_SUBTRACT
+#undef ADDSUB2_OP
+#undef ADDSUB3_OP
+#undef LINMUL_OP
+#undef MULADD_OP
+
+}
+
+}
+#endif
diff --git a/src/asm/mp_amd64/mp_mulop.cpp b/src/asm/mp_amd64/mp_mulop.cpp
new file mode 100644
index 000000000..d1aa51489
--- /dev/null
+++ b/src/asm/mp_amd64/mp_mulop.cpp
@@ -0,0 +1,94 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/mp_asm.h>
+#include <botan/mp_asmi.h>
+#include <botan/mp_core.h>
+#include <botan/mem_ops.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Simple O(N^2) Multiplication *
+*************************************************/
+void bigint_simple_mul(word z[], const word x[], u32bit x_size,
+ const word y[], u32bit y_size)
+ {
+ const u32bit blocks = x_size - (x_size % 8);
+
+ clear_mem(z, x_size + y_size);
+
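+   // Multiply x by each word of y: process x eight words at a time with
+   // word8_madd3, then handle the remaining (x_size % 8) words singly.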
+ for(u32bit i = 0; i != y_size; ++i)
+ {
+ word carry = 0;
+
+ for(u32bit j = 0; j != blocks; j += 8)
+ carry = word8_madd3(z + i + j, x + j, y[i], carry);
+
+ for(u32bit j = blocks; j != x_size; ++j)
+ z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry);
+
+ z[x_size+i] = carry;
+ }
+ }
+
+/*************************************************
+* Simple O(N^2) Squaring
+
+This is exactly the same algorithm as bigint_simple_mul, but since
+C/C++ compilers are poor at alias analysis, it is worth having a
+version where the compiler knows that x == y
+*************************************************/
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      word carry = 0;
+
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+}
+
+}
diff --git a/src/asm/mp_asm64/modinfo.txt b/src/asm/mp_asm64/modinfo.txt
new file mode 100644
index 000000000..a9e5d53da
--- /dev/null
+++ b/src/asm/mp_asm64/modinfo.txt
@@ -0,0 +1,25 @@
+realname "64-bit RISC MPI Assembler Core"
+
+mp_bits 64
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+</replace>
+
+<arch>
+alpha
+ia64
+mips64
+ppc64
+sparc64
+</arch>
+
+# The inline asm only works with gcc, but it looks like (at least on
+# UltraSPARC) using 64-bit words and the synthesized multiply is a 5 to 25%
+# win, so it's probably worth using elsewhere.
+<cc>
+gcc
+sunwspro
+</cc>
diff --git a/src/asm/mp_asm64/mp_asm.h b/src/asm/mp_asm64/mp_asm.h
new file mode 100644
index 000000000..e455b3616
--- /dev/null
+++ b/src/asm/mp_asm64/mp_asm.h
@@ -0,0 +1,110 @@
+/*************************************************
+* MPI Multiply-Add Core Header File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_MP_MADD_H__
+#define BOTAN_MP_MADD_H__
+
+#include <botan/mp_types.h>
+
+namespace Botan {
+
+#if (BOTAN_MP_WORD_BITS != 64)
+ #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+#if defined(BOTAN_TARGET_ARCH_IS_ALPHA)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b)); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b)); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc"); \
+ z1 = a * b; \
+} while(0);
+
+#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do { \
+ asm("dmultu %2,%3" : "=h" (z0), "=l" (z1) : "r" (a), "r" (b)); \
+} while(0);
+
+#else
+
+// Do a 64x64->128 multiply using four 32x32->64 half-word multiplies
+// plus some adds and shifts. Last resort for CPUs like UltraSPARC, with
+// 64-bit registers/ALU but no 64x64->128 multiply instruction.
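+//
+// Writing a = a_hi*2^32 + a_lo and b = b_hi*2^32 + b_lo:
+//   a*b = (a_hi*b_hi)*2^64 + (a_hi*b_lo + a_lo*b_hi)*2^32 + a_lo*b_lo
+// The cross terms are accumulated in x2; a carry out of that sum is
+// worth 2^32 in the high word.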
+inline void bigint_2word_mul(word a, word b, word* z1, word* z0)
+ {
+ const u32bit MP_HWORD_BITS = MP_WORD_BITS / 2;
+ const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1;
+
+ const word a_hi = (a >> MP_HWORD_BITS);
+ const word a_lo = (a & MP_HWORD_MASK);
+ const word b_hi = (b >> MP_HWORD_BITS);
+ const word b_lo = (b & MP_HWORD_MASK);
+
+ word x0 = a_hi * b_hi;
+ word x1 = a_lo * b_hi;
+ word x2 = a_hi * b_lo;
+ word x3 = a_lo * b_lo;
+
+ x2 += x3 >> (MP_HWORD_BITS);
+ x2 += x1;
+ if(x2 < x1)
+ x0 += ((word)1 << MP_HWORD_BITS);
+
+ *z0 = x0 + (x2 >> MP_HWORD_BITS);
+ *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK);
+ }
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0)
+
+#endif
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+ {
+ word z0 = 0, z1 = 0;
+
+ BOTAN_WORD_MUL(a, b, z1, z0);
+
+ z1 += *c; if(z1 < *c) z0++;
+
+ *c = z0;
+ return z1;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ word z0 = 0, z1 = 0;
+
+ BOTAN_WORD_MUL(a, b, z1, z0);
+
+ z1 += c; if(z1 < c) z0++;
+ z1 += *d; if(z1 < *d) z0++;
+
+ *d = z0;
+ return z1;
+ }
+
+}
+
+#endif
diff --git a/src/asm/mp_ia32/modinfo.txt b/src/asm/mp_ia32/modinfo.txt
new file mode 100644
index 000000000..cf4959250
--- /dev/null
+++ b/src/asm/mp_ia32/modinfo.txt
@@ -0,0 +1,19 @@
+realname "x86 MPI Assembler Core"
+
+mp_bits 32
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/src/asm/mp_ia32/mp_asm.h b/src/asm/mp_ia32/mp_asm.h
new file mode 100644
index 000000000..b45140321
--- /dev/null
+++ b/src/asm/mp_ia32/mp_asm.h
@@ -0,0 +1,65 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2008 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_H__
+#define BOTAN_MP_ASM_H__
+
+#include <botan/mp_types.h>
+
+#if (BOTAN_MP_WORD_BITS != 32)
+ #error The mp_ia32 module requires that BOTAN_MP_WORD_BITS == 32
+#endif
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for x86 Assembly *
+*************************************************/
+#define ASM(x) x "\n\t"
+
+/*************************************************
+* Word Multiply *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+ {
+ asm(
+ ASM("mull %[b]")
+ ASM("addl %[c],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c)
+ : "0"(a), "1"(b), [c]"g"(*c) : "cc");
+
+ return a;
+ }
+
+/*************************************************
+* Word Multiply/Add *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+ {
+ asm(
+ ASM("mull %[b]")
+
+ ASM("addl %[c],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ ASM("addl %[d],%[a]")
+ ASM("adcl $0,%[carry]")
+
+ : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d)
+ : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc");
+
+ return a;
+ }
+
+}
+
+}
+
+#endif
diff --git a/src/asm/mp_ia32/mp_asmi.h b/src/asm/mp_ia32/mp_asmi.h
new file mode 100644
index 000000000..9de0c11e3
--- /dev/null
+++ b/src/asm/mp_ia32/mp_asmi.h
@@ -0,0 +1,225 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2007 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include <botan/mp_asm.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Helper Macros for x86 Assembly *
+*************************************************/
+#ifndef ASM
+ #define ASM(x) x "\n\t"
+#endif
+
+#define ADDSUB2_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[y]), %[carry]") \
+ ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \
+
+#define ADDSUB3_OP(OPERATION, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]), %[carry]") \
+ ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \
+ ASM("movl %[carry], 4*" #INDEX "(%[z])") \
+
+#define LINMUL_OP(WRITE_TO, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+ ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])")
+
+#define MULADD_OP(IGNORED, INDEX) \
+ ASM("movl 4*" #INDEX "(%[x]),%%eax") \
+ ASM("mull %[y]") \
+ ASM("addl %[carry],%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("addl 4*" #INDEX "(%[z]),%%eax") \
+ ASM("adcl $0,%%edx") \
+ ASM("movl %%edx,%[carry]") \
+    ASM("movl %%eax, 4*" #INDEX "(%[z])")
+
+#define DO_8_TIMES(MACRO, ARG) \
+ MACRO(ARG, 0) \
+ MACRO(ARG, 1) \
+ MACRO(ARG, 2) \
+ MACRO(ARG, 3) \
+ MACRO(ARG, 4) \
+ MACRO(ARG, 5) \
+ MACRO(ARG, 6) \
+ MACRO(ARG, 7)
+
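+/* Same carry-shuttling trick as the amd64 module: rorl parks the carry
+   bit in CF, and sbbl/negl recover it afterwards as 0 or 1. */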
+#define ADD_OR_SUBTRACT(CORE_CODE) \
+ ASM("rorl %[carry]") \
+ CORE_CODE \
+ ASM("sbbl %[carry],%[carry]") \
+ ASM("negl %[carry]")
+
+/*************************************************
+* Word Addition *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]"))
+ : [x]"=r"(x), [carry]"=r"(*carry)
+ : "0"(x), [y]"rm"(y), "1"(*carry)
+ : "cc");
+ return x;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument *
+*************************************************/
+inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ asm(
+ ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl"))
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry)
+ : "cc", "memory");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "x")
+ : [carry]"=r"(carry)
+ : [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication *
+*************************************************/
+inline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(LINMUL_OP, "z")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ asm(
+ DO_8_TIMES(MULADD_OP, "")
+ : [carry]"=r"(carry)
+ : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry)
+ : "cc", "%eax", "%edx");
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mull %[y]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+ {
+ asm(
+ ASM("mull %[y]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ ASM("addl %[x],%[w0]")
+ ASM("adcl %[y],%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+ }
+
+}
+
+}
+
+#endif
diff --git a/src/asm/mp_ia32_msvc/modinfo.txt b/src/asm/mp_ia32_msvc/modinfo.txt
new file mode 100644
index 000000000..36d9d0290
--- /dev/null
+++ b/src/asm/mp_ia32_msvc/modinfo.txt
@@ -0,0 +1,17 @@
+realname "x86 MPI Assembler Core (MSVC)"
+
+mp_bits 32
+
+#load_on asm_ok
+
+<replace>
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+msvc
+</cc>
diff --git a/src/asm/mp_ia32_msvc/mp_asmi.h b/src/asm/mp_ia32_msvc/mp_asmi.h
new file mode 100644
index 000000000..5eaa46eb4
--- /dev/null
+++ b/src/asm/mp_ia32_msvc/mp_asmi.h
@@ -0,0 +1,547 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File *
+* (C) 1999-2006 Jack Lloyd *
+* 2006 Luca Piccarreta *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include "mp_asm.h"
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Word Addition *
+*************************************************/
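+/* Plain C carry propagation: an unsigned add has overflowed exactly when
+   the result compares less than one of its addends. */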
+inline word word_add(word x, word y, word* carry)
+ {
+ word z = x + y;
+ word c1 = (z < x);
+ z += *carry;
+ *carry = c1 | (z < *carry);
+ return z;
+ }
+
+/*************************************************
+* Four Word Block Carry Propagation             *
+*************************************************/
+inline word word4_addcarry(word x[4], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry==1
+      adc [edx],0
+      adc [edx+4],0
+      adc [edx+8],0
+      adc [edx+12],0
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument       *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edx,[x]
+ mov esi,[y]
+ xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[esi]
+ adc [edx],eax
+ mov eax,[esi+4]
+ adc [edx+4],eax
+ mov eax,[esi+8]
+ adc [edx+8],eax
+ mov eax,[esi+12]
+ adc [edx+12],eax
+ mov eax,[esi+16]
+ adc [edx+16],eax
+ mov eax,[esi+20]
+ adc [edx+20],eax
+ mov eax,[esi+24]
+ adc [edx+24],eax
+ mov eax,[esi+28]
+ adc [edx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument     *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ mov ebx,[z]
+ xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[edi]
+ adc eax,[esi]
+ mov [ebx],eax
+
+ mov eax,[edi+4]
+ adc eax,[esi+4]
+ mov [ebx+4],eax
+
+ mov eax,[edi+8]
+ adc eax,[esi+8]
+ mov [ebx+8],eax
+
+ mov eax,[edi+12]
+ adc eax,[esi+12]
+ mov [ebx+12],eax
+
+ mov eax,[edi+16]
+ adc eax,[esi+16]
+ mov [ebx+16],eax
+
+ mov eax,[edi+20]
+ adc eax,[esi+20]
+ mov [ebx+20],eax
+
+ mov eax,[edi+24]
+ adc eax,[esi+24]
+ mov [ebx+24],eax
+
+ mov eax,[edi+28]
+ adc eax,[esi+28]
+ mov [ebx+28],eax
+
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Word Subtraction *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+ {
+ word t0 = x - y;
+ word c1 = (t0 > x);
+ word z = t0 - *carry;
+ *carry = c1 | (z > t0);
+ return z;
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument    *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+ {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry==1
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [edi],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [edi+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [edi+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [edi+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [edi+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [edi+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [edi+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [edi+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument  *
+*************************************************/
+__forceinline word word8_sub3(word z[8], const word x[8],
+ const word y[8], word carry)
+ {
+ __asm {
+ mov edi,[x]
+ mov esi,[y]
+ xor eax,eax
+      sub eax,[carry] //force CF=1 iff carry==1
+ mov ebx,[z]
+ mov eax,[edi]
+ sbb eax,[esi]
+ mov [ebx],eax
+ mov eax,[edi+4]
+ sbb eax,[esi+4]
+ mov [ebx+4],eax
+ mov eax,[edi+8]
+ sbb eax,[esi+8]
+ mov [ebx+8],eax
+ mov eax,[edi+12]
+ sbb eax,[esi+12]
+ mov [ebx+12],eax
+ mov eax,[edi+16]
+ sbb eax,[esi+16]
+ mov [ebx+16],eax
+ mov eax,[edi+20]
+ sbb eax,[esi+20]
+ mov [ebx+20],eax
+ mov eax,[edi+24]
+ sbb eax,[esi+24]
+ mov [ebx+24],eax
+ mov eax,[edi+28]
+ sbb eax,[esi+28]
+ mov [ebx+28],eax
+ sbb eax,eax
+ neg eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Linear Multiplication        *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+ {
+ __asm
+ {
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi],eax //store result
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+4],eax //store result
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+8],eax //store result
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+12],eax //store result
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+16],eax //store result
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+20],eax //store result
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [esi+24],eax //store result
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+      mov [esi+28],eax //store result
+
+      mov eax,edx //return carry in eax
+ }
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add                 *
+*************************************************/
+__forceinline word word8_muladd(word z[8], const word x[8],
+ word y, word carry)
+ {
+ __asm
+ {
+ mov esi,[x]
+ mov ebx,[y]
+ mov edi,[z]
+ mov eax,[esi] //load a
+ mul ebx //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ add eax,[edi] //sum lo z
+ adc edx,0 //sum hi z
+ mov ecx,edx //carry for next block = hi z
+ mov [edi],eax //save lo z
+
+ mov eax,[esi+4]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+4]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+4],eax
+
+ mov eax,[esi+8]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+8]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+8],eax
+
+ mov eax,[esi+12]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+12]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+12],eax
+
+ mov eax,[esi+16]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+16]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+16],eax
+
+ mov eax,[esi+20]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+20]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+20],eax
+
+ mov eax,[esi+24]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+24]
+ adc edx,0
+ mov ecx,edx
+ mov [edi+24],eax
+
+ mov eax,[esi+28]
+ mul ebx
+ add eax,ecx
+ adc edx,0
+ add eax,[edi+28]
+ adc edx,0
+ mov [edi+28],eax
+ mov eax,edx
+ }
+ }
+
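+/*************************************************
+* Eight Word Block Linear Multiplication        *
+*************************************************/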
+__forceinline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+ {
+ __asm
+ {
+#if 0
+      // MMX version: measured slower than the integer code below
+ mov edx,[z]
+ mov eax,[x]
+ movd mm7,[y]
+
+ movd mm0,[eax]
+ movd mm1,[eax+4]
+ movd mm2,[eax+8]
+ pmuludq mm0,mm7
+ pmuludq mm1,mm7
+ pmuludq mm2,mm7
+
+ movd mm6,[carry]
+ paddq mm0,mm6
+ movd [edx],mm0
+
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+4],mm1
+
+ movd mm3,[eax+12]
+ psrlq mm1,32
+ paddq mm2,mm1
+ movd [edx+8],mm2
+
+ pmuludq mm3,mm7
+ movd mm4,[eax+16]
+ psrlq mm2,32
+ paddq mm3,mm2
+ movd [edx+12],mm3
+
+ pmuludq mm4,mm7
+ movd mm5,[eax+20]
+ psrlq mm3,32
+ paddq mm4,mm3
+ movd [edx+16],mm4
+
+ pmuludq mm5,mm7
+ movd mm0,[eax+24]
+ psrlq mm4,32
+ paddq mm5,mm4
+ movd [edx+20],mm5
+
+ pmuludq mm0,mm7
+ movd mm1,[eax+28]
+ psrlq mm5,32
+ paddq mm0,mm5
+ movd [edx+24],mm0
+
+ pmuludq mm1,mm7
+ psrlq mm0,32
+ paddq mm1,mm0
+ movd [edx+28],mm1
+
+ psrlq mm1,32
+ movd eax,mm1
+ emms
+#else
+ mov edi,[z]
+ mov esi,[x]
+ mov eax,[esi] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,[carry] //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi],eax //store result
+
+ mov eax,[esi+4] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+4],eax //store result
+
+ mov eax,[esi+8] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+8],eax //store result
+
+ mov eax,[esi+12] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+12],eax //store result
+
+ mov eax,[esi+16] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+16],eax //store result
+
+ mov eax,[esi+20] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+20],eax //store result
+
+ mov eax,[esi+24] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+ mov ecx,edx //store carry
+      mov [edi+24],eax //store result
+
+ mov eax,[esi+28] //load a
+ mul [y] //edx(hi):eax(lo)=a*b
+ add eax,ecx //sum lo carry
+ adc edx,0 //sum hi carry
+      mov [edi+28],eax //store result
+      mov eax,edx //return carry in eax
+#endif
+ }
+ }
+
+/*************************************************
+* Eight Word Block Multiply/Add *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+ {
+ z[0] = word_madd3(x[0], y, z[0], &carry);
+ z[1] = word_madd3(x[1], y, z[1], &carry);
+ z[2] = word_madd3(x[2], y, z[2], &carry);
+ z[3] = word_madd3(x[3], y, z[3], &carry);
+ z[4] = word_madd3(x[4], y, z[4], &carry);
+ z[5] = word_madd3(x[5], y, z[5], &carry);
+ z[6] = word_madd3(x[6], y, z[6], &carry);
+ z[7] = word_madd3(x[7], y, z[7], &carry);
+ return carry;
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
+ {
+ dword z = (dword)a * b + (*w0);
+ *w0 = (word)z; //lo
+
+ word t1 = (word)(z >> BOTAN_MP_WORD_BITS); //hi
+   *w1 += t1; //w1 += hi
+ *w2 += (*w1 < t1) ? 1 : 0; //w2+=carry
+ }
+
+/*************************************************
+* Multiply-Add Accumulator *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
+ {
+ dword z = (dword)a * b;
+ word t0 = (word)z;
+ word t1 = (word)(z >> BOTAN_MP_WORD_BITS);
+
+ *w0 += t0;
+ *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+ *w2 += (*w1 < t1) ? 1 : 0;
+
+ *w0 += t0;
+ *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+ *w2 += (*w1 < t1) ? 1 : 0;
+ }
+
+}
+
+}
+
+#endif
diff --git a/src/asm/sha1_sse2/modinfo.txt b/src/asm/sha1_sse2/modinfo.txt
new file mode 100644
index 000000000..e1805260c
--- /dev/null
+++ b/src/asm/sha1_sse2/modinfo.txt
@@ -0,0 +1,22 @@
+realname "SSE2 implementation of SHA-1"
+
+load_on request
+
+<replace>
+sha160.cpp
+sha160.h
+</replace>
+
+<add>
+sha1core.cpp
+</add>
+
+<arch>
+pentium4
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/src/asm/sha1_sse2/sha160.cpp b/src/asm/sha1_sse2/sha160.cpp
new file mode 100644
index 000000000..dfb5fdfe5
--- /dev/null
+++ b/src/asm/sha1_sse2/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File *
+* (C) 1999-2007 Jack Lloyd *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+#include <botan/bit_ops.h>
+
+namespace Botan {
+
+extern "C" void botan_sha1_sse(u32bit[5], const byte[64]);
+
+/*************************************************
+* SHA-160 Compression Function *
+*************************************************/
+void SHA_160::hash(const byte input[])
+ {
+ botan_sha1_sse(digest, input);
+ }
+
+/*************************************************
+* Copy out the digest *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+ {
+ for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+ output[j] = get_byte(j % 4, digest[j/4]);
+ }
+
+/*************************************************
+* Clear memory of sensitive data *
+*************************************************/
+void SHA_160::clear() throw()
+ {
+ MDx_HashFunction::clear();
+ digest[0] = 0x67452301;
+ digest[1] = 0xEFCDAB89;
+ digest[2] = 0x98BADCFE;
+ digest[3] = 0x10325476;
+ digest[4] = 0xC3D2E1F0;
+ }
+
+/*************************************************
+* SHA_160 Constructor *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true)
+ {
+ clear();
+ }
+
+}
diff --git a/src/asm/sha1_sse2/sha160.h b/src/asm/sha1_sse2/sha160.h
new file mode 100644
index 000000000..c6f8482cf
--- /dev/null
+++ b/src/asm/sha1_sse2/sha160.h
@@ -0,0 +1,32 @@
+/*************************************************
+* SHA-160 Header File *
+* (C) 1999-2007 The Botan Project *
+*************************************************/
+
+#ifndef BOTAN_SHA_160_H__
+#define BOTAN_SHA_160_H__
+
+#include <botan/mdx_hash.h>
+
+namespace Botan {
+
+/*************************************************
+* SHA-160 *
+*************************************************/
+class SHA_160 : public MDx_HashFunction
+ {
+ public:
+ void clear() throw();
+ std::string name() const { return "SHA-160"; }
+ HashFunction* clone() const { return new SHA_160; }
+ SHA_160();
+ private:
+ void hash(const byte[]);
+ void copy_out(byte[]);
+
+ SecureBuffer<u32bit, 5> digest;
+ };
+
+}
+
+#endif
diff --git a/src/asm/sha1_sse2/sha1core.cpp b/src/asm/sha1_sse2/sha1core.cpp
new file mode 100644
index 000000000..23dbfc5e2
--- /dev/null
+++ b/src/asm/sha1_sse2/sha1core.cpp
@@ -0,0 +1,327 @@
+/* this code is public domain.
+ *
+ * dean gaudet <[email protected]>
+ *
+ * this code was inspired by this paper:
+ *
+ * SHA: A Design for Parallel Architectures?
+ * Antoon Bosselaers, René Govaerts and Joos Vandewalle
+ * <http://www.esat.kuleuven.ac.be/~cosicart/pdf/AB-9700.pdf>
+ *
+ * more information available on this implementation here:
+ *
+ * http://arctic.org/~dean/crypto/sha1.html
+ *
+ * version: 2
+ */
+
+/*
+ * Lightly modified for Botan, tested under GCC 4.1.1 and ICC 9.1
+ * on a Linux/Core2 system.
+ *
+ */
+#include <botan/types.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+typedef union {
+ u32bit u32[4];
+ __m128i u128;
+ } v4si __attribute__((aligned(16)));
+
+static const v4si K00_19 = { { 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 } };
+static const v4si K20_39 = { { 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 } };
+static const v4si K40_59 = { { 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc } };
+static const v4si K60_79 = { { 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 } };
+
+#define UNALIGNED 1
+#if UNALIGNED
+#define load(p) _mm_loadu_si128(p)
+#else
+#define load(p) (*p)
+#endif
+
+
+/*
+the first 16 words only need byte swapping
+
+prepared points to 4x u32bit, 16-byte aligned
+
+W points to the 4 dwords which need preparing --
+and is overwritten with the swapped bytes
+*/
+#define prep00_15(prep, W) do { \
+ __m128i r1, r2; \
+ \
+ r1 = (W); \
+ if (1) { \
+ r1 = _mm_shufflehi_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1)); \
+ r1 = _mm_shufflelo_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1)); \
+ r2 = _mm_slli_epi16(r1, 8); \
+ r1 = _mm_srli_epi16(r1, 8); \
+ r1 = _mm_or_si128(r1, r2); \
+ (W) = r1; \
+ } \
+ (prep).u128 = _mm_add_epi32(K00_19.u128, r1); \
+ } while(0)
+
+
+
+/*
+for each multiple of 4, t, we want to calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol(W[t] ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+
+we'll actually calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol( 0 ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+W[t+3] ^= rol(W[t+0], 1);
+
+the parameters are:
+
+W0 = &W[t-16];
+W1 = &W[t-12];
+W2 = &W[t- 8];
+W3 = &W[t- 4];
+
+and on output:
+prepared = W0 + K
+W0 = W[t]..W[t+3]
+*/
+
+/* note that there is a step here where i want to do a rol by 1, which
+* normally would look like this:
+*
+* r1 = psrld r0,$31
+* r0 = pslld r0,$1
+* r0 = por r0,r1
+*
+* but instead i do this:
+*
+* r1 = pcmpltd r0,zero
+* r0 = paddd r0,r0
+* r0 = psub r0,r1
+*
+* because pcmpltd and paddd are available in both MMX units on
+* efficeon, pentium-m, and opteron but shifts are available in
+* only one unit.
+*/
+#define prep(prep, XW0, XW1, XW2, XW3, K) do { \
+ __m128i r0, r1, r2, r3; \
+ \
+ /* load W[t-4] 16-byte aligned, and shift */ \
+ r3 = _mm_srli_si128((XW3), 4); \
+ r0 = (XW0); \
+ /* get high 64-bits of XW0 into low 64-bits */ \
+ r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1,0,3,2)); \
+ /* load high 64-bits of r1 */ \
+ r1 = _mm_unpacklo_epi64(r1, (XW1)); \
+ r2 = (XW2); \
+ \
+ r0 = _mm_xor_si128(r1, r0); \
+ r2 = _mm_xor_si128(r3, r2); \
+ r0 = _mm_xor_si128(r2, r0); \
+ /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */ \
+ \
+ r2 = _mm_slli_si128(r0, 12); \
+ r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128()); \
+ r0 = _mm_add_epi32(r0, r0); /* shift left by 1 */ \
+ r0 = _mm_sub_epi32(r0, r1); /* r0 has W[t]..W[t+2] */ \
+ \
+ r3 = _mm_srli_epi32(r2, 30); \
+ r2 = _mm_slli_epi32(r2, 2); \
+ \
+ r0 = _mm_xor_si128(r0, r3); \
+ r0 = _mm_xor_si128(r0, r2); /* r0 now has W[t+3] */ \
+ \
+ (XW0) = r0; \
+ (prep).u128 = _mm_add_epi32(r0, (K).u128); \
+ } while(0)
+
+
+static inline u32bit rol(u32bit src, u32bit amt)
+ {
+ /* gcc and icc appear to turn this into a rotate */
+ return (src << amt) | (src >> (32 - amt));
+ }
+
+
+static inline u32bit f00_19(u32bit x, u32bit y, u32bit z)
+ {
+ /* FIPS 180-2 says this: (x & y) ^ (~x & z)
+ * but we can calculate it in fewer steps.
+ */
+ return ((y ^ z) & x) ^ z;
+ }
+
+
+static inline u32bit f20_39(u32bit x, u32bit y, u32bit z)
+ {
+ return (x ^ z) ^ y;
+ }
+
+
+static inline u32bit f40_59(u32bit x, u32bit y, u32bit z)
+ {
+ /* FIPS 180-2 says this: (x & y) ^ (x & z) ^ (y & z)
+ * but we can calculate it in fewer steps.
+ */
+ return (x & z) | ((x | z) & y);
+ }
+
+
+static inline u32bit f60_79(u32bit x, u32bit y, u32bit z)
+ {
+ return f20_39(x, y, z);
+ }
+
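+/* One SHA-1 step: xt = prepared W input + f(xb,xc,xd) + xe + rol(xa,5),
+ * with xb rotated by 30 in place; the unrolled rounds below rename the
+ * registers between steps instead of moving values around.
+ */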
+#define step(nn_mm, xa, xb, xc, xd, xe, xt, input) do { \
+ (xt) = (input) + f##nn_mm((xb), (xc), (xd)); \
+ (xb) = rol((xb), 30); \
+ (xt) += ((xe) + rol((xa), 5)); \
+ } while(0)
+
+extern "C" void botan_sha1_sse(u32bit* H,
+ const u32bit* inputu)
+ {
+ const __m128i * input = (const __m128i *)inputu;
+ __m128i W0, W1, W2, W3;
+ v4si prep0, prep1, prep2;
+ u32bit a, b, c, d, e, t;
+
+ a = H[0];
+ b = H[1];
+ c = H[2];
+ d = H[3];
+ e = H[4];
+
+ /* i've tried arranging the SSE2 code to be 4, 8, 12, and 16
+ * steps ahead of the integer code. 12 steps ahead seems
+ * to produce the best performance. -dean
+ */
+ W0 = load(&input[0]);
+ prep00_15(prep0, W0); /* prepare for 00 through 03 */
+ W1 = load(&input[1]);
+ prep00_15(prep1, W1); /* prepare for 04 through 07 */
+ W2 = load(&input[2]);
+ prep00_15(prep2, W2); /* prepare for 08 through 11 */
+
+ W3 = load(&input[3]);
+ step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 00 */
+ step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 01 */
+ step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 02 */
+ step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 03 */
+ prep00_15(prep0, W3);
+ step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 04 */
+ step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 05 */
+ step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 06 */
+ step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 07 */
+ prep(prep1, W0, W1, W2, W3, K00_19); /* prepare for 16 through 19 */
+ step(00_19, e, t, a, b, c, d, prep2.u32[0]); /* 08 */
+ step(00_19, d, e, t, a, b, c, prep2.u32[1]); /* 09 */
+ step(00_19, c, d, e, t, a, b, prep2.u32[2]); /* 10 */
+ step(00_19, b, c, d, e, t, a, prep2.u32[3]); /* 11 */
+ prep(prep2, W1, W2, W3, W0, K20_39); /* prepare for 20 through 23 */
+ step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 12 */
+ step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 13 */
+ step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 14 */
+ step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 15 */
+ prep(prep0, W2, W3, W0, W1, K20_39);
+ step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 16 */
+ step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 17 */
+ step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 18 */
+ step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 19 */
+
+ prep(prep1, W3, W0, W1, W2, K20_39);
+ step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 20 */
+ step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 21 */
+ step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 22 */
+ step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 23 */
+ prep(prep2, W0, W1, W2, W3, K20_39);
+ step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 24 */
+ step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 25 */
+ step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 26 */
+ step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 27 */
+ prep(prep0, W1, W2, W3, W0, K20_39);
+ step(20_39, c, d, e, t, a, b, prep1.u32[0]); /* 28 */
+ step(20_39, b, c, d, e, t, a, prep1.u32[1]); /* 29 */
+ step(20_39, a, b, c, d, e, t, prep1.u32[2]); /* 30 */
+ step(20_39, t, a, b, c, d, e, prep1.u32[3]); /* 31 */
+ prep(prep1, W2, W3, W0, W1, K40_59);
+ step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 32 */
+ step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 33 */
+ step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 34 */
+ step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 35 */
+ prep(prep2, W3, W0, W1, W2, K40_59);
+ step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 36 */
+ step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 37 */
+ step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 38 */
+ step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 39 */
+
+ prep(prep0, W0, W1, W2, W3, K40_59);
+ step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 40 */
+ step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 41 */
+ step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 42 */
+ step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 43 */
+ prep(prep1, W1, W2, W3, W0, K40_59);
+ step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 44 */
+ step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 45 */
+ step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 46 */
+ step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 47 */
+ prep(prep2, W2, W3, W0, W1, K40_59);
+ step(40_59, a, b, c, d, e, t, prep0.u32[0]); /* 48 */
+ step(40_59, t, a, b, c, d, e, prep0.u32[1]); /* 49 */
+ step(40_59, e, t, a, b, c, d, prep0.u32[2]); /* 50 */
+ step(40_59, d, e, t, a, b, c, prep0.u32[3]); /* 51 */
+ prep(prep0, W3, W0, W1, W2, K60_79);
+ step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 52 */
+ step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 53 */
+ step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 54 */
+ step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 55 */
+ prep(prep1, W0, W1, W2, W3, K60_79);
+ step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 56 */
+ step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 57 */
+ step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 58 */
+ step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 59 */
+
+ prep(prep2, W1, W2, W3, W0, K60_79);
+ step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 60 */
+ step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 61 */
+ step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 62 */
+ step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 63 */
+ prep(prep0, W2, W3, W0, W1, K60_79);
+ step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 64 */
+ step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 65 */
+ step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 66 */
+ step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 67 */
+ prep(prep1, W3, W0, W1, W2, K60_79);
+ step(60_79, e, t, a, b, c, d, prep2.u32[0]); /* 68 */
+ step(60_79, d, e, t, a, b, c, prep2.u32[1]); /* 69 */
+ step(60_79, c, d, e, t, a, b, prep2.u32[2]); /* 70 */
+ step(60_79, b, c, d, e, t, a, prep2.u32[3]); /* 71 */
+
+ step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 72 */
+ step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 73 */
+ step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 74 */
+ step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 75 */
+ /* no more input to prepare */
+ step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 76 */
+ step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 77 */
+ step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 78 */
+ step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 79 */
+ /* e, t, a, b, c, d */
+ H[0] += e;
+ H[1] += t;
+ H[2] += a;
+ H[3] += b;
+ H[4] += c;
+ }
+
+}