Diffstat (limited to 'src/asm')
33 files changed, 4522 insertions, 0 deletions
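The mp_monty.S and mp_mulop files below are built around one word-level primitive: multiply two words, add the product into z[], and fold the high half into a running carry. Each unrolled mulq/adcq block is one instance of it, and the commented-out C in mp_mulop_amd64.S spells out the same operation. A minimal C++ sketch of that step, assuming 64-bit words and a compiler providing unsigned __int128 (the names are illustrative, not Botan's API):

    #include <cstdint>

    // z_i = low64(z_i + x*y + carry); carry = high64(same sum).
    // The sum cannot overflow 128 bits: (2^64-1)^2 + 2*(2^64-1) == 2^128 - 1.
    inline void word_madd(uint64_t& z_i, uint64_t x, uint64_t y, uint64_t& carry)
       {
       unsigned __int128 t = (unsigned __int128)x * y + z_i + carry;
       z_i   = (uint64_t)t;          // the "movq %rax, 8*N(%rsi)" store
       carry = (uint64_t)(t >> 64);  // the "movq %rdx, %r10" after the adcq pair
       }

The eight-way unrolled .LOOP_MUL_ADD in mp_monty.S simply applies this step to eight consecutive words per iteration before looping.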
diff --git a/src/asm/asm_amd64/asm_macr.h b/src/asm/asm_amd64/asm_macr.h new file mode 100644 index 000000000..3cdd42dc6 --- /dev/null +++ b/src/asm/asm_amd64/asm_macr.h @@ -0,0 +1,125 @@ +/************************************************* +* Assembly Macros Header File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_AMD64_ASM_MACROS_H__ +#define BOTAN_AMD64_ASM_MACROS_H__ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +/************************************************* +* General/Global Macros * +*************************************************/ +#define ALIGN .p2align 4,,15 + +#define START_LISTING(FILENAME) \ + .file #FILENAME; \ + .text; \ + ALIGN; + +/************************************************* +* Function Definitions * +*************************************************/ +#define START_FUNCTION(func_name) \ + ALIGN; \ + .global func_name; \ + .type func_name,@function; \ +func_name: + +#define END_FUNCTION(func_name) \ + ret + +/************************************************* +* Conditional Jumps * +*************************************************/ +#define JUMP_IF_ZERO(REG, LABEL) \ + cmp IMM(0), REG; \ + jz LABEL + +#define JUMP_IF_LT(REG, NUM, LABEL) \ + cmp IMM(NUM), REG; \ + jl LABEL + +/************************************************* +* Register Names * +*************************************************/ +#define R0 %rax +#define R1 %rbx +#define R2 %rcx +#define R2_32 %ecx +#define R3 %rdx +#define R3_32 %edx +#define R4 %rsp +#define R5 %rbp +#define R6 %rsi +#define R6_32 %esi +#define R7 %rdi +#define R8 %r8 +#define R9 %r9 +#define R9_32 %r9d +#define R10 %r10 +#define R11 %r11 +#define R12 %r12 +#define R13 %r13 +#define R14 %r14 +#define R15 %r15 +#define R16 %r16 + +#define ARG_1 R7 +#define ARG_2 R6 +#define ARG_2_32 R6_32 +#define ARG_3 R3 +#define ARG_3_32 R3_32 +#define ARG_4 R2 +#define ARG_4_32 R2_32 +#define ARG_5 R8 +#define ARG_6 R9 +#define ARG_6_32 R9_32 + +#define TEMP_1 R10 +#define TEMP_2 R11 +#define TEMP_3 ARG_6 +#define TEMP_4 ARG_5 +#define TEMP_5 ARG_4 +#define TEMP_5_32 ARG_4_32 +#define TEMP_6 ARG_3 +#define TEMP_7 ARG_2 +#define TEMP_8 ARG_1 +#define TEMP_9 R0 + +/************************************************* +* Memory Access Operations * +*************************************************/ +#define ARRAY8(REG, NUM) 8*(NUM)(REG) +#define ARRAY4(REG, NUM) 4*(NUM)(REG) + +#define ASSIGN(TO, FROM) mov FROM, TO + +/************************************************* +* ALU Operations * +*************************************************/ +#define IMM(VAL) $VAL + +#define ADD(TO, FROM) add FROM, TO +#define ADD_LAST_CARRY(REG) adc IMM(0), REG +#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM)) +#define ADD_W_CARRY(TO1, TO2, FROM) add FROM, TO1; adc IMM(0), TO2; +#define SUB_IMM(TO, NUM) sub IMM(NUM), TO +#define MUL(REG) mul REG + +#define XOR(TO, FROM) xor FROM, TO +#define AND(TO, FROM) and FROM, TO +#define OR(TO, FROM) or FROM, TO +#define NOT(REG) not REG +#define ZEROIZE(REG) XOR(REG, REG) + +#define RETURN_VALUE_IS(V) ASSIGN(%rax, V) + +#define ROTL_IMM(REG, NUM) rol IMM(NUM), REG +#define ROTR_IMM(REG, NUM) ror IMM(NUM), REG +#define ADD3_IMM(TO, FROM, NUM) lea NUM(TO,FROM,1), TO + +#endif diff --git a/src/asm/asm_amd64/modinfo.txt b/src/asm/asm_amd64/modinfo.txt new file mode 100644 index 000000000..30aa5a413 --- /dev/null +++ b/src/asm/asm_amd64/modinfo.txt @@ -0,0 +1,39 @@ +realname "x86-64 Assembler" + +mp_bits 64 + +load_on request + +<replace> +sha160.cpp 
+</replace> + +<ignore> +#mp_mulop.cpp +#mp_monty.cpp +</ignore> + +<add> +asm_macr.h +#mp_mulop_amd64.S +#mp_monty.S +sha1_asm.S +</add> + +<arch> +amd64 +</arch> + +<cc> +gcc +icc +</cc> + +# ELF systems +<os> +linux +freebsd +netbsd +openbsd +solaris +</os> diff --git a/src/asm/asm_amd64/mp_monty.S b/src/asm/asm_amd64/mp_monty.S new file mode 100644 index 000000000..3dd4040bc --- /dev/null +++ b/src/asm/asm_amd64/mp_monty.S @@ -0,0 +1,397 @@ +/************************************************* +* Montgomery Reduction Source File * +* (C) 2008 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_monty.S) + +START_FUNCTION(bigint_monty_redc) + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + + movq %rdi, %r14 # z + movq %rdx, %r12 # x + movl %esi, %ebp # z_size + + xorl %esi, %esi # j.76 + movq %r8, -16(%rsp) # u, u + movl %ecx, %ebx # x_size, x_size + movl %ecx, %r8d # x_size, blocks_of_8 + andl $-8, %r8d #, blocks_of_8 + testl %ecx, %ecx # x_size + je .L3 #, + mov %ecx, %eax # x_size, pretmp.71 + leal 1(%rbx), %r15d #, k.73 + salq $3, %rax #, + xorl %r13d, %r13d # j + movq %rax, -8(%rsp) #, pretmp.21 + .p2align 4,,10 + .p2align 3 +.L11: + mov %r13d, %eax # j, j + movq -16(%rsp), %rdi # u, y + leaq (%r14,%rax,8), %r11 #, z_j + xorl %r9d, %r9d # i + imulq (%r11), %rdi #* z_j, y + xorl %r10d, %r10d # carry + testl %r8d, %r8d # blocks_of_8 + je .L7 #, + .p2align 4,,10 + .p2align 3 +.LOOP_MUL_ADD: + mov %r9d, %ecx # i, i + addl $8, %r9d #, i + salq $3, %rcx #, D.2315 + leaq (%r11,%rcx), %rsi #, tmp130 + leaq (%r12,%rcx), %rcx #, tmp131 + + movq 8*0(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*0(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*0 (%rsi) + + movq 8*1(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*1(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*1 (%rsi) + + movq 8*2(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*2(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*2 (%rsi) + + movq 8*3(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*3(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*3 (%rsi) + + movq 8*4(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*4(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*4 (%rsi) + + movq 8*5(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*5(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*5 (%rsi) + + movq 8*6(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*6(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*6 (%rsi) + + movq 8*7(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*7(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*7 (%rsi) + + cmpl %r9d, %r8d # i, blocks_of_8 + jne .LOOP_MUL_ADD #, + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L8 #, +.L7: + movl %r8d, %esi # blocks_of_8, i + .p2align 4,,10 + .p2align 3 +.L5: + mov %esi, %eax # i, i + movq %rdi, %rcx # y, b + leaq (%r11, %rax,8), %r9 #, D.2325 + incl %esi # i + movq (%r12, %rax,8), %rax #* x, tmp133 + + mulq %rcx # b + addq (%r9), %rax #* D.2325, a + adcq $0,%rdx # + addq %r10, %rax # carry, a + adcq $0,%rdx # + + cmpl %esi, %ebx # i, x_size + movq %rdx, %r10 #, carry + movq %rax, (%r9) # a,* D.2325 + 
jne .L5 #, +.L8: + movq -8(%rsp), %rdx # pretmp.21, + leaq (%r11,%rdx), %rax #, D.2332 + movq (%rax), %rcx #* D.2332, D.2333 + leaq (%r10,%rcx), %rdx #, z_sum + movq %rdx, (%rax) # z_sum,* D.2332 + cmpq %rdx, %rcx # z_sum, D.2333 + jbe .L9 #, + cmpl %ebp, %r15d # z_size, k.73 + je .L9 #, + movl %r15d, %ecx # k.73, k + jmp .L10 # + .p2align 4,,10 + .p2align 3 +.L31: + incl %ecx # k + cmpl %ecx, %ebp # k, z_size + .p2align 4,,4 + .p2align 3 + je .L9 #, +.L10: + mov %ecx, %edx # k, k + leaq (%r11,%rdx,8), %rdx #, D.2342 + movq (%rdx), %rax #* D.2342, tmp136 + incq %rax # D.2344 + movq %rax, (%rdx) # D.2344,* D.2342 + testq %rax, %rax # D.2344 + je .L31 #, +.L9: + incl %r13d # j + decl %ebp # z_size + cmpl %r13d, %ebx # j, x_size + jne .L11 #, + movl %ebx, %esi # x_size, j.76 +.L3: + leal (%rbx,%rbx), %eax #, tmp137 + mov %eax, %eax + leaq (%r14, %rax,8), %rdi #, D.2349 + cmpq $0, (%rdi) #,* D.2349 + jne .L12 #, + testl %ebx, %ebx # x_size + je .L12 #, + leal -1(%rbx), %ecx #, j + leal (%rsi,%rcx), %edx #, tmp141 + mov %ecx, %eax # j, j + movq (%r14,%rdx,8), %rbp #* z, + cmpq %rbp, (%r12, %rax,8) #,* x + jb .L12 #, + ja .L_EXIT #, + leal -2(%rsi,%rbx), %edx #, ivtmp.45 + jmp .L14 # + .p2align 4,,10 + .p2align 3 +.L15: + mov %edx, %eax # ivtmp.45, ivtmp.45 + decl %ecx # j + movq (%r14, %rax,8), %rsi #* z, D.2360 + mov %ecx, %eax # j, j + movq (%r12, %rax,8), %rax #* x, temp.68 + cmpq %rax, %rsi + ja .L12 #, + decl %edx # ivtmp.45 + cmpq %rax, %rsi + jb .L_EXIT #, +.L14: + testl %ecx, %ecx # j + jne .L15 #, +.L12: + xorl %ecx, %ecx # j + xorl %r10d, %r10d # carry + mov %ebx, %esi # x_size, pretmp.19 + testl %r8d, %r8d # blocks_of_8 + je .L17 #, + .p2align 4,,10 + .p2align 3 +.L22: + mov %ecx, %edx # j, D.2375 + addl $8, %ecx #, j + leaq (%rdx,%rsi), %rax #, tmp146 + leaq (%r12,%rdx,8), %rdx #, tmp150 + leaq (%r14, %rax,8), %rax #, tmp148 + + rorq %r10 # carry + + movq 8*0(%rdx), %r10 + sbbq %r10, 8*0(%rax) + + movq 8*1(%rdx), %r10 + sbbq %r10, 8*1(%rax) + + movq 8*2(%rdx), %r10 + sbbq %r10, 8*2(%rax) + + movq 8*3(%rdx), %r10 + sbbq %r10, 8*3(%rax) + + movq 8*4(%rdx), %r10 + sbbq %r10, 8*4(%rax) + + movq 8*5(%rdx), %r10 + sbbq %r10, 8*5(%rax) + + movq 8*6(%rdx), %r10 + sbbq %r10, 8*6(%rax) + + movq 8*7(%rdx), %r10 + sbbq %r10, 8*7(%rax) + + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %ecx, %r8d # j, blocks_of_8 + jne .L22 #, +.L17: + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L19 #, + leal (%r8,%rbx), %r9d #, ivtmp.33 + movl %r8d, %esi # blocks_of_8, j + .p2align 4,,10 + .p2align 3 +.L20: + mov %r9d, %eax # ivtmp.33, ivtmp.33 + mov %esi, %ecx # j, j + leaq (%r14, %rax,8), %rax #, D.2387 + incl %esi # j + movq (%rax), %rdx #* D.2387, tmp153 + incl %r9d # ivtmp.33 + + rorq %r10 # carry + sbbq (%r12,%rcx,8),%rdx #* x, x + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %esi, %ebx # j, x_size + movq %rdx, (%rax) # x,* D.2387 + jne .L20 #, +.L19: + testq %r10, %r10 # carry + je .L_EXIT #, + decq (%rdi) #* D.2349 +.L_EXIT: + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # +END_FUNCTION(bigint_monty_redc) + + +#if 0 + #define Z_ARR ARG_1 // rdi +#define Z_SIZE ARG_2_32 // esi +// X_ARR is ARG_3 == rdx, moved b/c needed for multiply +#define X_SIZE ARG_4_32 // ecx +#define U ARG_5 // r8 + +/* + We need all arguments for a while (we can reuse U eventually) + So only temp registers are + TEMP_1 %r10 + TEMP_2 %r11 + TEMP_3 = ARG_6 = %r9 + void return, so also + R0 %rax (aka TEMP_9) + is free (but needed for multiply) + + Can push: + %rbx (base pointer, callee 
saved) + %rpb (frame pointer, callee saved) + %r12-%r15 (callee saved) + + Can push base/frame pointers since this is a leaf function + and does not reference any data. +*/ + + push %r12 + push %r13 + push %r14 + push %r15 + +#define LOOP_CTR_I %r12 +#define LOOP_CTR_J %r13 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define X_ARR TEMP_3 +#define MUL_LO %rax +#define MUL_HI %rdx + + ASSIGN(X_ARR, ARG_3) + + /* + ZEROIZE(CARRY) + + ASSIGN(LOOP_CTR, X_SIZE) + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +ALIGN +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 +*/ + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif diff --git a/src/asm/asm_amd64/mp_mulop_amd64.S b/src/asm/asm_amd64/mp_mulop_amd64.S new file mode 100644 index 000000000..e5bba23fb --- /dev/null +++ b/src/asm/asm_amd64/mp_mulop_amd64.S @@ -0,0 +1,128 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_mulop.S) + +#if 0 +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + const u32bit blocks = x_size - (x_size % 8); + + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + word carry = 0; + + /* + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_madd3(z + i + j, x + j, x[i], carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + */ + + + for(u32bit j = 0; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + for(u32bit j = 0; j != x_size; ++j) + { + dword z = (dword)a * b + c + *d; + *d = (word)(z >> BOTAN_MP_WORD_BITS); + return (word)z; + } + + + + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + } + + + + z[x_size+i] = carry; + } + } + +#endif + +START_FUNCTION(bigint_simple_sqr) + +#define Z_ARR ARG_1 +#define X_ARR ARG_2 +//#define X_SIZE ARG_3_32 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define LOOP_I TEMP_3 +#define LOOP_J TEMP_4 +#define X_SIZE TEMP_5 +#define MUL_LO %rax +// arg 3, xsize +#define MUL_HI %rdx + +// need arg3 == rdx for multiply + ASSIGN(X_SIZE, ARG3_32) + + ZEROIZE(CARRY) + + ZEROIZE(LOOP_I) + +.LOOP_ZEROIZE_Z: + + cmp LOOP_I, X_SIZE + + + + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 
64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 + +.L_MULADD_DONE: + RETURN_VALUE_IS(CARRY) +END_FUNCTION(bigint_simple_square) diff --git a/src/asm/asm_amd64/sha160.cpp b/src/asm/asm_amd64/sha160.cpp new file mode 100644 index 000000000..cfac02f45 --- /dev/null +++ b/src/asm/asm_amd64/sha160.cpp @@ -0,0 +1,52 @@ +/************************************************* +* SHA-160 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/sha160.h> +#include <botan/loadstor.h> + +namespace Botan { + +extern "C" void botan_sha160_asm_amd64(u32bit[5], const byte[64], u32bit[80]); + +/************************************************* +* SHA-160 Compression Function * +*************************************************/ +void SHA_160::hash(const byte input[]) + { + botan_sha160_asm_amd64(digest, input, W); + } + +/************************************************* +* Copy out the digest * +*************************************************/ +void SHA_160::copy_out(byte output[]) + { + for(u32bit j = 0; j != OUTPUT_LENGTH; ++j) + output[j] = get_byte(j % 4, digest[j/4]); + } + +/************************************************* +* Clear memory of sensitive data * +*************************************************/ +void SHA_160::clear() throw() + { + MDx_HashFunction::clear(); + W.clear(); + digest[0] = 0x67452301; + digest[1] = 0xEFCDAB89; + digest[2] = 0x98BADCFE; + digest[3] = 0x10325476; + digest[4] = 0xC3D2E1F0; + } + +/************************************************* +* SHA_160 Constructor * +*************************************************/ +SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(80) + { + clear(); + } + +} diff --git a/src/asm/asm_amd64/sha1_asm.S b/src/asm/asm_amd64/sha1_asm.S new file mode 100644 index 000000000..ecf4a18ce --- /dev/null +++ b/src/asm/asm_amd64/sha1_asm.S @@ -0,0 +1,258 @@ +/************************************************* +* SHA-160 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(sha1_asm.S) + +START_FUNCTION(botan_sha160_asm_amd64) + +#define DIGEST_ARR %rdi +#define INPUT %rsi +#define W %rdx +#define LOOP_CTR %eax + +#define A %r8d +#define B %r9d +#define C %r10d +#define D %r11d +#define E %ecx + + ZEROIZE(LOOP_CTR) + +ALIGN; +.LOOP_LOAD_INPUT: + addl $8, %eax + + movq ARRAY8(INPUT, 0), %r8 + movq ARRAY8(INPUT, 1), %r9 + movq ARRAY8(INPUT, 2), %r10 + movq ARRAY8(INPUT, 3), %r11 + + bswap %r8 + bswap %r9 + bswap %r10 + bswap %r11 + + rolq $32, %r8 + rolq $32, %r9 + rolq $32, %r10 + rolq $32, %r11 + + movq %r8, ARRAY8(W, 0) + movq %r9, ARRAY8(W, 1) + movq %r10, ARRAY8(W, 2) + movq %r11, ARRAY8(W, 3) + + addq $32, W + addq $32, INPUT + + cmp IMM(16), LOOP_CTR + jne .LOOP_LOAD_INPUT + +/* +#define A %r8d +#define B %r9d +#define C %r10d +#define D %r11d +#define E %ecx +*/ + +ALIGN; +.LOOP_EXPANSION: + addl $4, LOOP_CTR + + ZEROIZE(A) + ASSIGN(B, ARRAY4(W, -1)) + ASSIGN(C, ARRAY4(W, -2)) + ASSIGN(D, ARRAY4(W, -3)) + + XOR(A, ARRAY4(W, -5)) + XOR(B, ARRAY4(W, -6)) + XOR(C, ARRAY4(W, -7)) + XOR(D, ARRAY4(W, -8)) + + XOR(A, ARRAY4(W, -11)) + XOR(B, ARRAY4(W, -12)) + XOR(C, ARRAY4(W, -13)) + XOR(D, ARRAY4(W, -14)) + + XOR(A, ARRAY4(W, -13)) + XOR(B, ARRAY4(W, -14)) + XOR(C, ARRAY4(W, -15)) + XOR(D, ARRAY4(W, -16)) + + 
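/* At this point D, C and B hold W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]
   for t, t+1 and t+2; A holds the same XOR for t+3 minus its W[t]
   term, which does not exist yet. The rotates below finish
   W[t] = rotl1(...), and XOR(A, D) folds the freshly computed W[t]
   into A before A's own rotate. */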
ROTL_IMM(D, 1) + ROTL_IMM(C, 1) + ROTL_IMM(B, 1) + XOR(A, D) + ROTL_IMM(A, 1) + + ASSIGN(ARRAY4(W, 0), D) + ASSIGN(ARRAY4(W, 1), C) + ASSIGN(ARRAY4(W, 2), B) + ASSIGN(ARRAY4(W, 3), A) + + addq $16, W + cmp IMM(80), LOOP_CTR + jne .LOOP_EXPANSION + + subq $320, W + +#define MAGIC1 0x5A827999 +#define MAGIC2 0x6ED9EBA1 +#define MAGIC3 0x8F1BBCDC +#define MAGIC4 0xCA62C1D6 + +#define T %esi +#define T2 %eax + +#define F1(A, B, C, D, E, F, N) \ + ASSIGN(T2, ARRAY4(W, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, C) ; \ + XOR(E, D) ; \ + ADD3_IMM(F, T2, MAGIC1) ; \ + AND(E, B) ; \ + XOR(E, D) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F2_4(A, B, C, D, E, F, N, MAGIC) \ + ASSIGN(T2, ARRAY4(W, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, B) ; \ + XOR(E, C) ; \ + ADD3_IMM(F, T2, MAGIC) ; \ + XOR(E, D) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F3(A, B, C, D, E, F, N) \ + ASSIGN(T2, ARRAY4(W, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, B) ; \ + OR(E, C) ; \ + AND(E, D) ; \ + ADD3_IMM(F, T2, MAGIC3) ; \ + ASSIGN(T2, B) ; \ + AND(T2, C) ; \ + OR(E, T2) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F2(A, B, C, D, E, F, W) \ + F2_4(A, B, C, D, E, F, W, MAGIC2) + +#define F4(A, B, C, D, E, F, W) \ + F2_4(A, B, C, D, E, F, W, MAGIC4) + + ASSIGN(T, ARRAY4(DIGEST_ARR, 0)) + ASSIGN(B, ARRAY4(DIGEST_ARR, 1)) + ASSIGN(C, ARRAY4(DIGEST_ARR, 2)) + ASSIGN(D, ARRAY4(DIGEST_ARR, 3)) + ASSIGN(E, ARRAY4(DIGEST_ARR, 4)) + + /* First Round */ + F1(A, B, C, D, E, T, 0) + F1(T, A, B, C, D, E, 1) + F1(E, T, A, B, C, D, 2) + F1(D, E, T, A, B, C, 3) + F1(C, D, E, T, A, B, 4) + F1(B, C, D, E, T, A, 5) + F1(A, B, C, D, E, T, 6) + F1(T, A, B, C, D, E, 7) + F1(E, T, A, B, C, D, 8) + F1(D, E, T, A, B, C, 9) + F1(C, D, E, T, A, B, 10) + F1(B, C, D, E, T, A, 11) + F1(A, B, C, D, E, T, 12) + F1(T, A, B, C, D, E, 13) + F1(E, T, A, B, C, D, 14) + F1(D, E, T, A, B, C, 15) + F1(C, D, E, T, A, B, 16) + F1(B, C, D, E, T, A, 17) + F1(A, B, C, D, E, T, 18) + F1(T, A, B, C, D, E, 19) + + /* Second Round */ + F2(E, T, A, B, C, D, 20) + F2(D, E, T, A, B, C, 21) + F2(C, D, E, T, A, B, 22) + F2(B, C, D, E, T, A, 23) + F2(A, B, C, D, E, T, 24) + F2(T, A, B, C, D, E, 25) + F2(E, T, A, B, C, D, 26) + F2(D, E, T, A, B, C, 27) + F2(C, D, E, T, A, B, 28) + F2(B, C, D, E, T, A, 29) + F2(A, B, C, D, E, T, 30) + F2(T, A, B, C, D, E, 31) + F2(E, T, A, B, C, D, 32) + F2(D, E, T, A, B, C, 33) + F2(C, D, E, T, A, B, 34) + F2(B, C, D, E, T, A, 35) + F2(A, B, C, D, E, T, 36) + F2(T, A, B, C, D, E, 37) + F2(E, T, A, B, C, D, 38) + F2(D, E, T, A, B, C, 39) + + /* Third Round */ + F3(C, D, E, T, A, B, 40) + F3(B, C, D, E, T, A, 41) + F3(A, B, C, D, E, T, 42) + F3(T, A, B, C, D, E, 43) + F3(E, T, A, B, C, D, 44) + F3(D, E, T, A, B, C, 45) + F3(C, D, E, T, A, B, 46) + F3(B, C, D, E, T, A, 47) + F3(A, B, C, D, E, T, 48) + F3(T, A, B, C, D, E, 49) + F3(E, T, A, B, C, D, 50) + F3(D, E, T, A, B, C, 51) + F3(C, D, E, T, A, B, 52) + F3(B, C, D, E, T, A, 53) + F3(A, B, C, D, E, T, 54) + F3(T, A, B, C, D, E, 55) + F3(E, T, A, B, C, D, 56) + F3(D, E, T, A, B, C, 57) + F3(C, D, E, T, A, B, 58) + F3(B, C, D, E, T, A, 59) + + /* Fourth Round */ + F4(A, B, C, D, E, T, 60) + F4(T, A, B, C, D, E, 61) + F4(E, T, A, B, C, D, 62) + F4(D, E, T, A, B, C, 63) + F4(C, D, E, T, A, B, 64) + F4(B, C, D, E, T, A, 65) + F4(A, B, C, D, E, T, 66) + F4(T, A, B, C, D, E, 67) + F4(E, T, A, B, C, D, 68) + F4(D, E, T, A, B, C, 69) + F4(C, D, E, T, A, B, 70) + F4(B, C, D, E, T, A, 71) + 
F4(A, B, C, D, E, T, 72) + F4(T, A, B, C, D, E, 73) + F4(E, T, A, B, C, D, 74) + F4(D, E, T, A, B, C, 75) + F4(C, D, E, T, A, B, 76) + F4(B, C, D, E, T, A, 77) + F4(A, B, C, D, E, T, 78) + F4(T, A, B, C, D, E, 79) + + ADD(ARRAY4(DIGEST_ARR, 0), D) + ADD(ARRAY4(DIGEST_ARR, 1), T) + ADD(ARRAY4(DIGEST_ARR, 2), A) + ADD(ARRAY4(DIGEST_ARR, 3), B) + ADD(ARRAY4(DIGEST_ARR, 4), C) + +END_FUNCTION(botan_sha160_asm_amd64) diff --git a/src/asm/asm_ia32/asm_macr.h b/src/asm/asm_ia32/asm_macr.h new file mode 100644 index 000000000..6d5dbb59d --- /dev/null +++ b/src/asm/asm_ia32/asm_macr.h @@ -0,0 +1,131 @@ +/************************************************* +* Assembly Macros Header File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_IA32_ASM_MACROS_H__ +#define BOTAN_IA32_ASM_MACROS_H__ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +/************************************************* +* General/Global Macros * +*************************************************/ +#define ALIGN .p2align 4,,15 + +#define START_LISTING(FILENAME) \ + .file #FILENAME; \ + .text; \ + .p2align 4,,15; + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +/************************************************* +* Function Definitions * +*************************************************/ +#define START_FUNCTION(func_name) \ + .align 8; \ + ALIGN; \ + .global func_name; \ + .type func_name,@function; \ +func_name: + +#define END_FUNCTION(func_name) \ + ret + +/************************************************* +* Loop Control * +*************************************************/ +#define START_LOOP(LABEL) \ + ALIGN; \ + LABEL##_LOOP: + +#define LOOP_UNTIL_EQ(REG, NUM, LABEL) \ + cmpl IMM(NUM), REG; \ + jne LABEL##_LOOP + +#define LOOP_UNTIL_LT(REG, NUM, LABEL) \ + cmpl IMM(NUM), REG; \ + jge LABEL##_LOOP + +/************************************************* + Conditional Jumps * +*************************************************/ +#define JUMP_IF_ZERO(REG, LABEL) \ + cmpl IMM(0), REG; \ + jz LABEL + +#define JUMP_IF_LT(REG, NUM, LABEL) \ + cmpl IMM(NUM), REG; \ + jl LABEL + +/************************************************* +* Register Names * +*************************************************/ +#define EAX %eax +#define EBX %ebx +#define ECX %ecx +#define EDX %edx +#define EBP %ebp +#define EDI %edi +#define ESI %esi +#define ESP %esp + +/************************************************* +* Memory Access Operations * +*************************************************/ +#define ARRAY1(REG, NUM) (NUM)(REG) +#define ARRAY4(REG, NUM) 4*(NUM)(REG) +#define ARRAY4_INDIRECT(BASE, OFFSET, NUM) 4*(NUM)(BASE,OFFSET,4) +#define ARG(NUM) 4*(PUSHED) + ARRAY4(ESP, NUM) + +#define ASSIGN(TO, FROM) movl FROM, TO +#define ASSIGN_BYTE(TO, FROM) movzbl FROM, TO + +#define PUSH(REG) pushl REG +#define POP(REG) popl REG + +#define SPILL_REGS() \ + PUSH(EBP) ; \ + PUSH(EDI) ; \ + PUSH(ESI) ; \ + PUSH(EBX) + +#define RESTORE_REGS() \ + POP(EBX) ; \ + POP(ESI) ; \ + POP(EDI) ; \ + POP(EBP) + +/************************************************* +* ALU Operations * +*************************************************/ +#define IMM(VAL) $VAL + +#define ADD(TO, FROM) addl FROM, TO +#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM)) +#define ADD_W_CARRY(TO1, TO2, FROM) addl FROM, TO1; adcl IMM(0), TO2; +#define SUB_IMM(TO, NUM) subl IMM(NUM), TO +#define ADD2_IMM(TO, FROM, NUM) leal NUM(FROM), TO +#define ADD3_IMM(TO, FROM, NUM) leal NUM(TO,FROM,1), TO +#define MUL(REG) mull REG + 
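/* Example expansion: ADD_W_CARRY(EAX, EDX, EDI) becomes
   "addl %edi, %eax; adcl $0, %edx;" -- a 32-bit add whose carry-out
   is folded into a second register. Note the macros take dst-first
   (Intel-style) arguments but emit AT&T src-first syntax. */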
+#define SHL_IMM(REG, SHIFT) shll IMM(SHIFT), REG +#define SHR_IMM(REG, SHIFT) shrl IMM(SHIFT), REG +#define SHL2_3(TO, FROM) leal 0(,FROM,8), TO + +#define XOR(TO, FROM) xorl FROM, TO +#define AND(TO, FROM) andl FROM, TO +#define OR(TO, FROM) orl FROM, TO +#define NOT(REG) notl REG +#define ZEROIZE(REG) XOR(REG, REG) + +#define ROTL_IMM(REG, NUM) roll IMM(NUM), REG +#define ROTR_IMM(REG, NUM) rorl IMM(NUM), REG +#define BSWAP(REG) bswapl REG + +#endif diff --git a/src/asm/asm_ia32/md4.cpp b/src/asm/asm_ia32/md4.cpp new file mode 100644 index 000000000..e3dc79012 --- /dev/null +++ b/src/asm/asm_ia32/md4.cpp @@ -0,0 +1,43 @@ +/************************************************* +* MD4 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/md4.h> +#include <botan/loadstor.h> + +namespace Botan { + +extern "C" void md4_core(u32bit[4], const byte[64], u32bit[16]); + +/************************************************* +* MD4 Compression Function * +*************************************************/ +void MD4::hash(const byte input[]) + { + md4_core(digest, input, M); + } + +/************************************************* +* Copy out the digest * +*************************************************/ +void MD4::copy_out(byte output[]) + { + for(u32bit j = 0; j != OUTPUT_LENGTH; ++j) + output[j] = get_byte(3 - (j % 4), digest[j/4]); + } + +/************************************************* +* Clear memory of sensitive data * +*************************************************/ +void MD4::clear() throw() + { + MDx_HashFunction::clear(); + M.clear(); + digest[0] = 0x67452301; + digest[1] = 0xEFCDAB89; + digest[2] = 0x98BADCFE; + digest[3] = 0x10325476; + } + +} diff --git a/src/asm/asm_ia32/md4core.S b/src/asm/asm_ia32/md4core.S new file mode 100644 index 000000000..662e9924a --- /dev/null +++ b/src/asm/asm_ia32/md4core.S @@ -0,0 +1,135 @@ +/************************************************* +* MD4 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(md4core.S) + +START_FUNCTION(md4_core) + SPILL_REGS() + +#define PUSHED 4 + + ASSIGN(EBP, ARG(2)) /* input block */ + ASSIGN(EDI, ARG(3)) /* expanded words */ + + ZEROIZE(ESI) + +START_LOOP(.LOAD_INPUT) + ADD_IMM(ESI, 4) + + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + + ADD_IMM(EBP, 16) + + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX) +LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT) + + ASSIGN(EBP, ARG(1)) + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + +#define MSG EDI +#define T1 ESI +#define T2 EBP + +#define FF(A, B, C, D, N, S) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, C) ; \ + XOR(T2, D) ; \ + AND(T2, B) ; \ + XOR(T2, D) ; \ + ADD(A, T1) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; + +#define GG(A, B, C, D, N, S) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, B) ; \ + OR(T2, C) ; \ + AND(T2, D) ; \ + ADD3_IMM(A, T1, 0x5A827999) ; \ + ASSIGN(T1, B) ; \ + AND(T1, C) ; \ + OR(T2, T1) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; + +#define HH(A, B, C, D, N, S) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, B) ; \ + XOR(T2, C) ; \ + XOR(T2, D) ; \ + ADD3_IMM(A, T1, 0x6ED9EBA1) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; + + FF(EAX,EBX,ECX,EDX, 0, 3); + 
FF(EDX,EAX,EBX,ECX, 1, 7); + FF(ECX,EDX,EAX,EBX, 2,11); + FF(EBX,ECX,EDX,EAX, 3,19); + FF(EAX,EBX,ECX,EDX, 4, 3); + FF(EDX,EAX,EBX,ECX, 5, 7); + FF(ECX,EDX,EAX,EBX, 6,11); + FF(EBX,ECX,EDX,EAX, 7,19); + FF(EAX,EBX,ECX,EDX, 8, 3); + FF(EDX,EAX,EBX,ECX, 9, 7); + FF(ECX,EDX,EAX,EBX,10,11); + FF(EBX,ECX,EDX,EAX,11,19); + FF(EAX,EBX,ECX,EDX,12, 3); + FF(EDX,EAX,EBX,ECX,13, 7); + FF(ECX,EDX,EAX,EBX,14,11); + FF(EBX,ECX,EDX,EAX,15,19); + + GG(EAX,EBX,ECX,EDX, 0, 3); + GG(EDX,EAX,EBX,ECX, 4, 5); + GG(ECX,EDX,EAX,EBX, 8, 9); + GG(EBX,ECX,EDX,EAX,12,13); + GG(EAX,EBX,ECX,EDX, 1, 3); + GG(EDX,EAX,EBX,ECX, 5, 5); + GG(ECX,EDX,EAX,EBX, 9, 9); + GG(EBX,ECX,EDX,EAX,13,13); + GG(EAX,EBX,ECX,EDX, 2, 3); + GG(EDX,EAX,EBX,ECX, 6, 5); + GG(ECX,EDX,EAX,EBX,10, 9); + GG(EBX,ECX,EDX,EAX,14,13); + GG(EAX,EBX,ECX,EDX, 3, 3); + GG(EDX,EAX,EBX,ECX, 7, 5); + GG(ECX,EDX,EAX,EBX,11, 9); + GG(EBX,ECX,EDX,EAX,15,13); + + HH(EAX,EBX,ECX,EDX, 0, 3); + HH(EDX,EAX,EBX,ECX, 8, 9); + HH(ECX,EDX,EAX,EBX, 4,11); + HH(EBX,ECX,EDX,EAX,12,15); + HH(EAX,EBX,ECX,EDX, 2, 3); + HH(EDX,EAX,EBX,ECX,10, 9); + HH(ECX,EDX,EAX,EBX, 6,11); + HH(EBX,ECX,EDX,EAX,14,15); + HH(EAX,EBX,ECX,EDX, 1, 3); + HH(EDX,EAX,EBX,ECX, 9, 9); + HH(ECX,EDX,EAX,EBX, 5,11); + HH(EBX,ECX,EDX,EAX,13,15); + HH(EAX,EBX,ECX,EDX, 3, 3); + HH(EDX,EAX,EBX,ECX,11, 9); + HH(ECX,EDX,EAX,EBX, 7,11); + HH(EBX,ECX,EDX,EAX,15,15); + + ASSIGN(EBP, ARG(1)) + ADD(ARRAY4(EBP, 0), EAX) + ADD(ARRAY4(EBP, 1), EBX) + ADD(ARRAY4(EBP, 2), ECX) + ADD(ARRAY4(EBP, 3), EDX) + + RESTORE_REGS() +END_FUNCTION(md4_core) diff --git a/src/asm/asm_ia32/md5.cpp b/src/asm/asm_ia32/md5.cpp new file mode 100644 index 000000000..cfe48e7e9 --- /dev/null +++ b/src/asm/asm_ia32/md5.cpp @@ -0,0 +1,43 @@ +/************************************************* +* MD5 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/md5.h> +#include <botan/loadstor.h> + +namespace Botan { + +extern "C" void md5_core(u32bit[4], const byte[64], u32bit[16]); + +/************************************************* +* MD5 Compression Function * +*************************************************/ +void MD5::hash(const byte input[]) + { + md5_core(digest, input, M); + } + +/************************************************* +* Copy out the digest * +*************************************************/ +void MD5::copy_out(byte output[]) + { + for(u32bit j = 0; j != OUTPUT_LENGTH; ++j) + output[j] = get_byte(3 - (j % 4), digest[j/4]); + } + +/************************************************* +* Clear memory of sensitive data * +*************************************************/ +void MD5::clear() throw() + { + MDx_HashFunction::clear(); + M.clear(); + digest[0] = 0x67452301; + digest[1] = 0xEFCDAB89; + digest[2] = 0x98BADCFE; + digest[3] = 0x10325476; + } + +} diff --git a/src/asm/asm_ia32/md5core.S b/src/asm/asm_ia32/md5core.S new file mode 100644 index 000000000..8ebe469f3 --- /dev/null +++ b/src/asm/asm_ia32/md5core.S @@ -0,0 +1,164 @@ +/************************************************* +* MD5 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(md5core.S) + +START_FUNCTION(md5_core) + SPILL_REGS() + +#define PUSHED 4 + + ASSIGN(EBP, ARG(2)) /* input block */ + ASSIGN(EDI, ARG(3)) /* expanded words */ + + ZEROIZE(ESI) + +START_LOOP(.LOAD_INPUT) + ADD_IMM(ESI, 4) + + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + + 
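/* MD5 (like MD4) reads the message block little-endian, so the 16
   input words are copied unchanged; contrast sha1_asm.S, which
   byte-swaps each word during this copy. */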
ADD_IMM(EBP, 16) + + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-4), EAX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-3), EBX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-2), ECX) + ASSIGN(ARRAY4_INDIRECT(EDI,ESI,-1), EDX) +LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT) + + ASSIGN(EBP, ARG(1)) + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + +#define MSG EDI +#define T1 ESI +#define T2 EBP + +#define FF(A, B, C, D, N, S, MAGIC) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, C) ; \ + XOR(T2, D) ; \ + AND(T2, B) ; \ + XOR(T2, D) ; \ + ADD3_IMM(A, T1, MAGIC) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; \ + ADD(A, B) ; + +#define GG(A, B, C, D, N, S, MAGIC) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, B) ; \ + XOR(T2, C) ; \ + AND(T2, D) ; \ + XOR(T2, C) ; \ + ADD3_IMM(A, T1, MAGIC) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; \ + ADD(A, B) ; + +#define HH(A, B, C, D, N, S, MAGIC) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, B) ; \ + XOR(T2, C) ; \ + XOR(T2, D) ; \ + ADD3_IMM(A, T1, MAGIC) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; \ + ADD(A, B) ; + +#define II(A, B, C, D, N, S, MAGIC) \ + ASSIGN(T1, ARRAY4(MSG, N)) ; \ + ASSIGN(T2, D) ; \ + NOT(T2) ; \ + OR(T2, B) ; \ + XOR(T2, C) ; \ + ADD3_IMM(A, T1, MAGIC) ; \ + ADD(A, T2) ; \ + ROTL_IMM(A, S) ; \ + ADD(A, B) ; + + FF(EAX,EBX,ECX,EDX, 0, 7,0xD76AA478); + FF(EDX,EAX,EBX,ECX, 1,12,0xE8C7B756); + FF(ECX,EDX,EAX,EBX, 2,17,0x242070DB); + FF(EBX,ECX,EDX,EAX, 3,22,0xC1BDCEEE); + FF(EAX,EBX,ECX,EDX, 4, 7,0xF57C0FAF); + FF(EDX,EAX,EBX,ECX, 5,12,0x4787C62A); + FF(ECX,EDX,EAX,EBX, 6,17,0xA8304613); + FF(EBX,ECX,EDX,EAX, 7,22,0xFD469501); + FF(EAX,EBX,ECX,EDX, 8, 7,0x698098D8); + FF(EDX,EAX,EBX,ECX, 9,12,0x8B44F7AF); + FF(ECX,EDX,EAX,EBX,10,17,0xFFFF5BB1); + FF(EBX,ECX,EDX,EAX,11,22,0x895CD7BE); + FF(EAX,EBX,ECX,EDX,12, 7,0x6B901122); + FF(EDX,EAX,EBX,ECX,13,12,0xFD987193); + FF(ECX,EDX,EAX,EBX,14,17,0xA679438E); + FF(EBX,ECX,EDX,EAX,15,22,0x49B40821); + + GG(EAX,EBX,ECX,EDX, 1, 5,0xF61E2562); + GG(EDX,EAX,EBX,ECX, 6, 9,0xC040B340); + GG(ECX,EDX,EAX,EBX,11,14,0x265E5A51); + GG(EBX,ECX,EDX,EAX, 0,20,0xE9B6C7AA); + GG(EAX,EBX,ECX,EDX, 5, 5,0xD62F105D); + GG(EDX,EAX,EBX,ECX,10, 9,0x02441453); + GG(ECX,EDX,EAX,EBX,15,14,0xD8A1E681); + GG(EBX,ECX,EDX,EAX, 4,20,0xE7D3FBC8); + GG(EAX,EBX,ECX,EDX, 9, 5,0x21E1CDE6); + GG(EDX,EAX,EBX,ECX,14, 9,0xC33707D6); + GG(ECX,EDX,EAX,EBX, 3,14,0xF4D50D87); + GG(EBX,ECX,EDX,EAX, 8,20,0x455A14ED); + GG(EAX,EBX,ECX,EDX,13, 5,0xA9E3E905); + GG(EDX,EAX,EBX,ECX, 2, 9,0xFCEFA3F8); + GG(ECX,EDX,EAX,EBX, 7,14,0x676F02D9); + GG(EBX,ECX,EDX,EAX,12,20,0x8D2A4C8A); + + HH(EAX,EBX,ECX,EDX, 5, 4,0xFFFA3942); + HH(EDX,EAX,EBX,ECX, 8,11,0x8771F681); + HH(ECX,EDX,EAX,EBX,11,16,0x6D9D6122); + HH(EBX,ECX,EDX,EAX,14,23,0xFDE5380C); + HH(EAX,EBX,ECX,EDX, 1, 4,0xA4BEEA44); + HH(EDX,EAX,EBX,ECX, 4,11,0x4BDECFA9); + HH(ECX,EDX,EAX,EBX, 7,16,0xF6BB4B60); + HH(EBX,ECX,EDX,EAX,10,23,0xBEBFBC70); + HH(EAX,EBX,ECX,EDX,13, 4,0x289B7EC6); + HH(EDX,EAX,EBX,ECX, 0,11,0xEAA127FA); + HH(ECX,EDX,EAX,EBX, 3,16,0xD4EF3085); + HH(EBX,ECX,EDX,EAX, 6,23,0x04881D05); + HH(EAX,EBX,ECX,EDX, 9, 4,0xD9D4D039); + HH(EDX,EAX,EBX,ECX,12,11,0xE6DB99E5); + HH(ECX,EDX,EAX,EBX,15,16,0x1FA27CF8); + HH(EBX,ECX,EDX,EAX, 2,23,0xC4AC5665); + + II(EAX,EBX,ECX,EDX, 0, 6,0xF4292244); + II(EDX,EAX,EBX,ECX, 7,10,0x432AFF97); + II(ECX,EDX,EAX,EBX,14,15,0xAB9423A7); + II(EBX,ECX,EDX,EAX, 5,21,0xFC93A039); + II(EAX,EBX,ECX,EDX,12, 6,0x655B59C3); + II(EDX,EAX,EBX,ECX, 3,10,0x8F0CCC92); + II(ECX,EDX,EAX,EBX,10,15,0xFFEFF47D); + II(EBX,ECX,EDX,EAX, 1,21,0x85845DD1); 
+ II(EAX,EBX,ECX,EDX, 8, 6,0x6FA87E4F); + II(EDX,EAX,EBX,ECX,15,10,0xFE2CE6E0); + II(ECX,EDX,EAX,EBX, 6,15,0xA3014314); + II(EBX,ECX,EDX,EAX,13,21,0x4E0811A1); + II(EAX,EBX,ECX,EDX, 4, 6,0xF7537E82); + II(EDX,EAX,EBX,ECX,11,10,0xBD3AF235); + II(ECX,EDX,EAX,EBX, 2,15,0x2AD7D2BB); + II(EBX,ECX,EDX,EAX, 9,21,0xEB86D391); + + ASSIGN(EBP, ARG(1)) + ADD(ARRAY4(EBP, 0), EAX) + ADD(ARRAY4(EBP, 1), EBX) + ADD(ARRAY4(EBP, 2), ECX) + ADD(ARRAY4(EBP, 3), EDX) + + RESTORE_REGS() +END_FUNCTION(md5_core) diff --git a/src/asm/asm_ia32/modinfo.txt b/src/asm/asm_ia32/modinfo.txt new file mode 100644 index 000000000..12c8cd96d --- /dev/null +++ b/src/asm/asm_ia32/modinfo.txt @@ -0,0 +1,43 @@ +realname "x86 Assembler" + +#mp_bits 32 + +load_on asm_ok + +<replace> +md4.cpp +md5.cpp +sha160.cpp +serpent.cpp +</replace> + +<ignore> +#mp_mulop.cpp +</ignore> + +<add> +asm_macr.h +md4core.S +md5core.S +sha1_asm.S +serp_asm.S +#mp_mulop.S +</add> + +<arch> +ia32 +</arch> + +<cc> +gcc +icc +</cc> + +# ELF systems +<os> +linux +freebsd +netbsd +openbsd +solaris +</os> diff --git a/src/asm/asm_ia32/mp_mulop.S b/src/asm/asm_ia32/mp_mulop.S new file mode 100644 index 000000000..a5f0d3b27 --- /dev/null +++ b/src/asm/asm_ia32/mp_mulop.S @@ -0,0 +1,62 @@ +/************************************************* +* Multiply/Add Algorithm Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(mp_muladd.S) + +START_FUNCTION(bigint_mul_add_words) + SPILL_REGS() +#define PUSHED 4 + +#define LOOP_CTR ESI + ASSIGN(LOOP_CTR, ARG(3)) /* x_size */ + ZEROIZE(EDI) + + ASSIGN(ECX, ARG(1)) /* z[] */ + ASSIGN(EBX, ARG(2)) /* x[] */ + ASSIGN(EBP, ARG(4)) /* y */ + +#define MULADD_OP(N) \ + ASSIGN(EAX, ARRAY4(EBX, N)) ; \ + MUL(EBP) ; \ + ADD_W_CARRY(EAX, EDX, EDI) ; \ + ASSIGN(EDI, EDX) ; \ + ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ; + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP) + +START_LOOP(.MULADD8) + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(EBX, 32) + ADD_IMM(ECX, 32) +LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8) + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + +START_LOOP(.MULADD1) + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(EBX, 4) + ADD_IMM(ECX, 4) +LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1) + +.MUL_ADD_DONE: + + ASSIGN(EAX, EDI) +#undef PUSHED + RESTORE_REGS() +END_FUNCTION(bigint_mul_add_words) diff --git a/src/asm/asm_ia32/serp_asm.S b/src/asm/asm_ia32/serp_asm.S new file mode 100644 index 000000000..c8915382d --- /dev/null +++ b/src/asm/asm_ia32/serp_asm.S @@ -0,0 +1,667 @@ +/************************************************* +* Serpent Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(serp_asm.S) + +#define SBOX_E1(A, B, C, D, T) \ + XOR(D, A) ; \ + ASSIGN(T, B) ; \ + AND(B, D) ; \ + XOR(T, C) ; \ + XOR(B, A) ; \ + OR(A, D) ; \ + XOR(A, T) ; \ + XOR(T, D) ; \ + XOR(D, C) ; \ + OR(C, B) ; \ + XOR(C, T) ; \ + NOT(T) ; \ + OR(T, B) ; \ + XOR(B, D) ; \ + XOR(B, T) ; \ + OR(D, A) ; \ + XOR(B, D) ; \ + XOR(T, D) ; \ + ASSIGN(D, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, T) ; + +#define SBOX_E2(A, B, C, D, T) \ + NOT(A) ; \ + NOT(C) ; \ + ASSIGN(T, A) ; \ + AND(A, B) ; \ + XOR(C, A) ; \ + OR(A, D) ; \ + XOR(D, C) ; \ + XOR(B, A) ; \ + XOR(A, T) ; \ + OR(T, B) ; \ + XOR(B, D) ; \ + OR(C, A) ; \ + AND(C, T) ; \ + XOR(A, B) ; \ + AND(B, 
C) ; \ + XOR(B, A) ; \ + AND(A, C) ; \ + XOR(T, A) ; \ + ASSIGN(A, C) ; \ + ASSIGN(C, D) ; \ + ASSIGN(D, B) ; \ + ASSIGN(B, T) ; + +#define SBOX_E3(A, B, C, D, T) \ + ASSIGN(T, A) ; \ + AND(A, C) ; \ + XOR(A, D) ; \ + XOR(C, B) ; \ + XOR(C, A) ; \ + OR(D, T) ; \ + XOR(D, B) ; \ + XOR(T, C) ; \ + ASSIGN(B, D) ; \ + OR(D, T) ; \ + XOR(D, A) ; \ + AND(A, B) ; \ + XOR(T, A) ; \ + XOR(B, D) ; \ + XOR(B, T) ; \ + NOT(T) ; \ + ASSIGN(A, C) ; \ + ASSIGN(C, B) ; \ + ASSIGN(B, D) ; \ + ASSIGN(D, T) ; + +#define SBOX_E4(A, B, C, D, T) \ + ASSIGN(T, A) ; \ + OR(A, D) ; \ + XOR(D, B) ; \ + AND(B, T) ; \ + XOR(T, C) ; \ + XOR(C, D) ; \ + AND(D, A) ; \ + OR(T, B) ; \ + XOR(D, T) ; \ + XOR(A, B) ; \ + AND(T, A) ; \ + XOR(B, D) ; \ + XOR(T, C) ; \ + OR(B, A) ; \ + XOR(B, C) ; \ + XOR(A, D) ; \ + ASSIGN(C, B) ; \ + OR(B, D) ; \ + XOR(B, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, C) ; \ + ASSIGN(C, D) ; \ + ASSIGN(D, T) ; + +#define SBOX_E5(A, B, C, D, T) \ + XOR(B, D) ; \ + NOT(D) ; \ + XOR(C, D) ; \ + XOR(D, A) ; \ + ASSIGN(T, B) ; \ + AND(B, D) ; \ + XOR(B, C) ; \ + XOR(T, D) ; \ + XOR(A, T) ; \ + AND(C, T) ; \ + XOR(C, A) ; \ + AND(A, B) ; \ + XOR(D, A) ; \ + OR(T, B) ; \ + XOR(T, A) ; \ + OR(A, D) ; \ + XOR(A, C) ; \ + AND(C, D) ; \ + NOT(A) ; \ + XOR(T, C) ; \ + ASSIGN(C, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, T) ; + +#define SBOX_E6(A, B, C, D, T) \ + XOR(A, B) ; \ + XOR(B, D) ; \ + NOT(D) ; \ + ASSIGN(T, B) ; \ + AND(B, A) ; \ + XOR(C, D) ; \ + XOR(B, C) ; \ + OR(C, T) ; \ + XOR(T, D) ; \ + AND(D, B) ; \ + XOR(D, A) ; \ + XOR(T, B) ; \ + XOR(T, C) ; \ + XOR(C, A) ; \ + AND(A, D) ; \ + NOT(C) ; \ + XOR(A, T) ; \ + OR(T, D) ; \ + XOR(T, C) ; \ + ASSIGN(C, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, D) ; \ + ASSIGN(D, T) ; + +#define SBOX_E7(A, B, C, D, T) \ + NOT(C) ; \ + ASSIGN(T, D) ; \ + AND(D, A) ; \ + XOR(A, T) ; \ + XOR(D, C) ; \ + OR(C, T) ; \ + XOR(B, D) ; \ + XOR(C, A) ; \ + OR(A, B) ; \ + XOR(C, B) ; \ + XOR(T, A) ; \ + OR(A, D) ; \ + XOR(A, C) ; \ + XOR(T, D) ; \ + XOR(T, A) ; \ + NOT(D) ; \ + AND(C, T) ; \ + XOR(C, D) ; \ + ASSIGN(D, C) ; \ + ASSIGN(C, T) ; + +#define SBOX_E8(A, B, C, D, T) \ + ASSIGN(T, B) ; \ + OR(B, C) ; \ + XOR(B, D) ; \ + XOR(T, C) ; \ + XOR(C, B) ; \ + OR(D, T) ; \ + AND(D, A) ; \ + XOR(T, C) ; \ + XOR(D, B) ; \ + OR(B, T) ; \ + XOR(B, A) ; \ + OR(A, T) ; \ + XOR(A, C) ; \ + XOR(B, T) ; \ + XOR(C, B) ; \ + AND(B, A) ; \ + XOR(B, T) ; \ + NOT(C) ; \ + OR(C, A) ; \ + XOR(T, C) ; \ + ASSIGN(C, B) ; \ + ASSIGN(B, D) ; \ + ASSIGN(D, A) ; \ + ASSIGN(A, T) ; + +#define SBOX_D1(A, B, C, D, T) \ + NOT(C) ; \ + ASSIGN(T, B) ; \ + OR(B, A) ; \ + NOT(T) ; \ + XOR(B, C) ; \ + OR(C, T) ; \ + XOR(B, D) ; \ + XOR(A, T) ; \ + XOR(C, A) ; \ + AND(A, D) ; \ + XOR(T, A) ; \ + OR(A, B) ; \ + XOR(A, C) ; \ + XOR(D, T) ; \ + XOR(C, B) ; \ + XOR(D, A) ; \ + XOR(D, B) ; \ + AND(C, D) ; \ + XOR(T, C) ; \ + ASSIGN(C, B) ; \ + ASSIGN(B, T) ; + +#define SBOX_D2(A, B, C, D, T) \ + ASSIGN(T, B) ; \ + XOR(B, D) ; \ + AND(D, B) ; \ + XOR(T, C) ; \ + XOR(D, A) ; \ + OR(A, B) ; \ + XOR(C, D) ; \ + XOR(A, T) ; \ + OR(A, C) ; \ + XOR(B, D) ; \ + XOR(A, B) ; \ + OR(B, D) ; \ + XOR(B, A) ; \ + NOT(T) ; \ + XOR(T, B) ; \ + OR(B, A) ; \ + XOR(B, A) ; \ + OR(B, T) ; \ + XOR(D, B) ; \ + ASSIGN(B, A) ; \ + ASSIGN(A, T) ; \ + ASSIGN(T, D) ; \ + ASSIGN(D, C) ; \ + ASSIGN(C, T) ; + +#define SBOX_D3(A, B, C, D, T) \ + XOR(C, D) ; \ + XOR(D, A) ; \ + ASSIGN(T, D) ; \ + AND(D, C) ; \ + XOR(D, B) ; \ + OR(B, C) ; \ + XOR(B, T) ; \ + AND(T, D) ; \ + XOR(C, D) ; \ + AND(T, A) ; \ + XOR(T, C) ; \ + AND(C, B) ; \ + OR(C, A) ; \ + 
NOT(D) ; \ + XOR(C, D) ; \ + XOR(A, D) ; \ + AND(A, B) ; \ + XOR(D, T) ; \ + XOR(D, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, T) ; + +#define SBOX_D4(A, B, C, D, T) \ + ASSIGN(T, C) ; \ + XOR(C, B) ; \ + XOR(A, C) ; \ + AND(T, C) ; \ + XOR(T, A) ; \ + AND(A, B) ; \ + XOR(B, D) ; \ + OR(D, T) ; \ + XOR(C, D) ; \ + XOR(A, D) ; \ + XOR(B, T) ; \ + AND(D, C) ; \ + XOR(D, B) ; \ + XOR(B, A) ; \ + OR(B, C) ; \ + XOR(A, D) ; \ + XOR(B, T) ; \ + XOR(A, B) ; \ + ASSIGN(T, A) ; \ + ASSIGN(A, C) ; \ + ASSIGN(C, D) ; \ + ASSIGN(D, T) ; + +#define SBOX_D5(A, B, C, D, T) \ + ASSIGN(T, C) ; \ + AND(C, D) ; \ + XOR(C, B) ; \ + OR(B, D) ; \ + AND(B, A) ; \ + XOR(T, C) ; \ + XOR(T, B) ; \ + AND(B, C) ; \ + NOT(A) ; \ + XOR(D, T) ; \ + XOR(B, D) ; \ + AND(D, A) ; \ + XOR(D, C) ; \ + XOR(A, B) ; \ + AND(C, A) ; \ + XOR(D, A) ; \ + XOR(C, T) ; \ + OR(C, D) ; \ + XOR(D, A) ; \ + XOR(C, B) ; \ + ASSIGN(B, D) ; \ + ASSIGN(D, T) ; + +#define SBOX_D6(A, B, C, D, T) \ + NOT(B) ; \ + ASSIGN(T, D) ; \ + XOR(C, B) ; \ + OR(D, A) ; \ + XOR(D, C) ; \ + OR(C, B) ; \ + AND(C, A) ; \ + XOR(T, D) ; \ + XOR(C, T) ; \ + OR(T, A) ; \ + XOR(T, B) ; \ + AND(B, C) ; \ + XOR(B, D) ; \ + XOR(T, C) ; \ + AND(D, T) ; \ + XOR(T, B) ; \ + XOR(D, T) ; \ + NOT(T) ; \ + XOR(D, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, T) ; \ + ASSIGN(T, D) ; \ + ASSIGN(D, C) ; \ + ASSIGN(C, T) ; + +#define SBOX_D7(A, B, C, D, T) \ + XOR(A, C) ; \ + ASSIGN(T, C) ; \ + AND(C, A) ; \ + XOR(T, D) ; \ + NOT(C) ; \ + XOR(D, B) ; \ + XOR(C, D) ; \ + OR(T, A) ; \ + XOR(A, C) ; \ + XOR(D, T) ; \ + XOR(T, B) ; \ + AND(B, D) ; \ + XOR(B, A) ; \ + XOR(A, D) ; \ + OR(A, C) ; \ + XOR(D, B) ; \ + XOR(T, A) ; \ + ASSIGN(A, B) ; \ + ASSIGN(B, C) ; \ + ASSIGN(C, T) ; + +#define SBOX_D8(A, B, C, D, T) \ + ASSIGN(T, C) ; \ + XOR(C, A) ; \ + AND(A, D) ; \ + OR(T, D) ; \ + NOT(C) ; \ + XOR(D, B) ; \ + OR(B, A) ; \ + XOR(A, C) ; \ + AND(C, T) ; \ + AND(D, T) ; \ + XOR(B, C) ; \ + XOR(C, A) ; \ + OR(A, C) ; \ + XOR(T, B) ; \ + XOR(A, D) ; \ + XOR(D, T) ; \ + OR(T, A) ; \ + XOR(D, C) ; \ + XOR(T, C) ; \ + ASSIGN(C, B) ; \ + ASSIGN(B, A) ; \ + ASSIGN(A, D) ; \ + ASSIGN(D, T) ; + +#define TRANSFORM(A, B, C, D, T) \ + ROTL_IMM(A, 13) ; \ + ROTL_IMM(C, 3) ; \ + SHL2_3(T, A) ; \ + XOR(B, A) ; \ + XOR(D, C) ; \ + XOR(B, C) ; \ + XOR(D, T) ; \ + ROTL_IMM(B, 1) ; \ + ROTL_IMM(D, 7) ; \ + ASSIGN(T, B) ; \ + SHL_IMM(T, 7) ; \ + XOR(A, B) ; \ + XOR(C, D) ; \ + XOR(A, D) ; \ + XOR(C, T) ; \ + ROTL_IMM(A, 5) ; \ + ROTL_IMM(C, 22) ; + +#define I_TRANSFORM(A, B, C, D, T) \ + ROTR_IMM(C, 22) ; \ + ROTR_IMM(A, 5) ; \ + ASSIGN(T, B) ; \ + SHL_IMM(T, 7) ; \ + XOR(A, B) ; \ + XOR(C, D) ; \ + XOR(A, D) ; \ + XOR(C, T) ; \ + ROTR_IMM(D, 7) ; \ + ROTR_IMM(B, 1) ; \ + SHL2_3(T, A) ; \ + XOR(B, C) ; \ + XOR(D, C) ; \ + XOR(B, A) ; \ + XOR(D, T) ; \ + ROTR_IMM(C, 3) ; \ + ROTR_IMM(A, 13) ; + +#define KEY_XOR(A, B, C, D, N) \ + XOR(A, ARRAY4(EDI, (4*N ))) ; \ + XOR(B, ARRAY4(EDI, (4*N+1))) ; \ + XOR(C, ARRAY4(EDI, (4*N+2))) ; \ + XOR(D, ARRAY4(EDI, (4*N+3))) ; + +/************************************************* +* Serpent Encryption * +*************************************************/ +START_FUNCTION(serpent_encrypt) + SPILL_REGS() +#define PUSHED 4 + + ASSIGN(EBP, ARG(1)) /* input block */ + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + + ASSIGN(EDI, ARG(3)) /* round keys */ + ZEROIZE(EBP) + +#define E_ROUND(A, B, C, D, T, N, SBOX) \ + KEY_XOR(A, B, C, D, N) \ + SBOX(A, B, C, D, T) \ + TRANSFORM(A, B, C, D, T) + + + E_ROUND(EAX, EBX, ECX, 
EDX, EBP, 0, SBOX_E1) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_E2) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_E3) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_E4) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_E5) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_E6) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_E7) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_E8) + + E_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_E1) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_E2) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_E3) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_E4) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_E5) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_E6) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_E7) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_E8) + + E_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_E1) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_E2) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_E3) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_E4) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_E5) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_E6) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_E7) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_E8) + + E_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_E1) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_E2) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_E3) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_E4) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_E5) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_E6) + E_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_E7) + + KEY_XOR(EAX, EBX, ECX, EDX, 31) + SBOX_E8(EAX, EBX, ECX, EDX, EBP) + KEY_XOR(EAX, EBX, ECX, EDX, 32) + + ASSIGN(EBP, ARG(2)) /* output block */ + ASSIGN(ARRAY4(EBP, 0), EAX) + ASSIGN(ARRAY4(EBP, 1), EBX) + ASSIGN(ARRAY4(EBP, 2), ECX) + ASSIGN(ARRAY4(EBP, 3), EDX) + + RESTORE_REGS() +#undef PUSHED +END_FUNCTION(serpent_encrypt) + +/************************************************* +* Serpent Decryption * +*************************************************/ +START_FUNCTION(serpent_decrypt) + SPILL_REGS() +#define PUSHED 4 + + ASSIGN(EBP, ARG(1)) /* input block */ + ASSIGN(EAX, ARRAY4(EBP, 0)) + ASSIGN(EBX, ARRAY4(EBP, 1)) + ASSIGN(ECX, ARRAY4(EBP, 2)) + ASSIGN(EDX, ARRAY4(EBP, 3)) + + ASSIGN(EDI, ARG(3)) /* round keys */ + + ZEROIZE(EBP) + +#define D_ROUND(A, B, C, D, T, N, SBOX) \ + I_TRANSFORM(A, B, C, D, T) \ + SBOX(A, B, C, D, T) \ + KEY_XOR(A, B, C, D, N) \ + + KEY_XOR(EAX, EBX, ECX, EDX, 32) + SBOX_D8(EAX, EBX, ECX, EDX, EBP) + KEY_XOR(EAX, EBX, ECX, EDX, 31) + + D_ROUND(EAX, EBX, ECX, EDX, EBP, 30, SBOX_D7) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 29, SBOX_D6) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 28, SBOX_D5) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 27, SBOX_D4) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 26, SBOX_D3) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 25, SBOX_D2) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 24, SBOX_D1) + + D_ROUND(EAX, EBX, ECX, EDX, EBP, 23, SBOX_D8) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 22, SBOX_D7) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 21, SBOX_D6) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 20, SBOX_D5) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 19, SBOX_D4) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 18, SBOX_D3) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 17, SBOX_D2) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 16, SBOX_D1) + + D_ROUND(EAX, EBX, ECX, EDX, EBP, 15, SBOX_D8) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 14, SBOX_D7) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 13, SBOX_D6) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 12, SBOX_D5) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 11, SBOX_D4) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 10, SBOX_D3) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 9, SBOX_D2) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 8, SBOX_D1) + + 
D_ROUND(EAX, EBX, ECX, EDX, EBP, 7, SBOX_D8) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 6, SBOX_D7) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 5, SBOX_D6) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 4, SBOX_D5) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 3, SBOX_D4) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 2, SBOX_D3) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 1, SBOX_D2) + D_ROUND(EAX, EBX, ECX, EDX, EBP, 0, SBOX_D1) + + ASSIGN(EBP, ARG(2)) /* output block */ + ASSIGN(ARRAY4(EBP, 0), EAX) + ASSIGN(ARRAY4(EBP, 1), EBX) + ASSIGN(ARRAY4(EBP, 2), ECX) + ASSIGN(ARRAY4(EBP, 3), EDX) + + RESTORE_REGS() +#undef PUSHED +END_FUNCTION(serpent_decrypt) + +/************************************************* +* Serpent Key Schedule * +*************************************************/ +START_FUNCTION(serpent_key_schedule) + SPILL_REGS() +#define PUSHED 4 + + ASSIGN(EDI, ARG(1)) /* round keys */ + ASSIGN(ESI, IMM(8)) + ADD_IMM(EDI, 32) + +START_LOOP(.EXPANSION) + ASSIGN(EAX, ARRAY4(EDI, -1)) + ASSIGN(EBX, ARRAY4(EDI, -3)) + ASSIGN(ECX, ARRAY4(EDI, -5)) + ASSIGN(EDX, ARRAY4(EDI, -8)) + + ASSIGN(EBP, ESI) + SUB_IMM(EBP, 8) + XOR(EBP, IMM(0x9E3779B9)) + XOR(EAX, EBX) + XOR(ECX, EDX) + XOR(EAX, EBP) + XOR(EAX, ECX) + + ROTL_IMM(EAX, 11) + + ASSIGN(ARRAY4(EDI, 0), EAX) + + ADD_IMM(ESI, 1) + ADD_IMM(EDI, 4) +LOOP_UNTIL_EQ(ESI, 140, .EXPANSION) + + ASSIGN(EDI, ARG(1)) /* round keys */ + +#define LOAD_AND_SBOX(MSG, SBOX) \ + ASSIGN(EAX, ARRAY4(EDI, (4*MSG+ 8))) ; \ + ASSIGN(EBX, ARRAY4(EDI, (4*MSG+ 9))) ; \ + ASSIGN(ECX, ARRAY4(EDI, (4*MSG+10))) ; \ + ASSIGN(EDX, ARRAY4(EDI, (4*MSG+11))) ; \ + SBOX(EAX, EBX, ECX, EDX, EBP) ; \ + ASSIGN(ARRAY4(EDI, (4*MSG+ 8)), EAX) ; \ + ASSIGN(ARRAY4(EDI, (4*MSG+ 9)), EBX) ; \ + ASSIGN(ARRAY4(EDI, (4*MSG+10)), ECX) ; \ + ASSIGN(ARRAY4(EDI, (4*MSG+11)), EDX) + + LOAD_AND_SBOX( 0, SBOX_E4) + LOAD_AND_SBOX( 1, SBOX_E3) + LOAD_AND_SBOX( 2, SBOX_E2) + LOAD_AND_SBOX( 3, SBOX_E1) + + LOAD_AND_SBOX( 4, SBOX_E8) + LOAD_AND_SBOX( 5, SBOX_E7) + LOAD_AND_SBOX( 6, SBOX_E6) + LOAD_AND_SBOX( 7, SBOX_E5) + LOAD_AND_SBOX( 8, SBOX_E4) + LOAD_AND_SBOX( 9, SBOX_E3) + LOAD_AND_SBOX(10, SBOX_E2) + LOAD_AND_SBOX(11, SBOX_E1) + + LOAD_AND_SBOX(12, SBOX_E8) + LOAD_AND_SBOX(13, SBOX_E7) + LOAD_AND_SBOX(14, SBOX_E6) + LOAD_AND_SBOX(15, SBOX_E5) + LOAD_AND_SBOX(16, SBOX_E4) + LOAD_AND_SBOX(17, SBOX_E3) + LOAD_AND_SBOX(18, SBOX_E2) + LOAD_AND_SBOX(19, SBOX_E1) + + LOAD_AND_SBOX(20, SBOX_E8) + LOAD_AND_SBOX(21, SBOX_E7) + LOAD_AND_SBOX(22, SBOX_E6) + LOAD_AND_SBOX(23, SBOX_E5) + LOAD_AND_SBOX(24, SBOX_E4) + LOAD_AND_SBOX(25, SBOX_E3) + LOAD_AND_SBOX(26, SBOX_E2) + LOAD_AND_SBOX(27, SBOX_E1) + + LOAD_AND_SBOX(28, SBOX_E8) + LOAD_AND_SBOX(29, SBOX_E7) + LOAD_AND_SBOX(30, SBOX_E6) + LOAD_AND_SBOX(31, SBOX_E5) + LOAD_AND_SBOX(32, SBOX_E4) + + RESTORE_REGS() +#undef PUSHED +END_FUNCTION(serpent_key_schedule) diff --git a/src/asm/asm_ia32/serpent.cpp b/src/asm/asm_ia32/serpent.cpp new file mode 100644 index 000000000..aacb72b0f --- /dev/null +++ b/src/asm/asm_ia32/serpent.cpp @@ -0,0 +1,49 @@ +/************************************************* +* Serpent Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/serpent.h> +#include <botan/loadstor.h> + +namespace Botan { + +extern "C" { + +void serpent_encrypt(const byte[16], byte[16], const u32bit[132]); +void serpent_decrypt(const byte[16], byte[16], const u32bit[132]); +void serpent_key_schedule(u32bit[140]); + +} + +/************************************************* +* Serpent Encryption * +*************************************************/ +void 
Serpent::enc(const byte in[], byte out[]) const + { + serpent_encrypt(in, out, round_key); + } + +/************************************************* +* Serpent Decryption * +*************************************************/ +void Serpent::dec(const byte in[], byte out[]) const + { + serpent_decrypt(in, out, round_key); + } + +/************************************************* +* Serpent Key Schedule * +*************************************************/ +void Serpent::key(const byte key[], u32bit length) + { + SecureBuffer<u32bit, 140> W; + for(u32bit j = 0; j != length / 4; ++j) + W[j] = make_u32bit(key[4*j+3], key[4*j+2], key[4*j+1], key[4*j]); + W[length / 4] |= u32bit(1) << ((length%4)*8); + + serpent_key_schedule(W); + round_key.copy(W + 8, 132); + } + +} diff --git a/src/asm/asm_ia32/sha160.cpp b/src/asm/asm_ia32/sha160.cpp new file mode 100644 index 000000000..7725541d5 --- /dev/null +++ b/src/asm/asm_ia32/sha160.cpp @@ -0,0 +1,52 @@ +/************************************************* +* SHA-160 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/sha160.h> +#include <botan/loadstor.h> + +namespace Botan { + +extern "C" void botan_sha160_asm_ia32(u32bit[5], const byte[64], u32bit[81]); + +/************************************************* +* SHA-160 Compression Function * +*************************************************/ +void SHA_160::hash(const byte input[]) + { + botan_sha160_asm_ia32(digest, input, W); + } + +/************************************************* +* Copy out the digest * +*************************************************/ +void SHA_160::copy_out(byte output[]) + { + for(u32bit j = 0; j != OUTPUT_LENGTH; ++j) + output[j] = get_byte(j % 4, digest[j/4]); + } + +/************************************************* +* Clear memory of sensitive data * +*************************************************/ +void SHA_160::clear() throw() + { + MDx_HashFunction::clear(); + W.clear(); + digest[0] = 0x67452301; + digest[1] = 0xEFCDAB89; + digest[2] = 0x98BADCFE; + digest[3] = 0x10325476; + digest[4] = 0xC3D2E1F0; + } + +/************************************************* +* SHA_160 Constructor * +*************************************************/ +SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true), W(81) + { + clear(); + } + +} diff --git a/src/asm/asm_ia32/sha1_asm.S b/src/asm/asm_ia32/sha1_asm.S new file mode 100644 index 000000000..85bc9dc2c --- /dev/null +++ b/src/asm/asm_ia32/sha1_asm.S @@ -0,0 +1,242 @@ +/************************************************* +* SHA-160 Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include <botan/asm_macr.h> + +START_LISTING(sha1_asm.S) + +START_FUNCTION(botan_sha160_asm_ia32) + SPILL_REGS() + +#define PUSHED 4 + + ASSIGN(EDI, ARG(2)) + ASSIGN(EBP, ARG(3)) + + ZEROIZE(ESI) + +START_LOOP(.LOAD_INPUT) + ADD_IMM(ESI, 4) + + ASSIGN(EAX, ARRAY4(EDI, 0)) + ASSIGN(EBX, ARRAY4(EDI, 1)) + ASSIGN(ECX, ARRAY4(EDI, 2)) + ASSIGN(EDX, ARRAY4(EDI, 3)) + + ADD_IMM(EDI, 16) + + BSWAP(EAX) + BSWAP(EBX) + BSWAP(ECX) + BSWAP(EDX) + + ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-4), EAX) + ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-3), EBX) + ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-2), ECX) + ASSIGN(ARRAY4_INDIRECT(EBP,ESI,-1), EDX) +LOOP_UNTIL_EQ(ESI, 16, .LOAD_INPUT) + + ADD2_IMM(EDI, EBP, 64) + +START_LOOP(.EXPANSION) + ADD_IMM(ESI, 4) + + ZEROIZE(EAX) + ASSIGN(EBX, ARRAY4(EDI, -1)) + ASSIGN(ECX, ARRAY4(EDI, -2)) + ASSIGN(EDX, ARRAY4(EDI, -3)) + + XOR(EAX, ARRAY4(EDI, -5)) + 
XOR(EBX, ARRAY4(EDI, -6)) + XOR(ECX, ARRAY4(EDI, -7)) + XOR(EDX, ARRAY4(EDI, -8)) + + XOR(EAX, ARRAY4(EDI, -11)) + XOR(EBX, ARRAY4(EDI, -12)) + XOR(ECX, ARRAY4(EDI, -13)) + XOR(EDX, ARRAY4(EDI, -14)) + + XOR(EAX, ARRAY4(EDI, -13)) + XOR(EBX, ARRAY4(EDI, -14)) + XOR(ECX, ARRAY4(EDI, -15)) + XOR(EDX, ARRAY4(EDI, -16)) + + ROTL_IMM(EDX, 1) + ROTL_IMM(ECX, 1) + ROTL_IMM(EBX, 1) + XOR(EAX, EDX) + ROTL_IMM(EAX, 1) + + ASSIGN(ARRAY4(EDI, 0), EDX) + ASSIGN(ARRAY4(EDI, 1), ECX) + ASSIGN(ARRAY4(EDI, 2), EBX) + ASSIGN(ARRAY4(EDI, 3), EAX) + + ADD_IMM(EDI, 16) +LOOP_UNTIL_EQ(ESI, 80, .EXPANSION) + +#define MAGIC1 0x5A827999 +#define MAGIC2 0x6ED9EBA1 +#define MAGIC3 0x8F1BBCDC +#define MAGIC4 0xCA62C1D6 + +#define MSG ESP +#define T2 EBP + +#define F1(A, B, C, D, E, F, N) \ + ASSIGN(T2, ARRAY4(MSG, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, C) ; \ + XOR(E, D) ; \ + ADD3_IMM(F, T2, MAGIC1) ; \ + AND(E, B) ; \ + XOR(E, D) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F2_4(A, B, C, D, E, F, N, MAGIC) \ + ASSIGN(T2, ARRAY4(MSG, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, B) ; \ + XOR(E, C) ; \ + ADD3_IMM(F, T2, MAGIC) ; \ + XOR(E, D) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F3(A, B, C, D, E, F, N) \ + ASSIGN(T2, ARRAY4(MSG, N)) ; \ + ASSIGN(A, F) ; \ + ROTL_IMM(F, 5) ; \ + ADD(F, E) ; \ + ASSIGN(E, B) ; \ + OR(E, C) ; \ + AND(E, D) ; \ + ADD3_IMM(F, T2, MAGIC3) ; \ + ASSIGN(T2, B) ; \ + AND(T2, C) ; \ + OR(E, T2) ; \ + ROTR_IMM(B, 2) ; \ + ADD(E, F) ; + +#define F2(A, B, C, D, E, F, MSG) \ + F2_4(A, B, C, D, E, F, MSG, MAGIC2) + +#define F4(A, B, C, D, E, F, MSG) \ + F2_4(A, B, C, D, E, F, MSG, MAGIC4) + + ASSIGN(EAX, ARG(1)) + ASSIGN(EDI, ARRAY4(EAX, 0)) + ASSIGN(EBX, ARRAY4(EAX, 1)) + ASSIGN(ECX, ARRAY4(EAX, 2)) + ASSIGN(EDX, ARRAY4(EAX, 3)) + ASSIGN(ESI, ARRAY4(EAX, 4)) + + ASSIGN(ARRAY4(EBP, 80), ESP) + ASSIGN(ESP, EBP) + + /* First Round */ + F1(EAX, EBX, ECX, EDX, ESI, EDI, 0) + F1(EDI, EAX, EBX, ECX, EDX, ESI, 1) + F1(ESI, EDI, EAX, EBX, ECX, EDX, 2) + F1(EDX, ESI, EDI, EAX, EBX, ECX, 3) + F1(ECX, EDX, ESI, EDI, EAX, EBX, 4) + F1(EBX, ECX, EDX, ESI, EDI, EAX, 5) + F1(EAX, EBX, ECX, EDX, ESI, EDI, 6) + F1(EDI, EAX, EBX, ECX, EDX, ESI, 7) + F1(ESI, EDI, EAX, EBX, ECX, EDX, 8) + F1(EDX, ESI, EDI, EAX, EBX, ECX, 9) + F1(ECX, EDX, ESI, EDI, EAX, EBX, 10) + F1(EBX, ECX, EDX, ESI, EDI, EAX, 11) + F1(EAX, EBX, ECX, EDX, ESI, EDI, 12) + F1(EDI, EAX, EBX, ECX, EDX, ESI, 13) + F1(ESI, EDI, EAX, EBX, ECX, EDX, 14) + F1(EDX, ESI, EDI, EAX, EBX, ECX, 15) + F1(ECX, EDX, ESI, EDI, EAX, EBX, 16) + F1(EBX, ECX, EDX, ESI, EDI, EAX, 17) + F1(EAX, EBX, ECX, EDX, ESI, EDI, 18) + F1(EDI, EAX, EBX, ECX, EDX, ESI, 19) + + /* Second Round */ + F2(ESI, EDI, EAX, EBX, ECX, EDX, 20) + F2(EDX, ESI, EDI, EAX, EBX, ECX, 21) + F2(ECX, EDX, ESI, EDI, EAX, EBX, 22) + F2(EBX, ECX, EDX, ESI, EDI, EAX, 23) + F2(EAX, EBX, ECX, EDX, ESI, EDI, 24) + F2(EDI, EAX, EBX, ECX, EDX, ESI, 25) + F2(ESI, EDI, EAX, EBX, ECX, EDX, 26) + F2(EDX, ESI, EDI, EAX, EBX, ECX, 27) + F2(ECX, EDX, ESI, EDI, EAX, EBX, 28) + F2(EBX, ECX, EDX, ESI, EDI, EAX, 29) + F2(EAX, EBX, ECX, EDX, ESI, EDI, 30) + F2(EDI, EAX, EBX, ECX, EDX, ESI, 31) + F2(ESI, EDI, EAX, EBX, ECX, EDX, 32) + F2(EDX, ESI, EDI, EAX, EBX, ECX, 33) + F2(ECX, EDX, ESI, EDI, EAX, EBX, 34) + F2(EBX, ECX, EDX, ESI, EDI, EAX, 35) + F2(EAX, EBX, ECX, EDX, ESI, EDI, 36) + F2(EDI, EAX, EBX, ECX, EDX, ESI, 37) + F2(ESI, EDI, EAX, EBX, ECX, EDX, 38) + F2(EDX, ESI, EDI, EAX, EBX, ECX, 39) + + /* Third Round */ + F3(ECX, EDX, ESI, 
EDI, EAX, EBX, 40) + F3(EBX, ECX, EDX, ESI, EDI, EAX, 41) + F3(EAX, EBX, ECX, EDX, ESI, EDI, 42) + F3(EDI, EAX, EBX, ECX, EDX, ESI, 43) + F3(ESI, EDI, EAX, EBX, ECX, EDX, 44) + F3(EDX, ESI, EDI, EAX, EBX, ECX, 45) + F3(ECX, EDX, ESI, EDI, EAX, EBX, 46) + F3(EBX, ECX, EDX, ESI, EDI, EAX, 47) + F3(EAX, EBX, ECX, EDX, ESI, EDI, 48) + F3(EDI, EAX, EBX, ECX, EDX, ESI, 49) + F3(ESI, EDI, EAX, EBX, ECX, EDX, 50) + F3(EDX, ESI, EDI, EAX, EBX, ECX, 51) + F3(ECX, EDX, ESI, EDI, EAX, EBX, 52) + F3(EBX, ECX, EDX, ESI, EDI, EAX, 53) + F3(EAX, EBX, ECX, EDX, ESI, EDI, 54) + F3(EDI, EAX, EBX, ECX, EDX, ESI, 55) + F3(ESI, EDI, EAX, EBX, ECX, EDX, 56) + F3(EDX, ESI, EDI, EAX, EBX, ECX, 57) + F3(ECX, EDX, ESI, EDI, EAX, EBX, 58) + F3(EBX, ECX, EDX, ESI, EDI, EAX, 59) + + /* Fourth Round */ + F4(EAX, EBX, ECX, EDX, ESI, EDI, 60) + F4(EDI, EAX, EBX, ECX, EDX, ESI, 61) + F4(ESI, EDI, EAX, EBX, ECX, EDX, 62) + F4(EDX, ESI, EDI, EAX, EBX, ECX, 63) + F4(ECX, EDX, ESI, EDI, EAX, EBX, 64) + F4(EBX, ECX, EDX, ESI, EDI, EAX, 65) + F4(EAX, EBX, ECX, EDX, ESI, EDI, 66) + F4(EDI, EAX, EBX, ECX, EDX, ESI, 67) + F4(ESI, EDI, EAX, EBX, ECX, EDX, 68) + F4(EDX, ESI, EDI, EAX, EBX, ECX, 69) + F4(ECX, EDX, ESI, EDI, EAX, EBX, 70) + F4(EBX, ECX, EDX, ESI, EDI, EAX, 71) + F4(EAX, EBX, ECX, EDX, ESI, EDI, 72) + F4(EDI, EAX, EBX, ECX, EDX, ESI, 73) + F4(ESI, EDI, EAX, EBX, ECX, EDX, 74) + F4(EDX, ESI, EDI, EAX, EBX, ECX, 75) + F4(ECX, EDX, ESI, EDI, EAX, EBX, 76) + F4(EBX, ECX, EDX, ESI, EDI, EAX, 77) + F4(EAX, EBX, ECX, EDX, ESI, EDI, 78) + F4(EDI, EAX, EBX, ECX, EDX, ESI, 79) + + ASSIGN(ESP, ARRAY4(ESP, 80)) + + ASSIGN(EBP, ARG(1)) + ADD(ARRAY4(EBP, 0), EDX) + ADD(ARRAY4(EBP, 1), EDI) + ADD(ARRAY4(EBP, 2), EAX) + ADD(ARRAY4(EBP, 3), EBX) + ADD(ARRAY4(EBP, 4), ECX) + + RESTORE_REGS() +END_FUNCTION(botan_sha160_asm_ia32) diff --git a/src/asm/mp_amd64/bswap.h b/src/asm/mp_amd64/bswap.h new file mode 100644 index 000000000..3c77b460c --- /dev/null +++ b/src/asm/mp_amd64/bswap.h @@ -0,0 +1,36 @@ +/************************************************* +* Byte Swapping Operations Header File * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#ifndef BOTAN_BSWAP_H__ +#define BOTAN_BSWAP_H__ + +#include <botan/types.h> +#include <botan/rotate.h> + +namespace Botan { + +/************************************************* +* Byte Swapping Functions * +*************************************************/ +inline u16bit reverse_bytes(u16bit input) + { + return rotate_left(input, 8); + } + +inline u32bit reverse_bytes(u32bit input) + { + asm("bswapl %0" : "=r" (input) : "0" (input)); + return input; + } + +inline u64bit reverse_bytes(u64bit input) + { + asm("bswapq %0" : "=r" (input) : "0" (input)); + return input; + } + +} + +#endif diff --git a/src/asm/mp_amd64/modinfo.txt b/src/asm/mp_amd64/modinfo.txt new file mode 100644 index 000000000..a042a3976 --- /dev/null +++ b/src/asm/mp_amd64/modinfo.txt @@ -0,0 +1,21 @@ +realname "x86-64 MPI Assembler Core" + +mp_bits 64 + +load_on asm_ok + +<replace> +bswap.h +mp_asm.h +mp_asmi.h +#mp_mulop.cpp +</replace> + +<arch> +amd64 +</arch> + +<cc> +gcc +icc +</cc> diff --git a/src/asm/mp_amd64/mp_asm.h b/src/asm/mp_amd64/mp_asm.h new file mode 100644 index 000000000..eca7bae6c --- /dev/null +++ b/src/asm/mp_amd64/mp_asm.h @@ -0,0 +1,67 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2008 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_H__ 
+#define BOTAN_MP_ASM_H__ + +#include <botan/mp_types.h> + +#if (BOTAN_MP_WORD_BITS != 64) + #error The mp_amd64 module requires that BOTAN_MP_WORD_BITS == 64 +#endif + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for amd64 Assembly * +*************************************************/ +#define ASM(x) x "\n\t" + +/************************************************* +* Word Multiply * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + asm( + ASM("mulq %[b]") + ASM("addq %[c],%[a]") + ASM("adcq $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c) + : "0"(a), "1"(b), [c]"g"(*c) : "cc"); + + return a; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + asm( + ASM("mulq %[b]") + + ASM("addq %[c],%[a]") + ASM("adcq $0,%[carry]") + + ASM("addq %[d],%[a]") + ASM("adcq $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d) + : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc"); + + return a; + } + +#undef ASM + +} + +} + +#endif diff --git a/src/asm/mp_amd64/mp_asmi.h b/src/asm/mp_amd64/mp_asmi.h new file mode 100644 index 000000000..16632a38d --- /dev/null +++ b/src/asm/mp_amd64/mp_asmi.h @@ -0,0 +1,233 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2007 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for amd64 Assembly * +*************************************************/ +#ifndef ASM + #define ASM(x) x "\n\t" +#endif + +#define ADDSUB2_OP(OPERATION, INDEX) \ + ASM("movq 8*" #INDEX "(%[y]), %[carry]") \ + ASM(OPERATION " %[carry], 8*" #INDEX "(%[x])") \ + +#define ADDSUB3_OP(OPERATION, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]), %[carry]") \ + ASM(OPERATION " 8*" #INDEX "(%[y]), %[carry]") \ + ASM("movq %[carry], 8*" #INDEX "(%[z])") \ + +#define LINMUL_OP(WRITE_TO, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]),%%rax") \ + ASM("mulq %[y]") \ + ASM("addq %[carry],%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("movq %%rdx,%[carry]") \ + ASM("movq %%rax, 8*" #INDEX "(%[" WRITE_TO "])") + +#define MULADD_OP(IGNORED, INDEX) \ + ASM("movq 8*" #INDEX "(%[x]),%%rax") \ + ASM("mulq %[y]") \ + ASM("addq %[carry],%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("addq 8*" #INDEX "(%[z]),%%rax") \ + ASM("adcq $0,%%rdx") \ + ASM("movq %%rdx,%[carry]") \ + ASM("movq %%rax, 8*" #INDEX " (%[z])") + +#define DO_8_TIMES(MACRO, ARG) \ + MACRO(ARG, 0) \ + MACRO(ARG, 1) \ + MACRO(ARG, 2) \ + MACRO(ARG, 3) \ + MACRO(ARG, 4) \ + MACRO(ARG, 5) \ + MACRO(ARG, 6) \ + MACRO(ARG, 7) + +#define ADD_OR_SUBTRACT(CORE_CODE) \ + ASM("rorq %[carry]") \ + CORE_CODE \ + ASM("sbbq %[carry],%[carry]") \ + ASM("negq %[carry]") + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("adcq %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Addition, Two Argument * 
+*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("sbbq %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Subtraction, Three Argument * +*************************************************/ +inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbq")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "x") + : [carry]"=r"(carry) + : [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "z") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(MULADD_OP, "") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%rax", "%rdx"); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mulq %[y]") + + ASM("addq %[x],%[w0]") + ASM("adcq %[y],%[w1]") + ASM("adcq $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd_2(word* w2, word* w1, word* w0, word x, 
word y)
+   {
+   asm(
+      ASM("mulq %[y]")
+
+      ASM("addq %[x],%[w0]")
+      ASM("adcq %[y],%[w1]")
+      ASM("adcq $0,%[w2]")
+
+      ASM("addq %[x],%[w0]")
+      ASM("adcq %[y],%[w1]")
+      ASM("adcq $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+      : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+#undef ASM
+#undef DO_8_TIMES
+#undef ADD_OR_SUBTRACT
+#undef ADDSUB2_OP
+#undef ADDSUB3_OP
+#undef LINMUL_OP
+#undef MULADD_OP
+
+}
+
+}
+#endif
diff --git a/src/asm/mp_amd64/mp_mulop.cpp b/src/asm/mp_amd64/mp_mulop.cpp
new file mode 100644
index 000000000..d1aa51489
--- /dev/null
+++ b/src/asm/mp_amd64/mp_mulop.cpp
@@ -0,0 +1,94 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring      *
+* (C) 1999-2008 Jack Lloyd                       *
+*************************************************/
+
+#include <botan/mp_asm.h>
+#include <botan/mp_asmi.h>
+#include <botan/mp_core.h>
+#include <botan/mem_ops.h>
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Simple O(N^2) Multiplication                   *
+*************************************************/
+void bigint_simple_mul(word z[], const word x[], u32bit x_size,
+                       const word y[], u32bit y_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, x_size + y_size);
+
+   for(u32bit i = 0; i != y_size; ++i)
+      {
+      word carry = 0;
+
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, y[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+/*************************************************
+* Simple O(N^2) Squaring                         *
+*
+* This is exactly the same algorithm as
+* bigint_simple_mul, but since C/C++ compilers
+* are bad at alias analysis, it helps to have a
+* version where the compiler knows that x == y.
+*
+* A further speedup is possible because each
+* cross term x[i]*x[j] (i != j) appears twice:
+* z[i+j] += 2*x[i]*x[j] would halve the multiply
+* count, but keeping the carry chain intact then
+* requires assembly, and is not done yet.
+*************************************************/
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      word carry = 0;
+
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+}
+
+}
diff --git a/src/asm/mp_asm64/modinfo.txt b/src/asm/mp_asm64/modinfo.txt
new file mode 100644
index 000000000..a9e5d53da
--- /dev/null
+++ b/src/asm/mp_asm64/modinfo.txt
@@ -0,0 +1,25 @@
+realname "64-bit RISC MPI Assembler Core"
+
+mp_bits 64
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+</replace>
+
+<arch>
+alpha
+ia64
+mips64
+ppc64
+sparc64
+</arch>
+
+# The inline asm only works with gcc, but it looks like (at least on
+# UltraSPARC) using 64-bit words and the synthesized multiply is a 5 to 25%
+# win, so it's probably worth using elsewhere.
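+#
+# For reference: the synthesized multiply splits each 64-bit word into
+# 32-bit halves, a = a1*2^32 + a0 and b = b1*2^32 + b0, and forms
+# a*b = a1*b1*2^64 + (a1*b0 + a0*b1)*2^32 + a0*b0 from four native
+# 64-bit multiplies; see bigint_2word_mul() in mp_asm.h.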
+<cc>
+gcc
+sunwspro
+</cc>
diff --git a/src/asm/mp_asm64/mp_asm.h b/src/asm/mp_asm64/mp_asm.h
new file mode 100644
index 000000000..e455b3616
--- /dev/null
+++ b/src/asm/mp_asm64/mp_asm.h
@@ -0,0 +1,110 @@
+/*************************************************
+* MPI Multiply-Add Core Header File              *
+* (C) 1999-2007 Jack Lloyd                       *
+*************************************************/
+
+#ifndef BOTAN_MP_MADD_H__
+#define BOTAN_MP_MADD_H__
+
+#include <botan/mp_types.h>
+
+namespace Botan {
+
+#if (BOTAN_MP_WORD_BITS != 64)
+   #error The mp_asm64 module requires that BOTAN_MP_WORD_BITS == 64
+#endif
+
+#if defined(BOTAN_TARGET_ARCH_IS_ALPHA)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do {                           \
+   asm("umulh %1,%2,%0" : "=r" (z0) : "r" (a), "r" (b));         \
+   z1 = a * b;                                                   \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_IA64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do {                           \
+   asm("xmpy.hu %0=%1,%2" : "=f" (z0) : "f" (a), "f" (b));       \
+   z1 = a * b;                                                   \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_PPC64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do {                           \
+   asm("mulhdu %0,%1,%2" : "=r" (z0) : "r" (a), "r" (b) : "cc"); \
+   z1 = a * b;                                                   \
+} while(0)
+
+#elif defined(BOTAN_TARGET_ARCH_IS_MIPS64)
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) do {                            \
+   asm("dmultu %2,%3" : "=h" (z0), "=l" (z1) : "r" (a), "r" (b)); \
+} while(0)
+
+#else
+
+// Do a 64x64->128 multiply using four 64x64->64 multiplies
+// plus some adds and shifts. Last resort for CPUs like
+// UltraSPARC, with 64-bit registers/ALU, but no 64x64->128 multiply.
+inline void bigint_2word_mul(word a, word b, word* z1, word* z0)
+   {
+   const u32bit MP_HWORD_BITS = MP_WORD_BITS / 2;
+   const word MP_HWORD_MASK = ((word)1 << MP_HWORD_BITS) - 1;
+
+   const word a_hi = (a >> MP_HWORD_BITS);
+   const word a_lo = (a & MP_HWORD_MASK);
+   const word b_hi = (b >> MP_HWORD_BITS);
+   const word b_lo = (b & MP_HWORD_MASK);
+
+   word x0 = a_hi * b_hi;
+   word x1 = a_lo * b_hi;
+   word x2 = a_hi * b_lo;
+   word x3 = a_lo * b_lo;
+
+   x2 += x3 >> (MP_HWORD_BITS);
+   x2 += x1;
+   if(x2 < x1)
+      x0 += ((word)1 << MP_HWORD_BITS);
+
+   *z0 = x0 + (x2 >> MP_HWORD_BITS);
+   *z1 = ((x2 & MP_HWORD_MASK) << MP_HWORD_BITS) + (x3 & MP_HWORD_MASK);
+   }
+
+#define BOTAN_WORD_MUL(a,b,z1,z0) bigint_2word_mul(a, b, &z1, &z0)
+
+#endif
+
+/*************************************************
+* Word Multiply/Add                              *
+*************************************************/
+inline word word_madd2(word a, word b, word* c)
+   {
+   word z0 = 0, z1 = 0;
+
+   BOTAN_WORD_MUL(a, b, z1, z0);
+
+   z1 += *c; if(z1 < *c) z0++;
+
+   *c = z0;
+   return z1;
+   }
+
+/*************************************************
+* Word Multiply/Add                              *
+*************************************************/
+inline word word_madd3(word a, word b, word c, word* d)
+   {
+   word z0 = 0, z1 = 0;
+
+   BOTAN_WORD_MUL(a, b, z1, z0);
+
+   z1 += c; if(z1 < c) z0++;
+   z1 += *d; if(z1 < *d) z0++;
+
+   *d = z0;
+   return z1;
+   }
+
+}
+
+#endif
diff --git a/src/asm/mp_ia32/modinfo.txt b/src/asm/mp_ia32/modinfo.txt
new file mode 100644
index 000000000..cf4959250
--- /dev/null
+++ b/src/asm/mp_ia32/modinfo.txt
@@ -0,0 +1,19 @@
+realname "x86 MPI Assembler Core"
+
+mp_bits 32
+
+load_on asm_ok
+
+<replace>
+mp_asm.h
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/src/asm/mp_ia32/mp_asm.h b/src/asm/mp_ia32/mp_asm.h
new file mode 100644
index 000000000..b45140321
--- /dev/null
+++ b/src/asm/mp_ia32/mp_asm.h
@@ -0,0 +1,65 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File        *
+* (C) 1999-2008 Jack Lloyd                       *
+*     2006
Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_H__ +#define BOTAN_MP_ASM_H__ + +#include <botan/mp_types.h> + +#if (BOTAN_MP_WORD_BITS != 32) + #error The mp_ia32 module requires that BOTAN_MP_WORD_BITS == 32 +#endif + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for x86 Assembly * +*************************************************/ +#define ASM(x) x "\n\t" + +/************************************************* +* Word Multiply * +*************************************************/ +inline word word_madd2(word a, word b, word* c) + { + asm( + ASM("mull %[b]") + ASM("addl %[c],%[a]") + ASM("adcl $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*c) + : "0"(a), "1"(b), [c]"g"(*c) : "cc"); + + return a; + } + +/************************************************* +* Word Multiply/Add * +*************************************************/ +inline word word_madd3(word a, word b, word c, word* d) + { + asm( + ASM("mull %[b]") + + ASM("addl %[c],%[a]") + ASM("adcl $0,%[carry]") + + ASM("addl %[d],%[a]") + ASM("adcl $0,%[carry]") + + : [a]"=a"(a), [b]"=rm"(b), [carry]"=&d"(*d) + : "0"(a), "1"(b), [c]"g"(c), [d]"g"(*d) : "cc"); + + return a; + } + +} + +} + +#endif diff --git a/src/asm/mp_ia32/mp_asmi.h b/src/asm/mp_ia32/mp_asmi.h new file mode 100644 index 000000000..9de0c11e3 --- /dev/null +++ b/src/asm/mp_ia32/mp_asmi.h @@ -0,0 +1,225 @@ +/************************************************* +* Lowest Level MPI Algorithms Header File * +* (C) 1999-2007 Jack Lloyd * +* 2006 Luca Piccarreta * +*************************************************/ + +#ifndef BOTAN_MP_ASM_INTERNAL_H__ +#define BOTAN_MP_ASM_INTERNAL_H__ + +#include <botan/mp_asm.h> + +namespace Botan { + +extern "C" { + +/************************************************* +* Helper Macros for x86 Assembly * +*************************************************/ +#ifndef ASM + #define ASM(x) x "\n\t" +#endif + +#define ADDSUB2_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[y]), %[carry]") \ + ASM(OPERATION " %[carry], 4*" #INDEX "(%[x])") \ + +#define ADDSUB3_OP(OPERATION, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]), %[carry]") \ + ASM(OPERATION " 4*" #INDEX "(%[y]), %[carry]") \ + ASM("movl %[carry], 4*" #INDEX "(%[z])") \ + +#define LINMUL_OP(WRITE_TO, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("movl %%edx,%[carry]") \ + ASM("movl %%eax, 4*" #INDEX "(%[" WRITE_TO "])") + +#define MULADD_OP(IGNORED, INDEX) \ + ASM("movl 4*" #INDEX "(%[x]),%%eax") \ + ASM("mull %[y]") \ + ASM("addl %[carry],%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("addl 4*" #INDEX "(%[z]),%%eax") \ + ASM("adcl $0,%%edx") \ + ASM("movl %%edx,%[carry]") \ + ASM("movl %%eax, 4*" #INDEX " (%[z])") + +#define DO_8_TIMES(MACRO, ARG) \ + MACRO(ARG, 0) \ + MACRO(ARG, 1) \ + MACRO(ARG, 2) \ + MACRO(ARG, 3) \ + MACRO(ARG, 4) \ + MACRO(ARG, 5) \ + MACRO(ARG, 6) \ + MACRO(ARG, 7) + +#define ADD_OR_SUBTRACT(CORE_CODE) \ + ASM("rorl %[carry]") \ + CORE_CODE \ + ASM("sbbl %[carry],%[carry]") \ + ASM("negl %[carry]") + +/************************************************* +* Word Addition * +*************************************************/ +inline word word_add(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("adcl %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* 
Eight Word Block Addition, Two Argument * +*************************************************/ +inline word word8_add2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "adcl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Addition, Three Argument * +*************************************************/ +inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "adcl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Word Subtraction * +*************************************************/ +inline word word_sub(word x, word y, word* carry) + { + asm( + ADD_OR_SUBTRACT(ASM("sbbl %[y],%[x]")) + : [x]"=r"(x), [carry]"=r"(*carry) + : "0"(x), [y]"rm"(y), "1"(*carry) + : "cc"); + return x; + } + +/************************************************* +* Eight Word Block Subtraction, Two Argument * +*************************************************/ +inline word word8_sub2(word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB2_OP, "sbbl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Subtraction, Three Argument * +*************************************************/ +inline word word8_sub3(word z[8], const word x[8], const word y[8], word carry) + { + asm( + ADD_OR_SUBTRACT(DO_8_TIMES(ADDSUB3_OP, "sbbl")) + : [carry]"=r"(carry) + : [x]"r"(x), [y]"r"(y), [z]"r"(z), "0"(carry) + : "cc", "memory"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul2(word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "x") + : [carry]"=r"(carry) + : [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Eight Word Block Linear Multiplication * +*************************************************/ +inline word word8_linmul3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(LINMUL_OP, "z") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Eight Word Block Multiply/Add * +*************************************************/ +inline word word8_madd3(word z[8], const word x[8], word y, word carry) + { + asm( + DO_8_TIMES(MULADD_OP, "") + : [carry]"=r"(carry) + : [z]"r"(z), [x]"r"(x), [y]"rm"(y), "0"(carry) + : "cc", "%eax", "%edx"); + return carry; + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y) + { + asm( + ASM("mull %[y]") + + ASM("addl %[x],%[w0]") + ASM("adcl %[y],%[w1]") + ASM("adcl $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + } + +/************************************************* +* Multiply-Add Accumulator * +*************************************************/ +inline void 
word3_muladd_2(word* w2, word* w1, word* w0, word x, word y)
+   {
+   asm(
+      ASM("mull %[y]")
+
+      ASM("addl %[x],%[w0]")
+      ASM("adcl %[y],%[w1]")
+      ASM("adcl $0,%[w2]")
+
+      ASM("addl %[x],%[w0]")
+      ASM("adcl %[y],%[w1]")
+      ASM("adcl $0,%[w2]")
+
+      : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+      : [x]"a"(x), [y]"d"(y), "0"(*w0), "1"(*w1), "2"(*w2)
+      : "cc");
+   }
+
+}
+
+}
+
+#endif
diff --git a/src/asm/mp_ia32_msvc/modinfo.txt b/src/asm/mp_ia32_msvc/modinfo.txt
new file mode 100644
index 000000000..36d9d0290
--- /dev/null
+++ b/src/asm/mp_ia32_msvc/modinfo.txt
@@ -0,0 +1,17 @@
+realname "x86 MPI Assembler Core (MSVC)"
+
+mp_bits 32
+
+#load_on asm_ok
+
+<replace>
+mp_asmi.h
+</replace>
+
+<arch>
+ia32
+</arch>
+
+<cc>
+msvc
+</cc>
diff --git a/src/asm/mp_ia32_msvc/mp_asmi.h b/src/asm/mp_ia32_msvc/mp_asmi.h
new file mode 100644
index 000000000..5eaa46eb4
--- /dev/null
+++ b/src/asm/mp_ia32_msvc/mp_asmi.h
@@ -0,0 +1,547 @@
+/*************************************************
+* Lowest Level MPI Algorithms Header File        *
+* (C) 1999-2006 Jack Lloyd                       *
+*     2006 Luca Piccarreta                       *
+*************************************************/
+
+#ifndef BOTAN_MP_ASM_INTERNAL_H__
+#define BOTAN_MP_ASM_INTERNAL_H__
+
+#include "mp_asm.h"
+
+namespace Botan {
+
+extern "C" {
+
+/*************************************************
+* Word Addition                                  *
+*************************************************/
+inline word word_add(word x, word y, word* carry)
+   {
+   word z = x + y;
+   word c1 = (z < x);
+   z += *carry;
+   *carry = c1 | (z < *carry);
+   return z;
+   }
+
+/*************************************************
+* Four Word Block Addition, Two Argument         *
+*************************************************/
+inline word word4_addcarry(word x[4], word carry)
+   {
+   __asm {
+      mov edx,[x]
+      xor eax,eax
+      sub eax,[carry]          //force CF=1 iff carry==1
+      adc dword ptr [edx],0    //memory adds need an explicit size
+      adc dword ptr [edx+4],0
+      adc dword ptr [edx+8],0
+      adc dword ptr [edx+12],0
+      sbb eax,eax              //materialize the final CF...
+      neg eax                  //...as a 0/1 return value in eax
+      }
+   }
+
+/*************************************************
+* Eight Word Block Addition, Two Argument        *
+*************************************************/
+inline word word8_add2(word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edx,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry]          //force CF=1 iff carry==1
+      mov eax,[esi]
+      adc [edx],eax
+      mov eax,[esi+4]
+      adc [edx+4],eax
+      mov eax,[esi+8]
+      adc [edx+8],eax
+      mov eax,[esi+12]
+      adc [edx+12],eax
+      mov eax,[esi+16]
+      adc [edx+16],eax
+      mov eax,[esi+20]
+      adc [edx+20],eax
+      mov eax,[esi+24]
+      adc [edx+24],eax
+      mov eax,[esi+28]
+      adc [edx+28],eax
+      sbb eax,eax
+      neg eax
+      }
+   }
+
+/*************************************************
+* Eight Word Block Addition, Three Argument      *
+*************************************************/
+inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      mov ebx,[z]
+      xor eax,eax
+      sub eax,[carry]          //force CF=1 iff carry==1
+      mov eax,[edi]
+      adc eax,[esi]
+      mov [ebx],eax
+
+      mov eax,[edi+4]
+      adc eax,[esi+4]
+      mov [ebx+4],eax
+
+      mov eax,[edi+8]
+      adc eax,[esi+8]
+      mov [ebx+8],eax
+
+      mov eax,[edi+12]
+      adc eax,[esi+12]
+      mov [ebx+12],eax
+
+      mov eax,[edi+16]
+      adc eax,[esi+16]
+      mov [ebx+16],eax
+
+      mov eax,[edi+20]
+      adc eax,[esi+20]
+      mov [ebx+20],eax
+
+      mov eax,[edi+24]
+      adc eax,[esi+24]
+      mov [ebx+24],eax
+
+      mov eax,[edi+28]
+      adc eax,[esi+28]
+      mov [ebx+28],eax
+
+      sbb eax,eax
+      neg eax
+      }
+   }
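+/* All of the __asm blocks in this file thread the carry through the
+   flags register the same way: "xor eax,eax / sub eax,[carry]" computes
+   0 - carry, which sets CF exactly when the incoming carry is 1, and
+   "sbb eax,eax / neg eax" at the end turns the final CF back into a
+   0-or-1 return value. Worked example for one word of word8_add2:
+   x[0] = 0xFFFFFFFF, y[0] = 0, carry = 1 gives adc -> x[0] = 0 with
+   CF = 1, so the carry keeps rippling into x[1]. */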
+/*************************************************
+* Word Subtraction                               *
+*************************************************/
+inline word word_sub(word x, word y, word* carry)
+   {
+   word t0 = x - y;
+   word c1 = (t0 > x);
+   word z = t0 - *carry;
+   *carry = c1 | (z > t0);
+   return z;
+   }
+
+/*************************************************
+* Eight Word Block Subtraction, Two Argument     *
+*************************************************/
+inline word word8_sub2(word x[8], const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry]          //force CF=1 iff carry==1
+      mov eax,[edi]
+      sbb eax,[esi]
+      mov [edi],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [edi+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [edi+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [edi+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [edi+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [edi+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [edi+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [edi+28],eax
+      sbb eax,eax
+      neg eax
+      }
+   }
+
+/*************************************************
+* Eight Word Block Subtraction, Three Argument   *
+*************************************************/
+__forceinline word word8_sub3(word z[8], const word x[8],
+                              const word y[8], word carry)
+   {
+   __asm {
+      mov edi,[x]
+      mov esi,[y]
+      xor eax,eax
+      sub eax,[carry]          //force CF=1 iff carry==1
+      mov ebx,[z]
+      mov eax,[edi]
+      sbb eax,[esi]
+      mov [ebx],eax
+      mov eax,[edi+4]
+      sbb eax,[esi+4]
+      mov [ebx+4],eax
+      mov eax,[edi+8]
+      sbb eax,[esi+8]
+      mov [ebx+8],eax
+      mov eax,[edi+12]
+      sbb eax,[esi+12]
+      mov [ebx+12],eax
+      mov eax,[edi+16]
+      sbb eax,[esi+16]
+      mov [ebx+16],eax
+      mov eax,[edi+20]
+      sbb eax,[esi+20]
+      mov [ebx+20],eax
+      mov eax,[edi+24]
+      sbb eax,[esi+24]
+      mov [ebx+24],eax
+      mov eax,[edi+28]
+      sbb eax,[esi+28]
+      mov [ebx+28],eax
+      sbb eax,eax
+      neg eax
+      }
+   }
+
+/*************************************************
+* Eight Word Block Linear Multiplication         *
+*************************************************/
+inline word word8_linmul2(word x[8], word y, word carry)
+   {
+   __asm
+      {
+      mov esi,[x]
+      mov eax,[esi]            //load x[0]
+      mul [y]                  //edx(hi):eax(lo)=x[0]*y
+      add eax,[carry]          //add incoming carry to lo
+      adc edx,0                //propagate into hi
+      mov ecx,edx              //carry for next block = hi
+      mov [esi],eax            //store lo back to x[0]
+
+      mov eax,[esi+4]          //same pattern for x[1]..x[7]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+4],eax
+
+      mov eax,[esi+8]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+8],eax
+
+      mov eax,[esi+12]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+12],eax
+
+      mov eax,[esi+16]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+16],eax
+
+      mov eax,[esi+20]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+20],eax
+
+      mov eax,[esi+24]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [esi+24],eax
+
+      mov eax,[esi+28]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov [esi+28],eax
+      mov eax,edx              //return carry
+      }
+   }
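+/* The multiply loops (word8_linmul2 above, word8_muladd below) depend
+   on a word multiply-accumulate never overflowing two words: with
+   B = 2^32, (B-1)*(B-1) + (B-1) + (B-1) = B^2 - 1, so x[i]*y + carry
+   + z[i] always fits exactly in edx:eax. That is why a plain
+   "adc edx,0" suffices to absorb each carry. */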
+/*************************************************
+* Eight Word Block Multiply/Add                  *
+*************************************************/
+__forceinline word word8_muladd(word z[8], const word x[8],
+                                word y, word carry)
+   {
+   __asm
+      {
+      mov esi,[x]
+      mov ebx,[y]
+      mov edi,[z]
+      mov eax,[esi]            //load x[0]
+      mul ebx                  //edx(hi):eax(lo)=x[0]*y
+      add eax,[carry]          //sum lo carry
+      adc edx,0                //sum hi carry
+      add eax,[edi]            //sum lo z
+      adc edx,0                //sum hi z
+      mov ecx,edx              //carry for next block = hi z
+      mov [edi],eax            //save lo z
+
+      mov eax,[esi+4]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+4]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+4],eax
+
+      mov eax,[esi+8]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+8]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+8],eax
+
+      mov eax,[esi+12]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+12]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+12],eax
+
+      mov eax,[esi+16]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+16]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+16],eax
+
+      mov eax,[esi+20]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+20]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+20],eax
+
+      mov eax,[esi+24]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+24]
+      adc edx,0
+      mov ecx,edx
+      mov [edi+24],eax
+
+      mov eax,[esi+28]
+      mul ebx
+      add eax,ecx
+      adc edx,0
+      add eax,[edi+28]
+      adc edx,0
+      mov [edi+28],eax
+      mov eax,edx              //return carry
+      }
+   }
+
+__forceinline word word8_linmul3(word z[8], const word x[8], word y, word carry)
+   {
+   __asm
+      {
+#if 0
+      //it's slower!!!
+      mov edx,[z]
+      mov eax,[x]
+      movd mm7,[y]
+
+      movd mm0,[eax]
+      movd mm1,[eax+4]
+      movd mm2,[eax+8]
+      pmuludq mm0,mm7
+      pmuludq mm1,mm7
+      pmuludq mm2,mm7
+
+      movd mm6,[carry]
+      paddq mm0,mm6
+      movd [edx],mm0
+
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+4],mm1
+
+      movd mm3,[eax+12]
+      psrlq mm1,32
+      paddq mm2,mm1
+      movd [edx+8],mm2
+
+      pmuludq mm3,mm7
+      movd mm4,[eax+16]
+      psrlq mm2,32
+      paddq mm3,mm2
+      movd [edx+12],mm3
+
+      pmuludq mm4,mm7
+      movd mm5,[eax+20]
+      psrlq mm3,32
+      paddq mm4,mm3
+      movd [edx+16],mm4
+
+      pmuludq mm5,mm7
+      movd mm0,[eax+24]
+      psrlq mm4,32
+      paddq mm5,mm4
+      movd [edx+20],mm5
+
+      pmuludq mm0,mm7
+      movd mm1,[eax+28]
+      psrlq mm5,32
+      paddq mm0,mm5
+      movd [edx+24],mm0
+
+      pmuludq mm1,mm7
+      psrlq mm0,32
+      paddq mm1,mm0
+      movd [edx+28],mm1
+
+      psrlq mm1,32
+      movd eax,mm1
+      emms
+#else
+      mov edi,[z]
+      mov esi,[x]
+      mov eax,[esi]            //load x[0]
+      mul [y]                  //edx(hi):eax(lo)=x[0]*y
+      add eax,[carry]          //add incoming carry to lo
+      adc edx,0                //propagate into hi
+      mov ecx,edx              //carry for next block = hi
+      mov [edi],eax            //store lo to z[0]
+
+      mov eax,[esi+4]          //same pattern for x[1]..x[7]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+4],eax
+
+      mov eax,[esi+8]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+8],eax
+
+      mov eax,[esi+12]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+12],eax
+
+      mov eax,[esi+16]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+16],eax
+
+      mov eax,[esi+20]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+20],eax
+
+      mov eax,[esi+24]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov ecx,edx
+      mov [edi+24],eax
+
+      mov eax,[esi+28]
+      mul [y]
+      add eax,ecx
+      adc edx,0
+      mov [edi+28],eax
+      mov eax,edx              //return carry
+#endif
+      }
+   }
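+/* The remaining helpers are plain C. word8_madd3 below just chains the
+   word_madd3 primitive; the word3_* functions maintain a three-word
+   accumulator (w2:w1:w0) += a*b (added twice in the _2 variant), the
+   column sums used by Comba-style multiplication. Example with
+   a = b = 0xFFFFFFFF: a*b = 0xFFFFFFFE00000001, so w0 gains 0x00000001,
+   w1 gains 0xFFFFFFFE, and any carry out of w1 bumps w2. */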
+/*************************************************
+* Eight Word Block Multiply/Add                  *
+*************************************************/
+inline word word8_madd3(word z[8], const word x[8], word y, word carry)
+   {
+   z[0] = word_madd3(x[0], y, z[0], &carry);
+   z[1] = word_madd3(x[1], y, z[1], &carry);
+   z[2] = word_madd3(x[2], y, z[2], &carry);
+   z[3] = word_madd3(x[3], y, z[3], &carry);
+   z[4] = word_madd3(x[4], y, z[4], &carry);
+   z[5] = word_madd3(x[5], y, z[5], &carry);
+   z[6] = word_madd3(x[6], y, z[6], &carry);
+   z[7] = word_madd3(x[7], y, z[7], &carry);
+   return carry;
+   }
+
+/*************************************************
+* Multiply-Add Accumulator                       *
+*************************************************/
+inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
+   {
+   dword z = (dword)a * b + (*w0);
+   *w0 = (word)z;                              //lo word
+
+   word t1 = (word)(z >> BOTAN_MP_WORD_BITS);  //hi word
+   *w1 += t1;                                  //w1 += hi
+   *w2 += (*w1 < t1) ? 1 : 0;                  //w2 += carry out of w1
+   }
+
+/*************************************************
+* Multiply-Add Accumulator                       *
+*************************************************/
+inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
+   {
+   dword z = (dword)a * b;
+   word t0 = (word)z;
+   word t1 = (word)(z >> BOTAN_MP_WORD_BITS);
+
+   *w0 += t0;
+   *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+   *w2 += (*w1 < t1) ? 1 : 0;
+
+   *w0 += t0;
+   *w1 += t1 + ((*w0 < t0) ? 1 : 0);
+   *w2 += (*w1 < t1) ? 1 : 0;
+   }
+
+}
+
+}
+
+#endif
diff --git a/src/asm/sha1_sse2/modinfo.txt b/src/asm/sha1_sse2/modinfo.txt
new file mode 100644
index 000000000..e1805260c
--- /dev/null
+++ b/src/asm/sha1_sse2/modinfo.txt
@@ -0,0 +1,22 @@
+realname "SSE2 implementation of SHA-1"
+
+load_on request
+
+<replace>
+sha160.cpp
+sha160.h
+</replace>
+
+<add>
+sha1core.cpp
+</add>
+
+<arch>
+pentium4
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
diff --git a/src/asm/sha1_sse2/sha160.cpp b/src/asm/sha1_sse2/sha160.cpp
new file mode 100644
index 000000000..dfb5fdfe5
--- /dev/null
+++ b/src/asm/sha1_sse2/sha160.cpp
@@ -0,0 +1,52 @@
+/*************************************************
+* SHA-160 Source File                            *
+* (C) 1999-2007 Jack Lloyd                       *
+*************************************************/
+
+#include <botan/sha160.h>
+#include <botan/loadstor.h>
+#include <botan/bit_ops.h>
+
+namespace Botan {
+
+extern "C" void botan_sha1_sse(u32bit[5], const byte[64]);
+
+/*************************************************
+* SHA-160 Compression Function                   *
+*************************************************/
+void SHA_160::hash(const byte input[])
+   {
+   botan_sha1_sse(digest, input);
+   }
+
+/*************************************************
+* Copy out the digest                            *
+*************************************************/
+void SHA_160::copy_out(byte output[])
+   {
+   for(u32bit j = 0; j != OUTPUT_LENGTH; ++j)
+      output[j] = get_byte(j % 4, digest[j/4]);
+   }
+
+/*************************************************
+* Clear memory of sensitive data                 *
+*************************************************/
+void SHA_160::clear() throw()
+   {
+   MDx_HashFunction::clear();
+   digest[0] = 0x67452301;
+   digest[1] = 0xEFCDAB89;
+   digest[2] = 0x98BADCFE;
+   digest[3] = 0x10325476;
+   digest[4] = 0xC3D2E1F0;
+   }
+
+/*************************************************
+* SHA_160 Constructor                            *
+*************************************************/
+SHA_160::SHA_160() : MDx_HashFunction(20, 64, true, true)
+   {
+   clear();
+   }
+
+}
diff --git a/src/asm/sha1_sse2/sha160.h b/src/asm/sha1_sse2/sha160.h
new file mode 100644
index 000000000..c6f8482cf
--- /dev/null
+++ b/src/asm/sha1_sse2/sha160.h
@@ -0,0 +1,32 @@
+/*************************************************
+* SHA-160 Header File                            *
+* (C) 1999-2007 The Botan Project                *
+*************************************************/
+
+#ifndef BOTAN_SHA_160_H__
+#define BOTAN_SHA_160_H__
+
+#include <botan/mdx_hash.h>
+
+namespace Botan {
+
+/*************************************************
+* SHA-160                                        *
+*************************************************/
+class SHA_160 : public MDx_HashFunction
+   {
+   public:
+      void clear() throw();
+      std::string name() const { return "SHA-160"; }
+      HashFunction* clone() const { return new SHA_160; }
+      SHA_160();
+   private:
+      void hash(const byte[]);
+      void copy_out(byte[]);
+
+      SecureBuffer<u32bit, 5> digest;
+   };
+
+}
+
+#endif
diff --git a/src/asm/sha1_sse2/sha1core.cpp b/src/asm/sha1_sse2/sha1core.cpp
new file mode 100644
index 000000000..23dbfc5e2
--- /dev/null
+++ b/src/asm/sha1_sse2/sha1core.cpp
@@ -0,0 +1,327 @@
+/* this code is public domain.
+ *
+ * dean gaudet <[email protected]>
+ *
+ * this code was inspired by this paper:
+ *
+ * SHA: A Design for Parallel Architectures?
+ * Antoon Bosselaers, René Govaerts and Joos Vandewalle
+ * <http://www.esat.kuleuven.ac.be/~cosicart/pdf/AB-9700.pdf>
+ *
+ * more information available on this implementation here:
+ *
+ * http://arctic.org/~dean/crypto/sha1.html
+ *
+ * version: 2
+ */
+
+/*
+ * Lightly modified for Botan, tested under GCC 4.1.1 and ICC 9.1
+ * on a Linux/Core2 system.
+ */
+
+#include <botan/types.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+typedef union {
+   u32bit u32[4];
+   __m128i u128;
+   } v4si __attribute__((aligned(16)));
+
+static const v4si K00_19 = { { 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 } };
+static const v4si K20_39 = { { 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 } };
+static const v4si K40_59 = { { 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc } };
+static const v4si K60_79 = { { 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 } };
+
+#define UNALIGNED 1
+#if UNALIGNED
+#define load(p) _mm_loadu_si128(p)
+#else
+#define load(p) (*p)
+#endif
+
+/*
+the first 16 words W[0..15] only need byte swapping
+
+prepared points to 4x u32bit, 16-byte aligned
+
+W points to the 4 dwords which need preparing --
+and is overwritten with the swapped bytes
+*/
+#define prep00_15(prep, W) do {                                  \
+    __m128i r1, r2;                                              \
+                                                                 \
+    r1 = (W);                                                    \
+    if (1) {                                                     \
+    r1 = _mm_shufflehi_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1));       \
+    r1 = _mm_shufflelo_epi16(r1, _MM_SHUFFLE(2, 3, 0, 1));       \
+    r2 = _mm_slli_epi16(r1, 8);                                  \
+    r1 = _mm_srli_epi16(r1, 8);                                  \
+    r1 = _mm_or_si128(r1, r2);                                   \
+    (W) = r1;                                                    \
+    }                                                            \
+    (prep).u128 = _mm_add_epi32(K00_19.u128, r1);                \
+  } while(0)
+
+/*
+for each multiple of 4, t, we want to calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol(W[t]   ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+
+we'll actually calculate this:
+
+W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
+W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
+W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
+W[t+3] = rol(   0   ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
+W[t+3] ^= rol(W[t+0], 1);
+
+the parameters are:
+
+W0 = &W[t-16];
+W1 = &W[t-12];
+W2 = &W[t- 8];
+W3 = &W[t- 4];
+
+and on output:
+prepared = W0 + K
+W0 = W[t]..W[t+3]
+*/
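+/*
+as a concrete instance of the recurrence above, the first vectorized
+iteration computes W[16..19]: per FIPS 180-2,
+
+W[16] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1);
+
+and W[19] depends on W[16], which is why the fourth lane starts from a
+zero placeholder and is patched with rol(W[t+0], 1) afterwards
+*/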
+/* note that there is a step here where i want to do a rol by 1, which
+* normally would look like this:
+*
+*   r1 = psrld r0,$31
+*   r0 = pslld r0,$1
+*   r0 = por r0,r1
+*
+* but instead i do this:
+*
+*   r1 = pcmpltd r0,zero
+*   r0 = paddd r0,r0
+*   r0 = psubd r0,r1
+*
+* because pcmpltd and paddd are available in both MMX units on
+* efficeon, pentium-m, and opteron but shifts are available in
+* only one unit.
+*/
+#define prep(prep, XW0, XW1, XW2, XW3, K) do {                   \
+    __m128i r0, r1, r2, r3;                                      \
+                                                                 \
+    /* load W[t-4] 16-byte aligned, and shift */                 \
+    r3 = _mm_srli_si128((XW3), 4);                               \
+    r0 = (XW0);                                                  \
+    /* get high 64-bits of XW0 into low 64-bits */               \
+    r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1,0,3,2));         \
+    /* load high 64-bits of r1 */                                \
+    r1 = _mm_unpacklo_epi64(r1, (XW1));                          \
+    r2 = (XW2);                                                  \
+                                                                 \
+    r0 = _mm_xor_si128(r1, r0);                                  \
+    r2 = _mm_xor_si128(r3, r2);                                  \
+    r0 = _mm_xor_si128(r2, r0);                                  \
+    /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */     \
+                                                                 \
+    r2 = _mm_slli_si128(r0, 12);                                 \
+    r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());               \
+    r0 = _mm_add_epi32(r0, r0);    /* shift left by 1 */         \
+    r0 = _mm_sub_epi32(r0, r1);    /* r0 has W[t]..W[t+2] */     \
+                                                                 \
+    r3 = _mm_srli_epi32(r2, 30);                                 \
+    r2 = _mm_slli_epi32(r2, 2);                                  \
+                                                                 \
+    r0 = _mm_xor_si128(r0, r3);                                  \
+    r0 = _mm_xor_si128(r0, r2);    /* r0 now has W[t+3] */       \
+                                                                 \
+    (XW0) = r0;                                                  \
+    (prep).u128 = _mm_add_epi32(r0, (K).u128);                   \
+  } while(0)
+
+static inline u32bit rol(u32bit src, u32bit amt)
+   {
+   /* gcc and icc appear to turn this into a rotate */
+   return (src << amt) | (src >> (32 - amt));
+   }
+
+static inline u32bit f00_19(u32bit x, u32bit y, u32bit z)
+   {
+   /* FIPS 180-2 says this: (x & y) ^ (~x & z)
+    * but we can calculate it in fewer steps.
+    */
+   return ((y ^ z) & x) ^ z;
+   }
+
+static inline u32bit f20_39(u32bit x, u32bit y, u32bit z)
+   {
+   return (x ^ z) ^ y;
+   }
+
+static inline u32bit f40_59(u32bit x, u32bit y, u32bit z)
+   {
+   /* FIPS 180-2 says this: (x & y) ^ (x & z) ^ (y & z)
+    * but we can calculate it in fewer steps.
+    */
+   return (x & z) | ((x | z) & y);
+   }
+
+static inline u32bit f60_79(u32bit x, u32bit y, u32bit z)
+   {
+   return f20_39(x, y, z);
+   }
+
+#define step(nn_mm, xa, xb, xc, xd, xe, xt, input) do {          \
+    (xt) = (input) + f##nn_mm((xb), (xc), (xd));                 \
+    (xb) = rol((xb), 30);                                        \
+    (xt) += ((xe) + rol((xa), 5));                               \
+  } while(0)
+
+extern "C" void botan_sha1_sse(u32bit* H,
+                               const u32bit* inputu)
+   {
+   const __m128i* input = (const __m128i*)inputu;
+   __m128i W0, W1, W2, W3;
+   v4si prep0, prep1, prep2;
+   u32bit a, b, c, d, e, t;
+
+   a = H[0];
+   b = H[1];
+   c = H[2];
+   d = H[3];
+   e = H[4];
+
+   /* i've tried arranging the SSE2 code to be 4, 8, 12, and 16
+    * steps ahead of the integer code. 12 steps ahead seems
+    * to produce the best performance.
-dean + */ + W0 = load(&input[0]); + prep00_15(prep0, W0); /* prepare for 00 through 03 */ + W1 = load(&input[1]); + prep00_15(prep1, W1); /* prepare for 04 through 07 */ + W2 = load(&input[2]); + prep00_15(prep2, W2); /* prepare for 08 through 11 */ + + W3 = load(&input[3]); + step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 00 */ + step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 01 */ + step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 02 */ + step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 03 */ + prep00_15(prep0, W3); + step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 04 */ + step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 05 */ + step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 06 */ + step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 07 */ + prep(prep1, W0, W1, W2, W3, K00_19); /* prepare for 16 through 19 */ + step(00_19, e, t, a, b, c, d, prep2.u32[0]); /* 08 */ + step(00_19, d, e, t, a, b, c, prep2.u32[1]); /* 09 */ + step(00_19, c, d, e, t, a, b, prep2.u32[2]); /* 10 */ + step(00_19, b, c, d, e, t, a, prep2.u32[3]); /* 11 */ + prep(prep2, W1, W2, W3, W0, K20_39); /* prepare for 20 through 23 */ + step(00_19, a, b, c, d, e, t, prep0.u32[0]); /* 12 */ + step(00_19, t, a, b, c, d, e, prep0.u32[1]); /* 13 */ + step(00_19, e, t, a, b, c, d, prep0.u32[2]); /* 14 */ + step(00_19, d, e, t, a, b, c, prep0.u32[3]); /* 15 */ + prep(prep0, W2, W3, W0, W1, K20_39); + step(00_19, c, d, e, t, a, b, prep1.u32[0]); /* 16 */ + step(00_19, b, c, d, e, t, a, prep1.u32[1]); /* 17 */ + step(00_19, a, b, c, d, e, t, prep1.u32[2]); /* 18 */ + step(00_19, t, a, b, c, d, e, prep1.u32[3]); /* 19 */ + + prep(prep1, W3, W0, W1, W2, K20_39); + step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 20 */ + step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 21 */ + step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 22 */ + step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 23 */ + prep(prep2, W0, W1, W2, W3, K20_39); + step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 24 */ + step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 25 */ + step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 26 */ + step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 27 */ + prep(prep0, W1, W2, W3, W0, K20_39); + step(20_39, c, d, e, t, a, b, prep1.u32[0]); /* 28 */ + step(20_39, b, c, d, e, t, a, prep1.u32[1]); /* 29 */ + step(20_39, a, b, c, d, e, t, prep1.u32[2]); /* 30 */ + step(20_39, t, a, b, c, d, e, prep1.u32[3]); /* 31 */ + prep(prep1, W2, W3, W0, W1, K40_59); + step(20_39, e, t, a, b, c, d, prep2.u32[0]); /* 32 */ + step(20_39, d, e, t, a, b, c, prep2.u32[1]); /* 33 */ + step(20_39, c, d, e, t, a, b, prep2.u32[2]); /* 34 */ + step(20_39, b, c, d, e, t, a, prep2.u32[3]); /* 35 */ + prep(prep2, W3, W0, W1, W2, K40_59); + step(20_39, a, b, c, d, e, t, prep0.u32[0]); /* 36 */ + step(20_39, t, a, b, c, d, e, prep0.u32[1]); /* 37 */ + step(20_39, e, t, a, b, c, d, prep0.u32[2]); /* 38 */ + step(20_39, d, e, t, a, b, c, prep0.u32[3]); /* 39 */ + + prep(prep0, W0, W1, W2, W3, K40_59); + step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 40 */ + step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 41 */ + step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 42 */ + step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 43 */ + prep(prep1, W1, W2, W3, W0, K40_59); + step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 44 */ + step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 45 */ + step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 46 */ + step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 47 */ + prep(prep2, W2, W3, W0, W1, K40_59); + step(40_59, a, b, c, d, e, t, prep0.u32[0]); /* 48 */ + 
step(40_59, t, a, b, c, d, e, prep0.u32[1]); /* 49 */
+   step(40_59, e, t, a, b, c, d, prep0.u32[2]); /* 50 */
+   step(40_59, d, e, t, a, b, c, prep0.u32[3]); /* 51 */
+   prep(prep0, W3, W0, W1, W2, K60_79);
+   step(40_59, c, d, e, t, a, b, prep1.u32[0]); /* 52 */
+   step(40_59, b, c, d, e, t, a, prep1.u32[1]); /* 53 */
+   step(40_59, a, b, c, d, e, t, prep1.u32[2]); /* 54 */
+   step(40_59, t, a, b, c, d, e, prep1.u32[3]); /* 55 */
+   prep(prep1, W0, W1, W2, W3, K60_79);
+   step(40_59, e, t, a, b, c, d, prep2.u32[0]); /* 56 */
+   step(40_59, d, e, t, a, b, c, prep2.u32[1]); /* 57 */
+   step(40_59, c, d, e, t, a, b, prep2.u32[2]); /* 58 */
+   step(40_59, b, c, d, e, t, a, prep2.u32[3]); /* 59 */
+
+   prep(prep2, W1, W2, W3, W0, K60_79);
+   step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 60 */
+   step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 61 */
+   step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 62 */
+   step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 63 */
+   prep(prep0, W2, W3, W0, W1, K60_79);
+   step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 64 */
+   step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 65 */
+   step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 66 */
+   step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 67 */
+   prep(prep1, W3, W0, W1, W2, K60_79);
+   step(60_79, e, t, a, b, c, d, prep2.u32[0]); /* 68 */
+   step(60_79, d, e, t, a, b, c, prep2.u32[1]); /* 69 */
+   step(60_79, c, d, e, t, a, b, prep2.u32[2]); /* 70 */
+   step(60_79, b, c, d, e, t, a, prep2.u32[3]); /* 71 */
+
+   step(60_79, a, b, c, d, e, t, prep0.u32[0]); /* 72 */
+   step(60_79, t, a, b, c, d, e, prep0.u32[1]); /* 73 */
+   step(60_79, e, t, a, b, c, d, prep0.u32[2]); /* 74 */
+   step(60_79, d, e, t, a, b, c, prep0.u32[3]); /* 75 */
+   /* no more input to prepare */
+   step(60_79, c, d, e, t, a, b, prep1.u32[0]); /* 76 */
+   step(60_79, b, c, d, e, t, a, prep1.u32[1]); /* 77 */
+   step(60_79, a, b, c, d, e, t, prep1.u32[2]); /* 78 */
+   step(60_79, t, a, b, c, d, e, prep1.u32[3]); /* 79 */
+   /* e, t, a, b, c, d */
+   H[0] += e;
+   H[1] += t;
+   H[2] += a;
+   H[3] += b;
+   H[4] += c;
+   }
+
+}