From a3366dc421c4ef9f802009a667ddcb8a9cd2c8a6 Mon Sep 17 00:00:00 2001 From: lloyd Date: Mon, 29 Sep 2008 20:20:49 +0000 Subject: Reorg BigInt asm modules --- src/bigint/asm_amd64/mp_monty.S | 397 -------------------------------- src/bigint/asm_amd64/mp_mulop.cpp | 94 -------- src/bigint/asm_amd64/mp_mulop_amd64.S | 128 ---------- src/bigint/asm_amd64/xxxinfo.txt | 34 --- src/bigint/asm_ia32/mp_mulop.S | 62 ----- src/bigint/asm_ia32/xxxinfo.txt | 43 ---- src/bigint/monty_amd64/mp_monty.S | 397 ++++++++++++++++++++++++++++++++ src/bigint/monty_amd64/xxxinfo.txt | 34 +++ src/bigint/mulop_amd64/mp_mulop.cpp | 94 ++++++++ src/bigint/mulop_amd64/mp_mulop_amd64.S | 128 ++++++++++ src/bigint/mulop_ia32/mp_mulop.S | 62 +++++ src/bigint/mulop_ia32/xxxinfo.txt | 43 ++++ 12 files changed, 758 insertions(+), 758 deletions(-) delete mode 100644 src/bigint/asm_amd64/mp_monty.S delete mode 100644 src/bigint/asm_amd64/mp_mulop.cpp delete mode 100644 src/bigint/asm_amd64/mp_mulop_amd64.S delete mode 100644 src/bigint/asm_amd64/xxxinfo.txt delete mode 100644 src/bigint/asm_ia32/mp_mulop.S delete mode 100644 src/bigint/asm_ia32/xxxinfo.txt create mode 100644 src/bigint/monty_amd64/mp_monty.S create mode 100644 src/bigint/monty_amd64/xxxinfo.txt create mode 100644 src/bigint/mulop_amd64/mp_mulop.cpp create mode 100644 src/bigint/mulop_amd64/mp_mulop_amd64.S create mode 100644 src/bigint/mulop_ia32/mp_mulop.S create mode 100644 src/bigint/mulop_ia32/xxxinfo.txt (limited to 'src') diff --git a/src/bigint/asm_amd64/mp_monty.S b/src/bigint/asm_amd64/mp_monty.S deleted file mode 100644 index 3dd4040bc..000000000 --- a/src/bigint/asm_amd64/mp_monty.S +++ /dev/null @@ -1,397 +0,0 @@ -/************************************************* -* Montgomery Reduction Source File * -* (C) 2008 Jack Lloyd * -*************************************************/ - -#include - -START_LISTING(mp_monty.S) - -START_FUNCTION(bigint_monty_redc) - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq %rbx # - - movq %rdi, %r14 # z - movq %rdx, %r12 # x - movl %esi, %ebp # z_size - - xorl %esi, %esi # j.76 - movq %r8, -16(%rsp) # u, u - movl %ecx, %ebx # x_size, x_size - movl %ecx, %r8d # x_size, blocks_of_8 - andl $-8, %r8d #, blocks_of_8 - testl %ecx, %ecx # x_size - je .L3 #, - mov %ecx, %eax # x_size, pretmp.71 - leal 1(%rbx), %r15d #, k.73 - salq $3, %rax #, - xorl %r13d, %r13d # j - movq %rax, -8(%rsp) #, pretmp.21 - .p2align 4,,10 - .p2align 3 -.L11: - mov %r13d, %eax # j, j - movq -16(%rsp), %rdi # u, y - leaq (%r14,%rax,8), %r11 #, z_j - xorl %r9d, %r9d # i - imulq (%r11), %rdi #* z_j, y - xorl %r10d, %r10d # carry - testl %r8d, %r8d # blocks_of_8 - je .L7 #, - .p2align 4,,10 - .p2align 3 -.LOOP_MUL_ADD: - mov %r9d, %ecx # i, i - addl $8, %r9d #, i - salq $3, %rcx #, D.2315 - leaq (%r11,%rcx), %rsi #, tmp130 - leaq (%r12,%rcx), %rcx #, tmp131 - - movq 8*0(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*0(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*0 (%rsi) - - movq 8*1(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*1(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*1 (%rsi) - - movq 8*2(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*2(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*2 (%rsi) - - movq 8*3(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*3(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*3 (%rsi) - - movq 8*4(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*4(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*4 (%rsi) - - movq 8*5(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*5(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*5 (%rsi) - - movq 8*6(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*6(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*6 (%rsi) - - movq 8*7(%rcx), %rax - mulq %rdi # y - addq %r10, %rax # carry - adcq $0,%rdx - addq 8*7(%rsi), %rax - adcq $0,%rdx - movq %rdx,%r10 # carry - movq %rax, 8*7 (%rsi) - - cmpl %r9d, %r8d # i, blocks_of_8 - jne .LOOP_MUL_ADD #, - cmpl %r8d, %ebx # blocks_of_8, x_size - je .L8 #, -.L7: - movl %r8d, %esi # blocks_of_8, i - .p2align 4,,10 - .p2align 3 -.L5: - mov %esi, %eax # i, i - movq %rdi, %rcx # y, b - leaq (%r11, %rax,8), %r9 #, D.2325 - incl %esi # i - movq (%r12, %rax,8), %rax #* x, tmp133 - - mulq %rcx # b - addq (%r9), %rax #* D.2325, a - adcq $0,%rdx # - addq %r10, %rax # carry, a - adcq $0,%rdx # - - cmpl %esi, %ebx # i, x_size - movq %rdx, %r10 #, carry - movq %rax, (%r9) # a,* D.2325 - jne .L5 #, -.L8: - movq -8(%rsp), %rdx # pretmp.21, - leaq (%r11,%rdx), %rax #, D.2332 - movq (%rax), %rcx #* D.2332, D.2333 - leaq (%r10,%rcx), %rdx #, z_sum - movq %rdx, (%rax) # z_sum,* D.2332 - cmpq %rdx, %rcx # z_sum, D.2333 - jbe .L9 #, - cmpl %ebp, %r15d # z_size, k.73 - je .L9 #, - movl %r15d, %ecx # k.73, k - jmp .L10 # - .p2align 4,,10 - .p2align 3 -.L31: - incl %ecx # k - cmpl %ecx, %ebp # k, z_size - .p2align 4,,4 - .p2align 3 - je .L9 #, -.L10: - mov %ecx, %edx # k, k - leaq (%r11,%rdx,8), %rdx #, D.2342 - movq (%rdx), %rax #* D.2342, tmp136 - incq %rax # D.2344 - movq %rax, (%rdx) # D.2344,* D.2342 - testq %rax, %rax # D.2344 - je .L31 #, -.L9: - incl %r13d # j - decl %ebp # z_size - cmpl %r13d, %ebx # j, x_size - jne .L11 #, - movl %ebx, %esi # x_size, j.76 -.L3: - leal (%rbx,%rbx), %eax #, tmp137 - mov %eax, %eax - leaq (%r14, %rax,8), %rdi #, D.2349 - cmpq $0, (%rdi) #,* D.2349 - jne .L12 #, - testl %ebx, %ebx # x_size - je .L12 #, - leal -1(%rbx), %ecx #, j - leal (%rsi,%rcx), %edx #, tmp141 - mov %ecx, %eax # j, j - movq (%r14,%rdx,8), %rbp #* z, - cmpq %rbp, (%r12, %rax,8) #,* x - jb .L12 #, - ja .L_EXIT #, - leal -2(%rsi,%rbx), %edx #, ivtmp.45 - jmp .L14 # - .p2align 4,,10 - .p2align 3 -.L15: - mov %edx, %eax # ivtmp.45, ivtmp.45 - decl %ecx # j - movq (%r14, %rax,8), %rsi #* z, D.2360 - mov %ecx, %eax # j, j - movq (%r12, %rax,8), %rax #* x, temp.68 - cmpq %rax, %rsi - ja .L12 #, - decl %edx # ivtmp.45 - cmpq %rax, %rsi - jb .L_EXIT #, -.L14: - testl %ecx, %ecx # j - jne .L15 #, -.L12: - xorl %ecx, %ecx # j - xorl %r10d, %r10d # carry - mov %ebx, %esi # x_size, pretmp.19 - testl %r8d, %r8d # blocks_of_8 - je .L17 #, - .p2align 4,,10 - .p2align 3 -.L22: - mov %ecx, %edx # j, D.2375 - addl $8, %ecx #, j - leaq (%rdx,%rsi), %rax #, tmp146 - leaq (%r12,%rdx,8), %rdx #, tmp150 - leaq (%r14, %rax,8), %rax #, tmp148 - - rorq %r10 # carry - - movq 8*0(%rdx), %r10 - sbbq %r10, 8*0(%rax) - - movq 8*1(%rdx), %r10 - sbbq %r10, 8*1(%rax) - - movq 8*2(%rdx), %r10 - sbbq %r10, 8*2(%rax) - - movq 8*3(%rdx), %r10 - sbbq %r10, 8*3(%rax) - - movq 8*4(%rdx), %r10 - sbbq %r10, 8*4(%rax) - - movq 8*5(%rdx), %r10 - sbbq %r10, 8*5(%rax) - - movq 8*6(%rdx), %r10 - sbbq %r10, 8*6(%rax) - - movq 8*7(%rdx), %r10 - sbbq %r10, 8*7(%rax) - - sbbq %r10,%r10 # carry - negq %r10 # carry - - cmpl %ecx, %r8d # j, blocks_of_8 - jne .L22 #, -.L17: - cmpl %r8d, %ebx # blocks_of_8, x_size - je .L19 #, - leal (%r8,%rbx), %r9d #, ivtmp.33 - movl %r8d, %esi # blocks_of_8, j - .p2align 4,,10 - .p2align 3 -.L20: - mov %r9d, %eax # ivtmp.33, ivtmp.33 - mov %esi, %ecx # j, j - leaq (%r14, %rax,8), %rax #, D.2387 - incl %esi # j - movq (%rax), %rdx #* D.2387, tmp153 - incl %r9d # ivtmp.33 - - rorq %r10 # carry - sbbq (%r12,%rcx,8),%rdx #* x, x - sbbq %r10,%r10 # carry - negq %r10 # carry - - cmpl %esi, %ebx # j, x_size - movq %rdx, (%rax) # x,* D.2387 - jne .L20 #, -.L19: - testq %r10, %r10 # carry - je .L_EXIT #, - decq (%rdi) #* D.2349 -.L_EXIT: - popq %rbx # - popq %rbp # - popq %r12 # - popq %r13 # - popq %r14 # - popq %r15 # -END_FUNCTION(bigint_monty_redc) - - -#if 0 - #define Z_ARR ARG_1 // rdi -#define Z_SIZE ARG_2_32 // esi -// X_ARR is ARG_3 == rdx, moved b/c needed for multiply -#define X_SIZE ARG_4_32 // ecx -#define U ARG_5 // r8 - -/* - We need all arguments for a while (we can reuse U eventually) - So only temp registers are - TEMP_1 %r10 - TEMP_2 %r11 - TEMP_3 = ARG_6 = %r9 - void return, so also - R0 %rax (aka TEMP_9) - is free (but needed for multiply) - - Can push: - %rbx (base pointer, callee saved) - %rpb (frame pointer, callee saved) - %r12-%r15 (callee saved) - - Can push base/frame pointers since this is a leaf function - and does not reference any data. -*/ - - push %r12 - push %r13 - push %r14 - push %r15 - -#define LOOP_CTR_I %r12 -#define LOOP_CTR_J %r13 - -#define CARRY TEMP_1 -#define Z_WORD TEMP_2 -#define X_ARR TEMP_3 -#define MUL_LO %rax -#define MUL_HI %rdx - - ASSIGN(X_ARR, ARG_3) - - /* - ZEROIZE(CARRY) - - ASSIGN(LOOP_CTR, X_SIZE) - - JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) - JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) - -#define MULADD_OP(N) \ - ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ - ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ - MUL(Y) ; \ - ADD(Z_WORD, CARRY) ; \ - ASSIGN(CARRY, MUL_HI) ; \ - ADD_LAST_CARRY(CARRY) ; \ - ADD(Z_WORD, MUL_LO) ; \ - ADD_LAST_CARRY(CARRY) ; \ - ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) - -ALIGN -.LOOP_MULADD8: - MULADD_OP(0) - MULADD_OP(1) - MULADD_OP(2) - MULADD_OP(3) - MULADD_OP(4) - MULADD_OP(5) - MULADD_OP(6) - MULADD_OP(7) - - SUB_IMM(LOOP_CTR, 8) - ADD_IMM(Z_ARR, 64) - ADD_IMM(X_ARR, 64) - cmp IMM(8), LOOP_CTR - jge .LOOP_MULADD8 - - JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) - -ALIGN -.LOOP_MULADD1: - MULADD_OP(0) - - SUB_IMM(LOOP_CTR, 1) - ADD_IMM(Z_ARR, 8) - ADD_IMM(X_ARR, 8) - - cmp IMM(0), LOOP_CTR - jne .LOOP_MULADD1 -*/ - - pop %r15 - pop %r14 - pop %r13 - pop %r12 -#endif diff --git a/src/bigint/asm_amd64/mp_mulop.cpp b/src/bigint/asm_amd64/mp_mulop.cpp deleted file mode 100644 index d1aa51489..000000000 --- a/src/bigint/asm_amd64/mp_mulop.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/************************************************* -* Simple O(N^2) Multiplication and Squaring * -* (C) 1999-2008 Jack Lloyd * -*************************************************/ - -#include -#include -#include -#include - -namespace Botan { - -extern "C" { - -/************************************************* -* Simple O(N^2) Multiplication * -*************************************************/ -void bigint_simple_mul(word z[], const word x[], u32bit x_size, - const word y[], u32bit y_size) - { - const u32bit blocks = x_size - (x_size % 8); - - clear_mem(z, x_size + y_size); - - for(u32bit i = 0; i != y_size; ++i) - { - word carry = 0; - - for(u32bit j = 0; j != blocks; j += 8) - carry = word8_madd3(z + i + j, x + j, y[i], carry); - - for(u32bit j = blocks; j != x_size; ++j) - z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry); - - z[x_size+i] = carry; - } - } - -inline word word_sqr(word x, - -/************************************************* -* Simple O(N^2) Squaring - -This is exactly the same algorithm as bigint_simple_mul, -however because C/C++ compilers suck at alias analysis it -is good to have the version where the compiler knows -that x == y -*************************************************/ -void bigint_simple_sqr(word z[], const word x[], u32bit x_size) - { - clear_mem(z, 2*x_size); - - for(u32bit i = 0; i != x_size; ++i) - { - const word x_i = x[i]; - - word carry = z[2*i]; - z[2*i] = word_madd2(x_i, x_i, z[2*i], &carry); - - for(u32bit j = i; j != x_size; ++j) - { - // z[i+j] = z[i+j] + 2 * x[j] * x_i + carry; - - /* - load z[i+j] into register - load x[j] into %hi - mulq %[x_i] -> x[i] * x[j] -> %lo:%hi - shlq %lo, $1 - - // put carry bit (cf) from %lo into %temp - xorl %temp - adcq $0, %temp - - // high bit of lo now in cf - shl %hi, $1 - // add in lowest bid from %lo - orl %temp, %hi - - addq %[c], %[lo] - adcq $0, %[hi] - addq %[z_ij], %[lo] - adcq $0, %[hi] - - */ - - } - - z[x_size+i] = carry; - } - } - -} - -} diff --git a/src/bigint/asm_amd64/mp_mulop_amd64.S b/src/bigint/asm_amd64/mp_mulop_amd64.S deleted file mode 100644 index e5bba23fb..000000000 --- a/src/bigint/asm_amd64/mp_mulop_amd64.S +++ /dev/null @@ -1,128 +0,0 @@ -/************************************************* -* Simple O(N^2) Multiplication and Squaring * -* (C) 1999-2008 Jack Lloyd * -*************************************************/ - -#include - -START_LISTING(mp_mulop.S) - -#if 0 -void bigint_simple_sqr(word z[], const word x[], u32bit x_size) - { - const u32bit blocks = x_size - (x_size % 8); - - clear_mem(z, 2*x_size); - - for(u32bit i = 0; i != x_size; ++i) - { - word carry = 0; - - /* - for(u32bit j = 0; j != blocks; j += 8) - carry = word8_madd3(z + i + j, x + j, x[i], carry); - - for(u32bit j = blocks; j != x_size; ++j) - z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); - */ - - - for(u32bit j = 0; j != x_size; ++j) - z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); - - for(u32bit j = 0; j != x_size; ++j) - { - dword z = (dword)a * b + c + *d; - *d = (word)(z >> BOTAN_MP_WORD_BITS); - return (word)z; - } - - - - z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); - - } - - - - z[x_size+i] = carry; - } - } - -#endif - -START_FUNCTION(bigint_simple_sqr) - -#define Z_ARR ARG_1 -#define X_ARR ARG_2 -//#define X_SIZE ARG_3_32 - -#define CARRY TEMP_1 -#define Z_WORD TEMP_2 -#define LOOP_I TEMP_3 -#define LOOP_J TEMP_4 -#define X_SIZE TEMP_5 -#define MUL_LO %rax -// arg 3, xsize -#define MUL_HI %rdx - -// need arg3 == rdx for multiply - ASSIGN(X_SIZE, ARG3_32) - - ZEROIZE(CARRY) - - ZEROIZE(LOOP_I) - -.LOOP_ZEROIZE_Z: - - cmp LOOP_I, X_SIZE - - - - - JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) - JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) - -#define MULADD_OP(N) \ - ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ - ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ - MUL(Y) ; \ - ADD(Z_WORD, CARRY) ; \ - ASSIGN(CARRY, MUL_HI) ; \ - ADD_LAST_CARRY(CARRY) ; \ - ADD(Z_WORD, MUL_LO) ; \ - ADD_LAST_CARRY(CARRY) ; \ - ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) - -.LOOP_MULADD8: - MULADD_OP(0) - MULADD_OP(1) - MULADD_OP(2) - MULADD_OP(3) - MULADD_OP(4) - MULADD_OP(5) - MULADD_OP(6) - MULADD_OP(7) - - SUB_IMM(LOOP_CTR, 8) - ADD_IMM(Z_ARR, 64) - ADD_IMM(X_ARR, 64) - cmp IMM(8), LOOP_CTR - jge .LOOP_MULADD8 - - JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) - -ALIGN -.LOOP_MULADD1: - MULADD_OP(0) - - SUB_IMM(LOOP_CTR, 1) - ADD_IMM(Z_ARR, 8) - ADD_IMM(X_ARR, 8) - - cmp IMM(0), LOOP_CTR - jne .LOOP_MULADD1 - -.L_MULADD_DONE: - RETURN_VALUE_IS(CARRY) -END_FUNCTION(bigint_simple_square) diff --git a/src/bigint/asm_amd64/xxxinfo.txt b/src/bigint/asm_amd64/xxxinfo.txt deleted file mode 100644 index 2a8f9fe5b..000000000 --- a/src/bigint/asm_amd64/xxxinfo.txt +++ /dev/null @@ -1,34 +0,0 @@ -realname "x86-64 Assembler" - -mp_bits 64 - -load_on request - - -#mp_mulop.cpp -#mp_monty.cpp - - - -asm_macr.h -#mp_mulop_amd64.S -#mp_monty.S - - - -amd64 - - - -gcc -icc - - -# ELF systems - -linux -freebsd -netbsd -openbsd -solaris - diff --git a/src/bigint/asm_ia32/mp_mulop.S b/src/bigint/asm_ia32/mp_mulop.S deleted file mode 100644 index a5f0d3b27..000000000 --- a/src/bigint/asm_ia32/mp_mulop.S +++ /dev/null @@ -1,62 +0,0 @@ -/************************************************* -* Multiply/Add Algorithm Source File * -* (C) 1999-2007 Jack Lloyd * -*************************************************/ - -#include - -START_LISTING(mp_muladd.S) - -START_FUNCTION(bigint_mul_add_words) - SPILL_REGS() -#define PUSHED 4 - -#define LOOP_CTR ESI - ASSIGN(LOOP_CTR, ARG(3)) /* x_size */ - ZEROIZE(EDI) - - ASSIGN(ECX, ARG(1)) /* z[] */ - ASSIGN(EBX, ARG(2)) /* x[] */ - ASSIGN(EBP, ARG(4)) /* y */ - -#define MULADD_OP(N) \ - ASSIGN(EAX, ARRAY4(EBX, N)) ; \ - MUL(EBP) ; \ - ADD_W_CARRY(EAX, EDX, EDI) ; \ - ASSIGN(EDI, EDX) ; \ - ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ; - - JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) - JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP) - -START_LOOP(.MULADD8) - MULADD_OP(0) - MULADD_OP(1) - MULADD_OP(2) - MULADD_OP(3) - MULADD_OP(4) - MULADD_OP(5) - MULADD_OP(6) - MULADD_OP(7) - - SUB_IMM(LOOP_CTR, 8) - ADD_IMM(EBX, 32) - ADD_IMM(ECX, 32) -LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8) - - JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) - -START_LOOP(.MULADD1) - MULADD_OP(0) - - SUB_IMM(LOOP_CTR, 1) - ADD_IMM(EBX, 4) - ADD_IMM(ECX, 4) -LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1) - -.MUL_ADD_DONE: - - ASSIGN(EAX, EDI) -#undef PUSHED - RESTORE_REGS() -END_FUNCTION(bigint_mul_add_words) diff --git a/src/bigint/asm_ia32/xxxinfo.txt b/src/bigint/asm_ia32/xxxinfo.txt deleted file mode 100644 index 12c8cd96d..000000000 --- a/src/bigint/asm_ia32/xxxinfo.txt +++ /dev/null @@ -1,43 +0,0 @@ -realname "x86 Assembler" - -#mp_bits 32 - -load_on asm_ok - - -md4.cpp -md5.cpp -sha160.cpp -serpent.cpp - - - -#mp_mulop.cpp - - - -asm_macr.h -md4core.S -md5core.S -sha1_asm.S -serp_asm.S -#mp_mulop.S - - - -ia32 - - - -gcc -icc - - -# ELF systems - -linux -freebsd -netbsd -openbsd -solaris - diff --git a/src/bigint/monty_amd64/mp_monty.S b/src/bigint/monty_amd64/mp_monty.S new file mode 100644 index 000000000..3dd4040bc --- /dev/null +++ b/src/bigint/monty_amd64/mp_monty.S @@ -0,0 +1,397 @@ +/************************************************* +* Montgomery Reduction Source File * +* (C) 2008 Jack Lloyd * +*************************************************/ + +#include + +START_LISTING(mp_monty.S) + +START_FUNCTION(bigint_monty_redc) + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + + movq %rdi, %r14 # z + movq %rdx, %r12 # x + movl %esi, %ebp # z_size + + xorl %esi, %esi # j.76 + movq %r8, -16(%rsp) # u, u + movl %ecx, %ebx # x_size, x_size + movl %ecx, %r8d # x_size, blocks_of_8 + andl $-8, %r8d #, blocks_of_8 + testl %ecx, %ecx # x_size + je .L3 #, + mov %ecx, %eax # x_size, pretmp.71 + leal 1(%rbx), %r15d #, k.73 + salq $3, %rax #, + xorl %r13d, %r13d # j + movq %rax, -8(%rsp) #, pretmp.21 + .p2align 4,,10 + .p2align 3 +.L11: + mov %r13d, %eax # j, j + movq -16(%rsp), %rdi # u, y + leaq (%r14,%rax,8), %r11 #, z_j + xorl %r9d, %r9d # i + imulq (%r11), %rdi #* z_j, y + xorl %r10d, %r10d # carry + testl %r8d, %r8d # blocks_of_8 + je .L7 #, + .p2align 4,,10 + .p2align 3 +.LOOP_MUL_ADD: + mov %r9d, %ecx # i, i + addl $8, %r9d #, i + salq $3, %rcx #, D.2315 + leaq (%r11,%rcx), %rsi #, tmp130 + leaq (%r12,%rcx), %rcx #, tmp131 + + movq 8*0(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*0(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*0 (%rsi) + + movq 8*1(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*1(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*1 (%rsi) + + movq 8*2(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*2(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*2 (%rsi) + + movq 8*3(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*3(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*3 (%rsi) + + movq 8*4(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*4(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*4 (%rsi) + + movq 8*5(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*5(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*5 (%rsi) + + movq 8*6(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*6(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*6 (%rsi) + + movq 8*7(%rcx), %rax + mulq %rdi # y + addq %r10, %rax # carry + adcq $0,%rdx + addq 8*7(%rsi), %rax + adcq $0,%rdx + movq %rdx,%r10 # carry + movq %rax, 8*7 (%rsi) + + cmpl %r9d, %r8d # i, blocks_of_8 + jne .LOOP_MUL_ADD #, + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L8 #, +.L7: + movl %r8d, %esi # blocks_of_8, i + .p2align 4,,10 + .p2align 3 +.L5: + mov %esi, %eax # i, i + movq %rdi, %rcx # y, b + leaq (%r11, %rax,8), %r9 #, D.2325 + incl %esi # i + movq (%r12, %rax,8), %rax #* x, tmp133 + + mulq %rcx # b + addq (%r9), %rax #* D.2325, a + adcq $0,%rdx # + addq %r10, %rax # carry, a + adcq $0,%rdx # + + cmpl %esi, %ebx # i, x_size + movq %rdx, %r10 #, carry + movq %rax, (%r9) # a,* D.2325 + jne .L5 #, +.L8: + movq -8(%rsp), %rdx # pretmp.21, + leaq (%r11,%rdx), %rax #, D.2332 + movq (%rax), %rcx #* D.2332, D.2333 + leaq (%r10,%rcx), %rdx #, z_sum + movq %rdx, (%rax) # z_sum,* D.2332 + cmpq %rdx, %rcx # z_sum, D.2333 + jbe .L9 #, + cmpl %ebp, %r15d # z_size, k.73 + je .L9 #, + movl %r15d, %ecx # k.73, k + jmp .L10 # + .p2align 4,,10 + .p2align 3 +.L31: + incl %ecx # k + cmpl %ecx, %ebp # k, z_size + .p2align 4,,4 + .p2align 3 + je .L9 #, +.L10: + mov %ecx, %edx # k, k + leaq (%r11,%rdx,8), %rdx #, D.2342 + movq (%rdx), %rax #* D.2342, tmp136 + incq %rax # D.2344 + movq %rax, (%rdx) # D.2344,* D.2342 + testq %rax, %rax # D.2344 + je .L31 #, +.L9: + incl %r13d # j + decl %ebp # z_size + cmpl %r13d, %ebx # j, x_size + jne .L11 #, + movl %ebx, %esi # x_size, j.76 +.L3: + leal (%rbx,%rbx), %eax #, tmp137 + mov %eax, %eax + leaq (%r14, %rax,8), %rdi #, D.2349 + cmpq $0, (%rdi) #,* D.2349 + jne .L12 #, + testl %ebx, %ebx # x_size + je .L12 #, + leal -1(%rbx), %ecx #, j + leal (%rsi,%rcx), %edx #, tmp141 + mov %ecx, %eax # j, j + movq (%r14,%rdx,8), %rbp #* z, + cmpq %rbp, (%r12, %rax,8) #,* x + jb .L12 #, + ja .L_EXIT #, + leal -2(%rsi,%rbx), %edx #, ivtmp.45 + jmp .L14 # + .p2align 4,,10 + .p2align 3 +.L15: + mov %edx, %eax # ivtmp.45, ivtmp.45 + decl %ecx # j + movq (%r14, %rax,8), %rsi #* z, D.2360 + mov %ecx, %eax # j, j + movq (%r12, %rax,8), %rax #* x, temp.68 + cmpq %rax, %rsi + ja .L12 #, + decl %edx # ivtmp.45 + cmpq %rax, %rsi + jb .L_EXIT #, +.L14: + testl %ecx, %ecx # j + jne .L15 #, +.L12: + xorl %ecx, %ecx # j + xorl %r10d, %r10d # carry + mov %ebx, %esi # x_size, pretmp.19 + testl %r8d, %r8d # blocks_of_8 + je .L17 #, + .p2align 4,,10 + .p2align 3 +.L22: + mov %ecx, %edx # j, D.2375 + addl $8, %ecx #, j + leaq (%rdx,%rsi), %rax #, tmp146 + leaq (%r12,%rdx,8), %rdx #, tmp150 + leaq (%r14, %rax,8), %rax #, tmp148 + + rorq %r10 # carry + + movq 8*0(%rdx), %r10 + sbbq %r10, 8*0(%rax) + + movq 8*1(%rdx), %r10 + sbbq %r10, 8*1(%rax) + + movq 8*2(%rdx), %r10 + sbbq %r10, 8*2(%rax) + + movq 8*3(%rdx), %r10 + sbbq %r10, 8*3(%rax) + + movq 8*4(%rdx), %r10 + sbbq %r10, 8*4(%rax) + + movq 8*5(%rdx), %r10 + sbbq %r10, 8*5(%rax) + + movq 8*6(%rdx), %r10 + sbbq %r10, 8*6(%rax) + + movq 8*7(%rdx), %r10 + sbbq %r10, 8*7(%rax) + + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %ecx, %r8d # j, blocks_of_8 + jne .L22 #, +.L17: + cmpl %r8d, %ebx # blocks_of_8, x_size + je .L19 #, + leal (%r8,%rbx), %r9d #, ivtmp.33 + movl %r8d, %esi # blocks_of_8, j + .p2align 4,,10 + .p2align 3 +.L20: + mov %r9d, %eax # ivtmp.33, ivtmp.33 + mov %esi, %ecx # j, j + leaq (%r14, %rax,8), %rax #, D.2387 + incl %esi # j + movq (%rax), %rdx #* D.2387, tmp153 + incl %r9d # ivtmp.33 + + rorq %r10 # carry + sbbq (%r12,%rcx,8),%rdx #* x, x + sbbq %r10,%r10 # carry + negq %r10 # carry + + cmpl %esi, %ebx # j, x_size + movq %rdx, (%rax) # x,* D.2387 + jne .L20 #, +.L19: + testq %r10, %r10 # carry + je .L_EXIT #, + decq (%rdi) #* D.2349 +.L_EXIT: + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # +END_FUNCTION(bigint_monty_redc) + + +#if 0 + #define Z_ARR ARG_1 // rdi +#define Z_SIZE ARG_2_32 // esi +// X_ARR is ARG_3 == rdx, moved b/c needed for multiply +#define X_SIZE ARG_4_32 // ecx +#define U ARG_5 // r8 + +/* + We need all arguments for a while (we can reuse U eventually) + So only temp registers are + TEMP_1 %r10 + TEMP_2 %r11 + TEMP_3 = ARG_6 = %r9 + void return, so also + R0 %rax (aka TEMP_9) + is free (but needed for multiply) + + Can push: + %rbx (base pointer, callee saved) + %rpb (frame pointer, callee saved) + %r12-%r15 (callee saved) + + Can push base/frame pointers since this is a leaf function + and does not reference any data. +*/ + + push %r12 + push %r13 + push %r14 + push %r15 + +#define LOOP_CTR_I %r12 +#define LOOP_CTR_J %r13 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define X_ARR TEMP_3 +#define MUL_LO %rax +#define MUL_HI %rdx + + ASSIGN(X_ARR, ARG_3) + + /* + ZEROIZE(CARRY) + + ASSIGN(LOOP_CTR, X_SIZE) + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +ALIGN +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 +*/ + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif diff --git a/src/bigint/monty_amd64/xxxinfo.txt b/src/bigint/monty_amd64/xxxinfo.txt new file mode 100644 index 000000000..2a8f9fe5b --- /dev/null +++ b/src/bigint/monty_amd64/xxxinfo.txt @@ -0,0 +1,34 @@ +realname "x86-64 Assembler" + +mp_bits 64 + +load_on request + + +#mp_mulop.cpp +#mp_monty.cpp + + + +asm_macr.h +#mp_mulop_amd64.S +#mp_monty.S + + + +amd64 + + + +gcc +icc + + +# ELF systems + +linux +freebsd +netbsd +openbsd +solaris + diff --git a/src/bigint/mulop_amd64/mp_mulop.cpp b/src/bigint/mulop_amd64/mp_mulop.cpp new file mode 100644 index 000000000..d1aa51489 --- /dev/null +++ b/src/bigint/mulop_amd64/mp_mulop.cpp @@ -0,0 +1,94 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include +#include +#include +#include + +namespace Botan { + +extern "C" { + +/************************************************* +* Simple O(N^2) Multiplication * +*************************************************/ +void bigint_simple_mul(word z[], const word x[], u32bit x_size, + const word y[], u32bit y_size) + { + const u32bit blocks = x_size - (x_size % 8); + + clear_mem(z, x_size + y_size); + + for(u32bit i = 0; i != y_size; ++i) + { + word carry = 0; + + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_madd3(z + i + j, x + j, y[i], carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[i+j] = word_madd3(x[j], y[i], z[i+j], &carry); + + z[x_size+i] = carry; + } + } + +inline word word_sqr(word x, + +/************************************************* +* Simple O(N^2) Squaring + +This is exactly the same algorithm as bigint_simple_mul, +however because C/C++ compilers suck at alias analysis it +is good to have the version where the compiler knows +that x == y +*************************************************/ +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + const word x_i = x[i]; + + word carry = z[2*i]; + z[2*i] = word_madd2(x_i, x_i, z[2*i], &carry); + + for(u32bit j = i; j != x_size; ++j) + { + // z[i+j] = z[i+j] + 2 * x[j] * x_i + carry; + + /* + load z[i+j] into register + load x[j] into %hi + mulq %[x_i] -> x[i] * x[j] -> %lo:%hi + shlq %lo, $1 + + // put carry bit (cf) from %lo into %temp + xorl %temp + adcq $0, %temp + + // high bit of lo now in cf + shl %hi, $1 + // add in lowest bid from %lo + orl %temp, %hi + + addq %[c], %[lo] + adcq $0, %[hi] + addq %[z_ij], %[lo] + adcq $0, %[hi] + + */ + + } + + z[x_size+i] = carry; + } + } + +} + +} diff --git a/src/bigint/mulop_amd64/mp_mulop_amd64.S b/src/bigint/mulop_amd64/mp_mulop_amd64.S new file mode 100644 index 000000000..e5bba23fb --- /dev/null +++ b/src/bigint/mulop_amd64/mp_mulop_amd64.S @@ -0,0 +1,128 @@ +/************************************************* +* Simple O(N^2) Multiplication and Squaring * +* (C) 1999-2008 Jack Lloyd * +*************************************************/ + +#include + +START_LISTING(mp_mulop.S) + +#if 0 +void bigint_simple_sqr(word z[], const word x[], u32bit x_size) + { + const u32bit blocks = x_size - (x_size % 8); + + clear_mem(z, 2*x_size); + + for(u32bit i = 0; i != x_size; ++i) + { + word carry = 0; + + /* + for(u32bit j = 0; j != blocks; j += 8) + carry = word8_madd3(z + i + j, x + j, x[i], carry); + + for(u32bit j = blocks; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + */ + + + for(u32bit j = 0; j != x_size; ++j) + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + for(u32bit j = 0; j != x_size; ++j) + { + dword z = (dword)a * b + c + *d; + *d = (word)(z >> BOTAN_MP_WORD_BITS); + return (word)z; + } + + + + z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry); + + } + + + + z[x_size+i] = carry; + } + } + +#endif + +START_FUNCTION(bigint_simple_sqr) + +#define Z_ARR ARG_1 +#define X_ARR ARG_2 +//#define X_SIZE ARG_3_32 + +#define CARRY TEMP_1 +#define Z_WORD TEMP_2 +#define LOOP_I TEMP_3 +#define LOOP_J TEMP_4 +#define X_SIZE TEMP_5 +#define MUL_LO %rax +// arg 3, xsize +#define MUL_HI %rdx + +// need arg3 == rdx for multiply + ASSIGN(X_SIZE, ARG3_32) + + ZEROIZE(CARRY) + + ZEROIZE(LOOP_I) + +.LOOP_ZEROIZE_Z: + + cmp LOOP_I, X_SIZE + + + + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1) + +#define MULADD_OP(N) \ + ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \ + ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \ + MUL(Y) ; \ + ADD(Z_WORD, CARRY) ; \ + ASSIGN(CARRY, MUL_HI) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ADD(Z_WORD, MUL_LO) ; \ + ADD_LAST_CARRY(CARRY) ; \ + ASSIGN(ARRAY8(Z_ARR, N), Z_WORD) + +.LOOP_MULADD8: + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(Z_ARR, 64) + ADD_IMM(X_ARR, 64) + cmp IMM(8), LOOP_CTR + jge .LOOP_MULADD8 + + JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE) + +ALIGN +.LOOP_MULADD1: + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(Z_ARR, 8) + ADD_IMM(X_ARR, 8) + + cmp IMM(0), LOOP_CTR + jne .LOOP_MULADD1 + +.L_MULADD_DONE: + RETURN_VALUE_IS(CARRY) +END_FUNCTION(bigint_simple_square) diff --git a/src/bigint/mulop_ia32/mp_mulop.S b/src/bigint/mulop_ia32/mp_mulop.S new file mode 100644 index 000000000..a5f0d3b27 --- /dev/null +++ b/src/bigint/mulop_ia32/mp_mulop.S @@ -0,0 +1,62 @@ +/************************************************* +* Multiply/Add Algorithm Source File * +* (C) 1999-2007 Jack Lloyd * +*************************************************/ + +#include + +START_LISTING(mp_muladd.S) + +START_FUNCTION(bigint_mul_add_words) + SPILL_REGS() +#define PUSHED 4 + +#define LOOP_CTR ESI + ASSIGN(LOOP_CTR, ARG(3)) /* x_size */ + ZEROIZE(EDI) + + ASSIGN(ECX, ARG(1)) /* z[] */ + ASSIGN(EBX, ARG(2)) /* x[] */ + ASSIGN(EBP, ARG(4)) /* y */ + +#define MULADD_OP(N) \ + ASSIGN(EAX, ARRAY4(EBX, N)) ; \ + MUL(EBP) ; \ + ADD_W_CARRY(EAX, EDX, EDI) ; \ + ASSIGN(EDI, EDX) ; \ + ADD_W_CARRY(ARRAY4(ECX, N), EDI, EAX) ; + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + JUMP_IF_LT(LOOP_CTR, 8, .MULADD1_LOOP) + +START_LOOP(.MULADD8) + MULADD_OP(0) + MULADD_OP(1) + MULADD_OP(2) + MULADD_OP(3) + MULADD_OP(4) + MULADD_OP(5) + MULADD_OP(6) + MULADD_OP(7) + + SUB_IMM(LOOP_CTR, 8) + ADD_IMM(EBX, 32) + ADD_IMM(ECX, 32) +LOOP_UNTIL_LT(LOOP_CTR, 8, .MULADD8) + + JUMP_IF_ZERO(LOOP_CTR, .MUL_ADD_DONE) + +START_LOOP(.MULADD1) + MULADD_OP(0) + + SUB_IMM(LOOP_CTR, 1) + ADD_IMM(EBX, 4) + ADD_IMM(ECX, 4) +LOOP_UNTIL_EQ(LOOP_CTR, 0, .MULADD1) + +.MUL_ADD_DONE: + + ASSIGN(EAX, EDI) +#undef PUSHED + RESTORE_REGS() +END_FUNCTION(bigint_mul_add_words) diff --git a/src/bigint/mulop_ia32/xxxinfo.txt b/src/bigint/mulop_ia32/xxxinfo.txt new file mode 100644 index 000000000..12c8cd96d --- /dev/null +++ b/src/bigint/mulop_ia32/xxxinfo.txt @@ -0,0 +1,43 @@ +realname "x86 Assembler" + +#mp_bits 32 + +load_on asm_ok + + +md4.cpp +md5.cpp +sha160.cpp +serpent.cpp + + + +#mp_mulop.cpp + + + +asm_macr.h +md4core.S +md5core.S +sha1_asm.S +serp_asm.S +#mp_mulop.S + + + +ia32 + + + +gcc +icc + + +# ELF systems + +linux +freebsd +netbsd +openbsd +solaris + -- cgit v1.2.3