author     lloyd <[email protected]>  2008-09-28 20:09:30 +0000
committer  lloyd <[email protected]>  2008-09-28 20:09:30 +0000
commit     30241a0530d66b558006466460b4bfb22b79a49c (patch)
tree       1ee6970f0493576befae9f94f74d777562542e71 /src/bigint/asm_amd64
parent     eb8fd42d6aa3ed267c7444b114e02e64a567ca00 (diff)
Split up asm modules into appropriate (topic-specific) modules, e.g.
hash/sha1_amd64 and cipher/serpent_ia32. The remaining code in the asm/ dir is for BigInt, so rename it to bigint/ in preparation for all (or most) of BigInt being modularized.
Diffstat (limited to 'src/bigint/asm_amd64')
-rw-r--r--  src/bigint/asm_amd64/asm_macr.h        | 125
-rw-r--r--  src/bigint/asm_amd64/modinfo.txt       |  34
-rw-r--r--  src/bigint/asm_amd64/mp_monty.S        | 397
-rw-r--r--  src/bigint/asm_amd64/mp_mulop_amd64.S  | 128
4 files changed, 684 insertions(+), 0 deletions(-)
diff --git a/src/bigint/asm_amd64/asm_macr.h b/src/bigint/asm_amd64/asm_macr.h
new file mode 100644
index 000000000..3cdd42dc6
--- /dev/null
+++ b/src/bigint/asm_amd64/asm_macr.h
@@ -0,0 +1,125 @@
+/*************************************************
+* Assembly Macros Header File *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#ifndef BOTAN_AMD64_ASM_MACROS_H__
+#define BOTAN_AMD64_ASM_MACROS_H__
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+/*************************************************
+* General/Global Macros *
+*************************************************/
+#define ALIGN .p2align 4,,15
+
+#define START_LISTING(FILENAME) \
+ .file #FILENAME; \
+ .text; \
+ ALIGN;
+
+/*************************************************
+* Function Definitions *
+*************************************************/
+#define START_FUNCTION(func_name) \
+ ALIGN; \
+ .global func_name; \
+ .type func_name,@function; \
+func_name:
+
+#define END_FUNCTION(func_name) \
+ ret
+
+/*************************************************
+* Conditional Jumps *
+*************************************************/
+#define JUMP_IF_ZERO(REG, LABEL) \
+ cmp IMM(0), REG; \
+ jz LABEL
+
+#define JUMP_IF_LT(REG, NUM, LABEL) \
+ cmp IMM(NUM), REG; \
+ jl LABEL
+
+/*************************************************
+* Register Names *
+*************************************************/
+#define R0 %rax
+#define R1 %rbx
+#define R2 %rcx
+#define R2_32 %ecx
+#define R3 %rdx
+#define R3_32 %edx
+#define R4 %rsp
+#define R5 %rbp
+#define R6 %rsi
+#define R6_32 %esi
+#define R7 %rdi
+#define R8 %r8
+#define R9 %r9
+#define R9_32 %r9d
+#define R10 %r10
+#define R11 %r11
+#define R12 %r12
+#define R13 %r13
+#define R14 %r14
+#define R15 %r15
+
+#define ARG_1 R7
+#define ARG_2 R6
+#define ARG_2_32 R6_32
+#define ARG_3 R3
+#define ARG_3_32 R3_32
+#define ARG_4 R2
+#define ARG_4_32 R2_32
+#define ARG_5 R8
+#define ARG_6 R9
+#define ARG_6_32 R9_32
+
+#define TEMP_1 R10
+#define TEMP_2 R11
+#define TEMP_3 ARG_6
+#define TEMP_4 ARG_5
+#define TEMP_5 ARG_4
+#define TEMP_5_32 ARG_4_32
+#define TEMP_6 ARG_3
+#define TEMP_7 ARG_2
+#define TEMP_8 ARG_1
+#define TEMP_9 R0
+
+/*************************************************
+* Memory Access Operations *
+*************************************************/
+#define ARRAY8(REG, NUM) 8*(NUM)(REG)
+#define ARRAY4(REG, NUM) 4*(NUM)(REG)
+
+#define ASSIGN(TO, FROM) mov FROM, TO
+
+/*************************************************
+* ALU Operations *
+*************************************************/
+#define IMM(VAL) $VAL
+
+#define ADD(TO, FROM) add FROM, TO
+#define ADD_LAST_CARRY(REG) adc IMM(0), REG
+#define ADD_IMM(TO, NUM) ADD(TO, IMM(NUM))
+#define ADD_W_CARRY(TO1, TO2, FROM) add FROM, TO1; adc IMM(0), TO2;
+#define SUB_IMM(TO, NUM) sub IMM(NUM), TO
+#define MUL(REG) mul REG
+
+#define XOR(TO, FROM) xor FROM, TO
+#define AND(TO, FROM) and FROM, TO
+#define OR(TO, FROM) or FROM, TO
+#define NOT(REG) not REG
+#define ZEROIZE(REG) XOR(REG, REG)
+
+#define RETURN_VALUE_IS(V) ASSIGN(%rax, V)
+
+#define ROTL_IMM(REG, NUM) rol IMM(NUM), REG
+#define ROTR_IMM(REG, NUM) ror IMM(NUM), REG
+#define ADD3_IMM(TO, FROM, NUM) lea NUM(TO,FROM,1), TO
+
+#endif
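For illustration only (not part of this commit): a minimal sketch of how these macros compose into a function. The function name, labels, and semantics below are invented for the example; it sums a word array whose pointer arrives in ARG_1 and whose length arrives in ARG_2_32, per the SysV AMD64 calling convention the register aliases above encode.

    #include <botan/asm_macr.h>

    START_LISTING(example.S)

    START_FUNCTION(example_sum_array)
       // word example_sum_array(const word x[], u32bit size)
       ZEROIZE(TEMP_1)                  // running sum in %r10
       JUMP_IF_ZERO(ARG_2_32, .L_SUM_DONE)
    .L_SUM_LOOP:
       ADD(TEMP_1, ARRAY8(ARG_1, 0))    // sum += *x
       ADD_IMM(ARG_1, 8)                // ++x
       SUB_IMM(ARG_2_32, 1)             // --size
       cmp IMM(0), ARG_2_32
       jne .L_SUM_LOOP
    .L_SUM_DONE:
       RETURN_VALUE_IS(TEMP_1)          // return the sum in %rax
    END_FUNCTION(example_sum_array)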
diff --git a/src/bigint/asm_amd64/modinfo.txt b/src/bigint/asm_amd64/modinfo.txt
new file mode 100644
index 000000000..2a8f9fe5b
--- /dev/null
+++ b/src/bigint/asm_amd64/modinfo.txt
@@ -0,0 +1,34 @@
+realname "x86-64 Assembler"
+
+mp_bits 64
+
+load_on request
+
+<ignore>
+#mp_mulop.cpp
+#mp_monty.cpp
+</ignore>
+
+<add>
+asm_macr.h
+#mp_mulop_amd64.S
+#mp_monty.S
+</add>
+
+<arch>
+amd64
+</arch>
+
+<cc>
+gcc
+icc
+</cc>
+
+# ELF systems
+<os>
+linux
+freebsd
+netbsd
+openbsd
+solaris
+</os>
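(A reader's gloss of the fields above, hedged against the build system of this era: mp_bits 64 declares the word size this backend assumes, load_on request means the module is only compiled when explicitly requested, and the commented-out .S entries in the add block show the assembly sources are present but not yet enabled.)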
diff --git a/src/bigint/asm_amd64/mp_monty.S b/src/bigint/asm_amd64/mp_monty.S
new file mode 100644
index 000000000..3dd4040bc
--- /dev/null
+++ b/src/bigint/asm_amd64/mp_monty.S
@@ -0,0 +1,397 @@
+/*************************************************
+* Montgomery Reduction Source File *
+* (C) 2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_monty.S)
+
+START_FUNCTION(bigint_monty_redc)
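+   # void bigint_monty_redc(word z[], u32bit z_size,
+   #                        const word x[], u32bit x_size, word u)
+   # (per the SysV ABI: z in %rdi, z_size in %esi, x in %rdx,
+   #  x_size in %ecx, u in %r8 -- see the register notes at the end)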
+ pushq %r15 #
+ pushq %r14 #
+ pushq %r13 #
+ pushq %r12 #
+ pushq %rbp #
+ pushq %rbx #
+
+ movq %rdi, %r14 # z
+ movq %rdx, %r12 # x
+ movl %esi, %ebp # z_size
+
+ xorl %esi, %esi # j.76
+ movq %r8, -16(%rsp) # u, u
+ movl %ecx, %ebx # x_size, x_size
+ movl %ecx, %r8d # x_size, blocks_of_8
+ andl $-8, %r8d #, blocks_of_8
+ testl %ecx, %ecx # x_size
+ je .L3 #,
+ mov %ecx, %eax # x_size, pretmp.71
+ leal 1(%rbx), %r15d #, k.73
+ salq $3, %rax #,
+ xorl %r13d, %r13d # j
+ movq %rax, -8(%rsp) #, pretmp.21
+ .p2align 4,,10
+ .p2align 3
+.L11:
+ mov %r13d, %eax # j, j
+ movq -16(%rsp), %rdi # u, y
+ leaq (%r14,%rax,8), %r11 #, z_j
+ xorl %r9d, %r9d # i
+ imulq (%r11), %rdi #* z_j, y
+ xorl %r10d, %r10d # carry
+ testl %r8d, %r8d # blocks_of_8
+ je .L7 #,
+ .p2align 4,,10
+ .p2align 3
+.LOOP_MUL_ADD:
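+   # Each of the eight stanzas below computes
+   #    z_j[i] = word_madd3(x[i], y, z_j[i], &carry):
+   # mulq leaves the 128-bit product x[i]*y in rdx:rax, the incoming
+   # carry and the existing z word are added in, the low half is
+   # stored back, and the high half becomes the next carry.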
+ mov %r9d, %ecx # i, i
+ addl $8, %r9d #, i
+ salq $3, %rcx #, D.2315
+ leaq (%r11,%rcx), %rsi #, tmp130
+ leaq (%r12,%rcx), %rcx #, tmp131
+
+ movq 8*0(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*0(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*0 (%rsi)
+
+ movq 8*1(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*1(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*1 (%rsi)
+
+ movq 8*2(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*2(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*2 (%rsi)
+
+ movq 8*3(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*3(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*3 (%rsi)
+
+ movq 8*4(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*4(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*4 (%rsi)
+
+ movq 8*5(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*5(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*5 (%rsi)
+
+ movq 8*6(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*6(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*6 (%rsi)
+
+ movq 8*7(%rcx), %rax
+ mulq %rdi # y
+ addq %r10, %rax # carry
+ adcq $0,%rdx
+ addq 8*7(%rsi), %rax
+ adcq $0,%rdx
+ movq %rdx,%r10 # carry
+ movq %rax, 8*7 (%rsi)
+
+ cmpl %r9d, %r8d # i, blocks_of_8
+ jne .LOOP_MUL_ADD #,
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L8 #,
+.L7:
+ movl %r8d, %esi # blocks_of_8, i
+ .p2align 4,,10
+ .p2align 3
+.L5:
+ mov %esi, %eax # i, i
+ movq %rdi, %rcx # y, b
+ leaq (%r11, %rax,8), %r9 #, D.2325
+ incl %esi # i
+ movq (%r12, %rax,8), %rax #* x, tmp133
+
+ mulq %rcx # b
+ addq (%r9), %rax #* D.2325, a
+ adcq $0,%rdx #
+ addq %r10, %rax # carry, a
+ adcq $0,%rdx #
+
+ cmpl %esi, %ebx # i, x_size
+ movq %rdx, %r10 #, carry
+ movq %rax, (%r9) # a,* D.2325
+ jne .L5 #,
+.L8:
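+   # add the leftover multiply carry into z_j[x_size]; if that
+   # overflows, ripple the increment up the remaining words of z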
+ movq -8(%rsp), %rdx # pretmp.21,
+ leaq (%r11,%rdx), %rax #, D.2332
+ movq (%rax), %rcx #* D.2332, D.2333
+ leaq (%r10,%rcx), %rdx #, z_sum
+ movq %rdx, (%rax) # z_sum,* D.2332
+ cmpq %rdx, %rcx # z_sum, D.2333
+ jbe .L9 #,
+ cmpl %ebp, %r15d # z_size, k.73
+ je .L9 #,
+ movl %r15d, %ecx # k.73, k
+ jmp .L10 #
+ .p2align 4,,10
+ .p2align 3
+.L31:
+ incl %ecx # k
+ cmpl %ecx, %ebp # k, z_size
+ .p2align 4,,4
+ .p2align 3
+ je .L9 #,
+.L10:
+ mov %ecx, %edx # k, k
+ leaq (%r11,%rdx,8), %rdx #, D.2342
+ movq (%rdx), %rax #* D.2342, tmp136
+ incq %rax # D.2344
+ movq %rax, (%rdx) # D.2344,* D.2342
+ testq %rax, %rax # D.2344
+ je .L31 #,
+.L9:
+ incl %r13d # j
+ decl %ebp # z_size
+ cmpl %r13d, %ebx # j, x_size
+ jne .L11 #,
+ movl %ebx, %esi # x_size, j.76
+.L3:
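+   # decide whether the final conditional subtraction is needed:
+   # subtract x if z[2*x_size] is nonzero, or if the top x_size
+   # words of z compare >= x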
+ leal (%rbx,%rbx), %eax #, tmp137
+ mov %eax, %eax
+ leaq (%r14, %rax,8), %rdi #, D.2349
+ cmpq $0, (%rdi) #,* D.2349
+ jne .L12 #,
+ testl %ebx, %ebx # x_size
+ je .L12 #,
+ leal -1(%rbx), %ecx #, j
+ leal (%rsi,%rcx), %edx #, tmp141
+ mov %ecx, %eax # j, j
+ movq (%r14,%rdx,8), %rbp #* z,
+ cmpq %rbp, (%r12, %rax,8) #,* x
+ jb .L12 #,
+ ja .L_EXIT #,
+ leal -2(%rsi,%rbx), %edx #, ivtmp.45
+ jmp .L14 #
+ .p2align 4,,10
+ .p2align 3
+.L15:
+ mov %edx, %eax # ivtmp.45, ivtmp.45
+ decl %ecx # j
+ movq (%r14, %rax,8), %rsi #* z, D.2360
+ mov %ecx, %eax # j, j
+ movq (%r12, %rax,8), %rax #* x, temp.68
+ cmpq %rax, %rsi
+ ja .L12 #,
+ decl %edx # ivtmp.45
+ cmpq %rax, %rsi
+ jb .L_EXIT #,
+.L14:
+ testl %ecx, %ecx # j
+ jne .L15 #,
+.L12:
+ xorl %ecx, %ecx # j
+ xorl %r10d, %r10d # carry
+ mov %ebx, %esi # x_size, pretmp.19
+ testl %r8d, %r8d # blocks_of_8
+ je .L17 #,
+ .p2align 4,,10
+ .p2align 3
+.L22:
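+   # subtract x from the high half of z, eight words per pass; the
+   # borrow travels between passes in the low bit of %r10 (rorq
+   # moves it into CF, sbbq/negq recapture it afterwards)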
+ mov %ecx, %edx # j, D.2375
+ addl $8, %ecx #, j
+ leaq (%rdx,%rsi), %rax #, tmp146
+ leaq (%r12,%rdx,8), %rdx #, tmp150
+ leaq (%r14, %rax,8), %rax #, tmp148
+
+ rorq %r10 # carry
+
+ movq 8*0(%rdx), %r10
+ sbbq %r10, 8*0(%rax)
+
+ movq 8*1(%rdx), %r10
+ sbbq %r10, 8*1(%rax)
+
+ movq 8*2(%rdx), %r10
+ sbbq %r10, 8*2(%rax)
+
+ movq 8*3(%rdx), %r10
+ sbbq %r10, 8*3(%rax)
+
+ movq 8*4(%rdx), %r10
+ sbbq %r10, 8*4(%rax)
+
+ movq 8*5(%rdx), %r10
+ sbbq %r10, 8*5(%rax)
+
+ movq 8*6(%rdx), %r10
+ sbbq %r10, 8*6(%rax)
+
+ movq 8*7(%rdx), %r10
+ sbbq %r10, 8*7(%rax)
+
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %ecx, %r8d # j, blocks_of_8
+ jne .L22 #,
+.L17:
+ cmpl %r8d, %ebx # blocks_of_8, x_size
+ je .L19 #,
+ leal (%r8,%rbx), %r9d #, ivtmp.33
+ movl %r8d, %esi # blocks_of_8, j
+ .p2align 4,,10
+ .p2align 3
+.L20:
+ mov %r9d, %eax # ivtmp.33, ivtmp.33
+ mov %esi, %ecx # j, j
+ leaq (%r14, %rax,8), %rax #, D.2387
+ incl %esi # j
+ movq (%rax), %rdx #* D.2387, tmp153
+ incl %r9d # ivtmp.33
+
+ rorq %r10 # carry
+ sbbq (%r12,%rcx,8),%rdx #* x, x
+ sbbq %r10,%r10 # carry
+ negq %r10 # carry
+
+ cmpl %esi, %ebx # j, x_size
+ movq %rdx, (%rax) # x,* D.2387
+ jne .L20 #,
+.L19:
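+   # if the subtraction borrowed out of the top word, propagate the
+   # borrow into z[2*x_size]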
+ testq %r10, %r10 # carry
+ je .L_EXIT #,
+ decq (%rdi) #* D.2349
+.L_EXIT:
+ popq %rbx #
+ popq %rbp #
+ popq %r12 #
+ popq %r13 #
+ popq %r14 #
+ popq %r15 #
+END_FUNCTION(bigint_monty_redc)
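For orientation, a hedged C-level sketch of the reduction implemented above, reconstructed from the assembly (it is not the library's mp_monty.cpp). word_madd3 is the multiply/add-with-carry primitive shown in the reference comment in mp_mulop_amd64.S below; the final conditional subtraction is left as a comment.

    void bigint_monty_redc(word z[], u32bit z_size,
                           const word x[], u32bit x_size, word u)
       {
       for(u32bit j = 0; j != x_size; ++j)
          {
          word* z_j = z + j;
          const word y = z_j[0] * u; // u is the precomputed Montgomery constant

          word carry = 0;
          for(u32bit i = 0; i != x_size; ++i)
             z_j[i] = word_madd3(x[i], y, z_j[i], &carry);

          // fold the carry into z_j[x_size]; on overflow, ripple an
          // increment up through the remaining words of z
          word z_sum = z_j[x_size] + carry;
          z_j[x_size] = z_sum;
          if(z_sum < carry)
             for(u32bit k = x_size + 1; k != z_size - j; ++k)
                if(++z_j[k])
                   break;
          }

       // if z[2*x_size] != 0 or the top x_size words of z are >= x,
       // subtract x once from the top of z
       }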
+
+
+#if 0
+#define Z_ARR    ARG_1     // rdi
+#define Z_SIZE   ARG_2_32  // esi
+// X_ARR is ARG_3 == rdx, moved because rdx is needed for the multiply
+#define X_SIZE   ARG_4_32  // ecx
+#define U        ARG_5     // r8
+
+/*
+  We need all the arguments for a while (we can reuse U eventually),
+  so the only temp registers are:
+     TEMP_1 %r10
+     TEMP_2 %r11
+     TEMP_3 = ARG_6 = %r9
+  The return type is void, so
+     R0 %rax (aka TEMP_9)
+  is also free (but needed for the multiply).
+
+  Can push:
+     %rbx (base pointer, callee saved)
+     %rbp (frame pointer, callee saved)
+     %r12-%r15 (callee saved)
+
+  We can push the base/frame pointers since this is a leaf function
+  and does not reference any data.
+*/
+
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+#define LOOP_CTR_I %r12
+#define LOOP_CTR_J %r13
+
+#define CARRY TEMP_1
+#define Z_WORD TEMP_2
+#define X_ARR TEMP_3
+#define MUL_LO %rax
+#define MUL_HI %rdx
+
+ ASSIGN(X_ARR, ARG_3)
+
+ /*
+ ZEROIZE(CARRY)
+
+ ASSIGN(LOOP_CTR, X_SIZE)
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+ JUMP_IF_LT(LOOP_CTR, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+ALIGN
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+ SUB_IMM(LOOP_CTR, 8)
+ ADD_IMM(Z_ARR, 64)
+ ADD_IMM(X_ARR, 64)
+ cmp IMM(8), LOOP_CTR
+ jge .LOOP_MULADD8
+
+ JUMP_IF_ZERO(LOOP_CTR, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+ SUB_IMM(LOOP_CTR, 1)
+ ADD_IMM(Z_ARR, 8)
+ ADD_IMM(X_ARR, 8)
+
+ cmp IMM(0), LOOP_CTR
+ jne .LOOP_MULADD1
+*/
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+#endif
diff --git a/src/bigint/asm_amd64/mp_mulop_amd64.S b/src/bigint/asm_amd64/mp_mulop_amd64.S
new file mode 100644
index 000000000..e5bba23fb
--- /dev/null
+++ b/src/bigint/asm_amd64/mp_mulop_amd64.S
@@ -0,0 +1,128 @@
+/*************************************************
+* Simple O(N^2) Multiplication and Squaring *
+* (C) 1999-2008 Jack Lloyd *
+*************************************************/
+
+#include <botan/asm_macr.h>
+
+START_LISTING(mp_mulop_amd64.S)
+
+#if 0
+// C reference for what this file implements. word_madd3 computes a
+// word-sized multiply/add with carry:
+//
+// word word_madd3(word a, word b, word c, word* d)
+//    {
+//    dword z = (dword)a * b + c + *d;
+//    *d = (word)(z >> BOTAN_MP_WORD_BITS);
+//    return (word)z;
+//    }
+
+void bigint_simple_sqr(word z[], const word x[], u32bit x_size)
+   {
+   const u32bit blocks = x_size - (x_size % 8);
+
+   clear_mem(z, 2*x_size);
+
+   for(u32bit i = 0; i != x_size; ++i)
+      {
+      word carry = 0;
+
+      /*
+      for(u32bit j = 0; j != blocks; j += 8)
+         carry = word8_madd3(z + i + j, x + j, x[i], carry);
+
+      for(u32bit j = blocks; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+      */
+
+      for(u32bit j = 0; j != x_size; ++j)
+         z[i+j] = word_madd3(x[j], x[i], z[i+j], &carry);
+
+      z[x_size+i] = carry;
+      }
+   }
+
+#endif
+
+START_FUNCTION(bigint_simple_sqr)
+
+#define Z_ARR     ARG_1
+#define X_ARR     ARG_2
+// X_SIZE arrives in ARG_3_32 (%edx) but is moved out of the way,
+// since mulq needs %rdx
+
+#define CARRY     TEMP_1
+#define Z_WORD    TEMP_2
+#define LOOP_I    TEMP_3
+#define LOOP_J    TEMP_4
+#define X_SIZE    TEMP_5
+#define X_SIZE_32 TEMP_5_32
+#define MUL_LO    %rax
+#define MUL_HI    %rdx
+
+// need arg3 == rdx free for the multiply, so move x_size first
+   ASSIGN(X_SIZE_32, ARG_3_32)
+
+   ZEROIZE(CARRY)
+
+   ZEROIZE(LOOP_I)
+
+// TODO: unfinished; this was meant to be clear_mem(z, 2*x_size), but
+// only the loop head exists so far
+.LOOP_ZEROIZE_Z:
+
+   cmp LOOP_I, X_SIZE
+
+// TODO: the outer loop over i (and loading y = x[i] for the multiply)
+// is not written yet; below is only the inner multiply/add kernel,
+// which counts LOOP_J down from x_size
+   ASSIGN(LOOP_J, X_SIZE)
+
+   JUMP_IF_ZERO(LOOP_J, .L_MULADD_DONE)
+   JUMP_IF_LT(LOOP_J, 8, .LOOP_MULADD1)
+
+#define MULADD_OP(N) \
+ ASSIGN(MUL_LO, ARRAY8(X_ARR, N)) ; \
+ ASSIGN(Z_WORD, ARRAY8(Z_ARR, N)) ; \
+ MUL(Y) ; \
+ ADD(Z_WORD, CARRY) ; \
+ ASSIGN(CARRY, MUL_HI) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ADD(Z_WORD, MUL_LO) ; \
+ ADD_LAST_CARRY(CARRY) ; \
+ ASSIGN(ARRAY8(Z_ARR, N), Z_WORD)
+
+.LOOP_MULADD8:
+ MULADD_OP(0)
+ MULADD_OP(1)
+ MULADD_OP(2)
+ MULADD_OP(3)
+ MULADD_OP(4)
+ MULADD_OP(5)
+ MULADD_OP(6)
+ MULADD_OP(7)
+
+   SUB_IMM(LOOP_J, 8)
+   ADD_IMM(Z_ARR, 64)
+   ADD_IMM(X_ARR, 64)
+   cmp IMM(8), LOOP_J
+   jge .LOOP_MULADD8
+
+   JUMP_IF_ZERO(LOOP_J, .L_MULADD_DONE)
+
+ALIGN
+.LOOP_MULADD1:
+ MULADD_OP(0)
+
+   SUB_IMM(LOOP_J, 1)
+   ADD_IMM(Z_ARR, 8)
+   ADD_IMM(X_ARR, 8)
+
+   cmp IMM(0), LOOP_J
+ jne .LOOP_MULADD1
+
+.L_MULADD_DONE:
+ RETURN_VALUE_IS(CARRY)
+END_FUNCTION(bigint_simple_sqr)