diff options
author | lloyd <[email protected]> | 2008-09-09 22:27:16 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2008-09-09 22:27:16 +0000 |
commit | 57e8e85be6342b9495eb7ba9a3d96e508b984418 (patch) | |
tree | 7b35bc430f56c9b5848fd2a3e09c667e709c1780 /src | |
parent | 456c1ef79167ef0cba5748842ca3748fa9ed7fc0 (diff) |
Add 16x16->32 word Comba multiply and square
Diffstat (limited to 'src')
-rw-r--r-- | src/mp_comba.cpp | 762 | ||||
-rw-r--r-- | src/mp_karat.cpp | 130 |
2 files changed, 716 insertions, 176 deletions
diff --git a/src/mp_comba.cpp b/src/mp_comba.cpp index 589a4037c..635a7547a 100644 --- a/src/mp_comba.cpp +++ b/src/mp_comba.cpp @@ -11,6 +11,39 @@ namespace Botan { extern "C" { /************************************************* +* Comba 4x4 Squaring * +*************************************************/ +void bigint_comba_sqr4(word z[8], const word x[4]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; + z[7] = w1; + } + +/************************************************* * Comba 4x4 Multiplication * *************************************************/ void bigint_comba_mul4(word z[8], const word x[4], const word y[4]) @@ -50,6 +83,58 @@ void bigint_comba_mul4(word z[8], const word x[4], const word y[4]) } /************************************************* +* Comba 6x6 Squaring * +*************************************************/ +void bigint_comba_sqr6(word z[12], const word x[6]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); + word3_muladd(&w2, &w1, &w0, x[4], x[4]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[5], x[5]); + z[10] = w0; + z[11] = w1; + } + +/************************************************* * Comba 6x6 Multiplication * *************************************************/ void bigint_comba_mul6(word z[12], const word x[6], const word y[6]) @@ -117,6 +202,81 @@ void bigint_comba_mul6(word z[12], const word x[6], const word y[6]) } /************************************************* +* Comba 8x8 Squaring * +*************************************************/ +void bigint_comba_sqr8(word z[16], const word x[8]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[0], x[0]); + z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + z[1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); + word3_muladd(&w2, &w1, &w0, x[1], x[1]); + z[2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + z[3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); + word3_muladd(&w2, &w1, &w0, x[2], x[2]); + z[4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + z[5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); + word3_muladd(&w2, &w1, &w0, x[3], x[3]); + z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[0], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[1], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); + z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[1], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[2], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); + word3_muladd(&w2, &w1, &w0, x[4], x[4]); + z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[2], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[3], x[6]); + word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); + z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[3], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[4], x[6]); + word3_muladd(&w2, &w1, &w0, x[5], x[5]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[4], x[7]); + word3_muladd_2(&w2, &w1, &w0, x[5], x[6]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[5], x[7]); + word3_muladd(&w2, &w1, &w0, x[6], x[6]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[6], x[7]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[7], x[7]); + z[14] = w0; + z[15] = w1; + } + +/************************************************* * Comba 8x8 Multiplication * *************************************************/ void bigint_comba_mul8(word z[16], const word x[8], const word y[8]) @@ -220,163 +380,539 @@ void bigint_comba_mul8(word z[16], const word x[8], const word y[8]) } /************************************************* -* Comba 4x4 Squaring * +* Comba 16x16 Squaring * *************************************************/ -void bigint_comba_sqr4(word z[8], const word x[4]) +void bigint_comba_sqr16(word z[32], const word x[16]) { - word w2 = 0, w1 = 0, w0 = 0; - - word3_muladd(&w2, &w1, &w0, x[0], x[0]); - z[0] = w0; w0 = w1; w1 = w2; w2 = 0; + return bigint_comba_mul16(z, x, x); - word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); - z[1] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); - word3_muladd(&w2, &w1, &w0, x[1], x[1]); - z[2] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); - z[3] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); - word3_muladd(&w2, &w1, &w0, x[2], x[2]); - z[4] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); - z[5] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd(&w2, &w1, &w0, x[3], x[3]); - z[6] = w0; - z[7] = w1; - } - -/************************************************* -* Comba 6x6 Squaring * -*************************************************/ -void bigint_comba_sqr6(word z[12], const word x[6]) - { word w2 = 0, w1 = 0, w0 = 0; - word3_muladd(&w2, &w1, &w0, x[0], x[0]); - z[0] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); - z[1] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); - word3_muladd(&w2, &w1, &w0, x[1], x[1]); - z[2] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); - z[3] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); - word3_muladd(&w2, &w1, &w0, x[2], x[2]); - z[4] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); - z[5] = w0; w0 = w1; w1 = w2; w2 = 0; - - word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); - word3_muladd(&w2, &w1, &w0, x[3], x[3]); - z[6] = w0; w0 = w1; w1 = w2; w2 = 0; + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 1]); + z[ 1] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 1], x[ 1]); + z[ 2] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 4]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 2], x[ 2]); + z[ 4] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 4]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 3]); + z[ 5] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 4]); + z[ 7] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 4], x[ 4]); + z[ 8] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 5], x[ 5]); + z[10] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); - z[7] = w0; w0 = w1; w1 = w2; w2 = 0; + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 6]); + z[11] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); - word3_muladd(&w2, &w1, &w0, x[4], x[4]); - z[8] = w0; w0 = w1; w1 = w2; w2 = 0; + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]); + z[12] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); - z[9] = w0; w0 = w1; w1 = w2; w2 = 0; + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 7]); + z[13] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd(&w2, &w1, &w0, x[5], x[5]); - z[10] = w0; - z[11] = w1; + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 7], x[ 7]); + z[14] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[ 9]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 8]); + z[15] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 8], x[ 8]); + z[16] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[10]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[ 9]); + z[17] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], x[ 9]); + z[18] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[11]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[10]); + z[19] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[11]); + word3_muladd(&w2, &w1, &w0, x[10], x[10]); + z[20] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 6], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[12]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[11]); + z[21] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[12]); + word3_muladd(&w2, &w1, &w0, x[11], x[11]); + z[22] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 8], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[13]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[12]); + z[23] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 9], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[10], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[13]); + word3_muladd(&w2, &w1, &w0, x[12], x[12]); + z[24] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[10], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[11], x[14]); + word3_muladd_2(&w2, &w1, &w0, x[12], x[13]); + z[25] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[11], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[12], x[14]); + word3_muladd(&w2, &w1, &w0, x[13], x[13]); + z[26] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[12], x[15]); + word3_muladd_2(&w2, &w1, &w0, x[13], x[14]); + z[27] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[13], x[15]); + word3_muladd(&w2, &w1, &w0, x[14], x[14]); + z[28] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[14], x[15]); + z[29] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], x[15]); + z[30] = w0; + z[31] = w1; } /************************************************* -* Comba 8x8 Squaring * +* Comba 16x16 Multiplication * *************************************************/ -void bigint_comba_sqr8(word z[16], const word x[8]) +void bigint_comba_mul16(word z[32], const word x[16], const word y[16]) { word w2 = 0, w1 = 0, w0 = 0; - word3_muladd(&w2, &w1, &w0, x[0], x[0]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); z[0] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[1]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 0]); z[1] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[2]); - word3_muladd(&w2, &w1, &w0, x[1], x[1]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 0]); z[2] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[3]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[2]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); z[3] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[4]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[3]); - word3_muladd(&w2, &w1, &w0, x[2], x[2]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 0]); z[4] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[4]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[3]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 0]); z[5] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[6]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[4]); - word3_muladd(&w2, &w1, &w0, x[3], x[3]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]); z[6] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[0], x[7]); - word3_muladd_2(&w2, &w1, &w0, x[1], x[6]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[5]); - word3_muladd_2(&w2, &w1, &w0, x[3], x[4]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 0]); z[7] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[1], x[7]); - word3_muladd_2(&w2, &w1, &w0, x[2], x[6]); - word3_muladd_2(&w2, &w1, &w0, x[3], x[5]); - word3_muladd(&w2, &w1, &w0, x[4], x[4]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 0]); z[8] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[2], x[7]); - word3_muladd_2(&w2, &w1, &w0, x[3], x[6]); - word3_muladd_2(&w2, &w1, &w0, x[4], x[5]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 0]); z[9] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[3], x[7]); - word3_muladd_2(&w2, &w1, &w0, x[4], x[6]); - word3_muladd(&w2, &w1, &w0, x[5], x[5]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 0]); z[10] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[4], x[7]); - word3_muladd_2(&w2, &w1, &w0, x[5], x[6]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 0]); z[11] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[5], x[7]); - word3_muladd(&w2, &w1, &w0, x[6], x[6]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 0]); z[12] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd_2(&w2, &w1, &w0, x[6], x[7]); + word3_muladd(&w2, &w1, &w0, x[ 0], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 0]); z[13] = w0; w0 = w1; w1 = w2; w2 = 0; - word3_muladd(&w2, &w1, &w0, x[7], x[7]); - z[14] = w0; - z[15] = w1; + word3_muladd(&w2, &w1, &w0, x[ 0], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 0]); + z[14] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 0]); + z[15] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 1], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 1]); + z[16] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 2], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 2]); + z[17] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 3], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[10]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 3]); + z[18] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 4], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[11]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[10]); + word3_muladd(&w2, &w1, &w0, x[10], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 4]); + z[19] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 5], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[12]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[11]); + word3_muladd(&w2, &w1, &w0, x[10], y[10]); + word3_muladd(&w2, &w1, &w0, x[11], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 5]); + z[20] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 6], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[13]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[12]); + word3_muladd(&w2, &w1, &w0, x[10], y[11]); + word3_muladd(&w2, &w1, &w0, x[11], y[10]); + word3_muladd(&w2, &w1, &w0, x[12], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 6]); + z[21] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 7], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[14]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[13]); + word3_muladd(&w2, &w1, &w0, x[10], y[12]); + word3_muladd(&w2, &w1, &w0, x[11], y[11]); + word3_muladd(&w2, &w1, &w0, x[12], y[10]); + word3_muladd(&w2, &w1, &w0, x[13], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 7]); + z[22] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 8], y[15]); + word3_muladd(&w2, &w1, &w0, x[ 9], y[14]); + word3_muladd(&w2, &w1, &w0, x[10], y[13]); + word3_muladd(&w2, &w1, &w0, x[11], y[12]); + word3_muladd(&w2, &w1, &w0, x[12], y[11]); + word3_muladd(&w2, &w1, &w0, x[13], y[10]); + word3_muladd(&w2, &w1, &w0, x[14], y[ 9]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 8]); + z[23] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 9], y[15]); + word3_muladd(&w2, &w1, &w0, x[10], y[14]); + word3_muladd(&w2, &w1, &w0, x[11], y[13]); + word3_muladd(&w2, &w1, &w0, x[12], y[12]); + word3_muladd(&w2, &w1, &w0, x[13], y[11]); + word3_muladd(&w2, &w1, &w0, x[14], y[10]); + word3_muladd(&w2, &w1, &w0, x[15], y[ 9]); + z[24] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[10], y[15]); + word3_muladd(&w2, &w1, &w0, x[11], y[14]); + word3_muladd(&w2, &w1, &w0, x[12], y[13]); + word3_muladd(&w2, &w1, &w0, x[13], y[12]); + word3_muladd(&w2, &w1, &w0, x[14], y[11]); + word3_muladd(&w2, &w1, &w0, x[15], y[10]); + z[25] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[11], y[15]); + word3_muladd(&w2, &w1, &w0, x[12], y[14]); + word3_muladd(&w2, &w1, &w0, x[13], y[13]); + word3_muladd(&w2, &w1, &w0, x[14], y[12]); + word3_muladd(&w2, &w1, &w0, x[15], y[11]); + z[26] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[12], y[15]); + word3_muladd(&w2, &w1, &w0, x[13], y[14]); + word3_muladd(&w2, &w1, &w0, x[14], y[13]); + word3_muladd(&w2, &w1, &w0, x[15], y[12]); + z[27] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[13], y[15]); + word3_muladd(&w2, &w1, &w0, x[14], y[14]); + word3_muladd(&w2, &w1, &w0, x[15], y[13]); + z[28] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[14], y[15]); + word3_muladd(&w2, &w1, &w0, x[15], y[14]); + z[29] = w0; w0 = w1; w1 = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[15], y[15]); + z[30] = w0; + z[31] = w1; } } diff --git a/src/mp_karat.cpp b/src/mp_karat.cpp index 770846b4e..15b0551fd 100644 --- a/src/mp_karat.cpp +++ b/src/mp_karat.cpp @@ -17,13 +17,13 @@ namespace { void karatsuba_mul(word z[], const word x[], const word y[], u32bit N, word workspace[]) { - const u32bit KARATSUBA_MUL_LOWER_SIZE = BOTAN_KARAT_MUL_THRESHOLD; - if(N == 6) bigint_comba_mul6(z, x, y); else if(N == 8) bigint_comba_mul8(z, x, y); - else if(N < KARATSUBA_MUL_LOWER_SIZE || N % 2) + else if(N == 16) + bigint_comba_mul16(z, x, y); + else if(N < BOTAN_KARAT_MUL_THRESHOLD || N % 2) bigint_simple_mul(z, x, N, y, N); else { @@ -96,13 +96,13 @@ void karatsuba_mul(word z[], const word x[], const word y[], u32bit N, *************************************************/ void karatsuba_sqr(word z[], const word x[], u32bit N, word workspace[]) { - const u32bit KARATSUBA_SQR_LOWER_SIZE = BOTAN_KARAT_SQR_THRESHOLD; - if(N == 6) bigint_comba_sqr6(z, x); else if(N == 8) bigint_comba_sqr8(z, x); - else if(N < KARATSUBA_SQR_LOWER_SIZE || N % 2) + else if(N == 16) + bigint_comba_sqr16(z, x); + else if(N < BOTAN_KARAT_SQR_THRESHOLD || N % 2) bigint_simple_sqr(z, x, N); else { @@ -234,97 +234,101 @@ u32bit karatsuba_size(u32bit z_size, u32bit x_size, u32bit x_sw) return 0; } +} + /************************************************* -* Handle small operand multiplies * +* Multiplication Algorithm Dispatcher * *************************************************/ -void handle_small_mul(word z[], u32bit z_size, - const word x[], u32bit x_size, u32bit x_sw, - const word y[], u32bit y_size, u32bit y_sw) +void bigint_mul(word z[], u32bit z_size, word workspace[], + const word x[], u32bit x_size, u32bit x_sw, + const word y[], u32bit y_size, u32bit y_sw) { - if(x_sw == 1) bigint_linmul3(z, y, y_sw, x[0]); - else if(y_sw == 1) bigint_linmul3(z, x, x_sw, y[0]); - + if(x_sw == 1) + { + bigint_linmul3(z, y, y_sw, x[0]); + } + else if(y_sw == 1) + { + bigint_linmul3(z, x, x_sw, y[0]); + } else if(x_sw <= 4 && x_size >= 4 && y_sw <= 4 && y_size >= 4 && z_size >= 8) + { bigint_comba_mul4(z, x, y); - + } else if(x_sw <= 6 && x_size >= 6 && y_sw <= 6 && y_size >= 6 && z_size >= 12) + { bigint_comba_mul6(z, x, y); - + } else if(x_sw <= 8 && x_size >= 8 && y_sw <= 8 && y_size >= 8 && z_size >= 16) + { bigint_comba_mul8(z, x, y); - - else + } + else if(x_sw <= 16 && x_size >= 16 && + y_sw <= 16 && y_size >= 16 && z_size >= 32) + { + bigint_comba_mul16(z, x, y); + } + else if(x_sw < BOTAN_KARAT_MUL_THRESHOLD || y_sw < BOTAN_KARAT_MUL_THRESHOLD) bigint_simple_mul(z, x, x_sw, y, y_sw); + else + { + const u32bit N = karatsuba_size(z_size, x_size, x_sw, y_size, y_sw); + + if(N) + { + clear_mem(workspace, 2*N); + karatsuba_mul(z, x, y, N, workspace); + } + else + bigint_simple_mul(z, x, x_sw, y, y_sw); + } } /************************************************* -* Handle small operand squarings * +* Squaring Algorithm Dispatcher * *************************************************/ -void handle_small_sqr(word z[], u32bit z_size, - const word x[], u32bit x_size, u32bit x_sw) +void bigint_sqr(word z[], u32bit z_size, word workspace[], + const word x[], u32bit x_size, u32bit x_sw) { if(x_sw == 1) + { bigint_linmul3(z, x, x_sw, x[0]); + } else if(x_sw <= 4 && x_size >= 4 && z_size >= 8) + { bigint_comba_sqr4(z, x); + } else if(x_sw <= 6 && x_size >= 6 && z_size >= 12) + { bigint_comba_sqr6(z, x); + } else if(x_sw <= 8 && x_size >= 8 && z_size >= 16) + { bigint_comba_sqr8(z, x); - else - bigint_simple_sqr(z, x, x_sw); - } - -} - -/************************************************* -* Multiplication Algorithm Dispatcher * -*************************************************/ -void bigint_mul(word z[], u32bit z_size, word workspace[], - const word x[], u32bit x_size, u32bit x_sw, - const word y[], u32bit y_size, u32bit y_sw) - { - if(x_size <= 8 || y_size <= 8) + } + else if(x_sw <= 16 && x_size >= 16 && z_size >= 32) { - handle_small_mul(z, z_size, x, x_size, x_sw, y, y_size, y_sw); - return; + bigint_comba_sqr16(z, x); } - - const u32bit N = karatsuba_size(z_size, x_size, x_sw, y_size, y_sw); - - if(N) + else if(x_size < BOTAN_KARAT_SQR_THRESHOLD) { - clear_mem(workspace, 2*N); - karatsuba_mul(z, x, y, N, workspace); + bigint_simple_sqr(z, x, x_sw); } else - bigint_simple_mul(z, x, x_sw, y, y_sw); - } - -/************************************************* -* Squaring Algorithm Dispatcher * -*************************************************/ -void bigint_sqr(word z[], u32bit z_size, word workspace[], - const word x[], u32bit x_size, u32bit x_sw) - { - if(x_size <= 8 || x_sw <= 8) { - handle_small_sqr(z, z_size, x, x_size, x_sw); - return; - } - - const u32bit N = karatsuba_size(z_size, x_size, x_sw); + const u32bit N = karatsuba_size(z_size, x_size, x_sw); - if(N) - { - clear_mem(workspace, 2*N); - karatsuba_sqr(z, x, N, workspace); + if(N) + { + clear_mem(workspace, 2*N); + karatsuba_sqr(z, x, N, workspace); + } + else + bigint_simple_sqr(z, x, x_sw); } - else - bigint_simple_sqr(z, x, x_sw); } } |