diff options
author | lloyd <[email protected]> | 2014-11-15 14:35:19 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2014-11-15 14:35:19 +0000 |
commit | 1518c30f1c90c2d0e5e06731e3dffe21353b34db (patch) | |
tree | c2f819f2a2011a7af6052ede3b32638412b546d0 /src/lib/math/mp | |
parent | 17349a1fc49d604f8160f2077538fdf397b702c6 (diff) |
Add specialized reduction for P-521 along with 9x9 Comba routines.
Roughly 35-50% faster on my laptop (depending on if mlock is enabled,
the overhead in that allocator is becoming much more of a hotspot).
Diffstat (limited to 'src/lib/math/mp')
-rw-r--r-- | src/lib/math/mp/mp_comba.cpp | 214 | ||||
-rw-r--r-- | src/lib/math/mp/mp_core.h | 2 | ||||
-rw-r--r-- | src/lib/math/mp/mp_karat.cpp | 9 |
3 files changed, 224 insertions, 1 deletions
diff --git a/src/lib/math/mp/mp_comba.cpp b/src/lib/math/mp/mp_comba.cpp index 99dcda176..b56a59291 100644 --- a/src/lib/math/mp/mp_comba.cpp +++ b/src/lib/math/mp/mp_comba.cpp @@ -1,6 +1,6 @@ /* * Comba Multiplication and Squaring -* (C) 1999-2007,2011 Jack Lloyd +* (C) 1999-2007,2011,2014 Jack Lloyd * * Distributed under the terms of the Botan license */ @@ -382,6 +382,218 @@ void bigint_comba_mul8(word z[16], const word x[8], const word y[8]) } /* +* Comba 9x9 Squaring +*/ +void bigint_comba_sqr9(word z[18], const word x[9]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]); + z[ 1] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]); + z[ 2] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]); + z[ 3] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 4]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]); + z[ 4] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 5]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 4]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]); + z[ 5] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]); + z[ 6] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 6]); + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 5]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 4]); + z[ 7] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 8]); + word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 6]); + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], x[ 4]); + z[ 8] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]); + word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]); + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]); + z[ 9] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 8]); + word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 7]); + word3_muladd_2(&w0, &w2, &w1, x[ 4], x[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], x[ 5]); + z[10] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 8]); + word3_muladd_2(&w1, &w0, &w2, x[ 4], x[ 7]); + word3_muladd_2(&w1, &w0, &w2, x[ 5], x[ 6]); + z[11] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 8]); + word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]); + z[12] = w0; w0 = 0; + + word3_muladd_2(&w0, &w2, &w1, x[ 5], x[ 8]); + word3_muladd_2(&w0, &w2, &w1, x[ 6], x[ 7]); + z[13] = w1; w1 = 0; + + word3_muladd_2(&w1, &w0, &w2, x[ 6], x[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 7], x[ 7]); + z[14] = w2; w2 = 0; + + word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 8]); + z[15] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 8], x[ 8]); + z[16] = w1; + z[17] = w2; + } + +/* +* Comba 9x9 Multiplication +*/ +void bigint_comba_mul9(word z[18], const word x[9], const word y[9]) + { + word w2 = 0, w1 = 0, w0 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]); + z[ 0] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]); + z[ 1] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]); + z[ 2] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]); + z[ 3] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 0]); + z[ 4] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 0]); + z[ 5] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]); + z[ 6] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 0], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 1], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 2]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 1]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 0]); + z[ 7] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 0], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 1], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 2], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 3]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 2]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 1]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 0]); + z[ 8] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 1], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 1]); + z[ 9] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 2], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[ 3], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 4], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 5]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 4]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 3]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[ 2]); + z[10] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 3], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 4], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 5], y[ 6]); + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 5]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 4]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 3]); + z[11] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 4], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]); + word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]); + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 4]); + z[12] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 5], y[ 8]); + word3_muladd(&w0, &w2, &w1, x[ 6], y[ 7]); + word3_muladd(&w0, &w2, &w1, x[ 7], y[ 6]); + word3_muladd(&w0, &w2, &w1, x[ 8], y[ 5]); + z[13] = w1; w1 = 0; + + word3_muladd(&w1, &w0, &w2, x[ 6], y[ 8]); + word3_muladd(&w1, &w0, &w2, x[ 7], y[ 7]); + word3_muladd(&w1, &w0, &w2, x[ 8], y[ 6]); + z[14] = w2; w2 = 0; + + word3_muladd(&w2, &w1, &w0, x[ 7], y[ 8]); + word3_muladd(&w2, &w1, &w0, x[ 8], y[ 7]); + z[15] = w0; w0 = 0; + + word3_muladd(&w0, &w2, &w1, x[ 8], y[ 8]); + z[16] = w1; + z[17] = w2; + } + +/* * Comba 16x16 Squaring */ void bigint_comba_sqr16(word z[32], const word x[16]) diff --git a/src/lib/math/mp/mp_core.h b/src/lib/math/mp/mp_core.h index c25cb994f..64b29b869 100644 --- a/src/lib/math/mp/mp_core.h +++ b/src/lib/math/mp/mp_core.h @@ -151,11 +151,13 @@ word bigint_modop(word n1, word n0, word d); void bigint_comba_mul4(word z[8], const word x[4], const word y[4]); void bigint_comba_mul6(word z[12], const word x[6], const word y[6]); void bigint_comba_mul8(word z[16], const word x[8], const word y[8]); +void bigint_comba_mul9(word z[18], const word x[9], const word y[9]); void bigint_comba_mul16(word z[32], const word x[16], const word y[16]); void bigint_comba_sqr4(word out[8], const word in[4]); void bigint_comba_sqr6(word out[12], const word in[6]); void bigint_comba_sqr8(word out[16], const word in[8]); +void bigint_comba_sqr9(word out[18], const word in[9]); void bigint_comba_sqr16(word out[32], const word in[16]); } diff --git a/src/lib/math/mp/mp_karat.cpp b/src/lib/math/mp/mp_karat.cpp index b549a05c8..62620f83d 100644 --- a/src/lib/math/mp/mp_karat.cpp +++ b/src/lib/math/mp/mp_karat.cpp @@ -237,6 +237,11 @@ void bigint_mul(word z[], size_t z_size, word workspace[], { bigint_comba_mul8(z, x, y); } + else if(x_sw <= 9 && x_size >= 9 && + y_sw <= 9 && y_size >= 9 && z_size >= 18) + { + bigint_comba_mul9(z, x, y); + } else if(x_sw <= 16 && x_size >= 16 && y_sw <= 16 && y_size >= 16 && z_size >= 32) { @@ -281,6 +286,10 @@ void bigint_sqr(word z[], size_t z_size, word workspace[], { bigint_comba_sqr8(z, x); } + else if(x_sw == 9 && x_size >= 9 && z_size >= 18) + { + bigint_comba_sqr9(z, x); + } else if(x_sw <= 16 && x_size >= 16 && z_size >= 32) { bigint_comba_sqr16(z, x); |