aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/math/mp
diff options
context:
space:
mode:
authorlloyd <[email protected]>2014-11-15 14:35:19 +0000
committerlloyd <[email protected]>2014-11-15 14:35:19 +0000
commit1518c30f1c90c2d0e5e06731e3dffe21353b34db (patch)
treec2f819f2a2011a7af6052ede3b32638412b546d0 /src/lib/math/mp
parent17349a1fc49d604f8160f2077538fdf397b702c6 (diff)
Add specialized reduction for P-521 along with 9x9 Comba routines.
Roughly 35-50% faster on my laptop (depending on if mlock is enabled, the overhead in that allocator is becoming much more of a hotspot).
Diffstat (limited to 'src/lib/math/mp')
-rw-r--r--src/lib/math/mp/mp_comba.cpp214
-rw-r--r--src/lib/math/mp/mp_core.h2
-rw-r--r--src/lib/math/mp/mp_karat.cpp9
3 files changed, 224 insertions, 1 deletions
diff --git a/src/lib/math/mp/mp_comba.cpp b/src/lib/math/mp/mp_comba.cpp
index 99dcda176..b56a59291 100644
--- a/src/lib/math/mp/mp_comba.cpp
+++ b/src/lib/math/mp/mp_comba.cpp
@@ -1,6 +1,6 @@
/*
* Comba Multiplication and Squaring
-* (C) 1999-2007,2011 Jack Lloyd
+* (C) 1999-2007,2011,2014 Jack Lloyd
*
* Distributed under the terms of the Botan license
*/
@@ -382,6 +382,218 @@ void bigint_comba_mul8(word z[16], const word x[8], const word y[8])
}
/*
+* Comba 9x9 Squaring
+*/
+void bigint_comba_sqr9(word z[18], const word x[9])
+ {
+ word w2 = 0, w1 = 0, w0 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 0], x[ 0]);
+ z[ 0] = w0; w0 = 0;
+
+ word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 1]);
+ z[ 1] = w1; w1 = 0;
+
+ word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 2]);
+ word3_muladd(&w1, &w0, &w2, x[ 1], x[ 1]);
+ z[ 2] = w2; w2 = 0;
+
+ word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 3]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 2]);
+ z[ 3] = w0; w0 = 0;
+
+ word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 4]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 3]);
+ word3_muladd(&w0, &w2, &w1, x[ 2], x[ 2]);
+ z[ 4] = w1; w1 = 0;
+
+ word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 5]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 4]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 3]);
+ z[ 5] = w2; w2 = 0;
+
+ word3_muladd_2(&w2, &w1, &w0, x[ 0], x[ 6]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 5]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 4]);
+ word3_muladd(&w2, &w1, &w0, x[ 3], x[ 3]);
+ z[ 6] = w0; w0 = 0;
+
+ word3_muladd_2(&w0, &w2, &w1, x[ 0], x[ 7]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 1], x[ 6]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 5]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 4]);
+ z[ 7] = w1; w1 = 0;
+
+ word3_muladd_2(&w1, &w0, &w2, x[ 0], x[ 8]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 1], x[ 7]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 2], x[ 6]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 5]);
+ word3_muladd(&w1, &w0, &w2, x[ 4], x[ 4]);
+ z[ 8] = w2; w2 = 0;
+
+ word3_muladd_2(&w2, &w1, &w0, x[ 1], x[ 8]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 2], x[ 7]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 3], x[ 6]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 5]);
+ z[ 9] = w0; w0 = 0;
+
+ word3_muladd_2(&w0, &w2, &w1, x[ 2], x[ 8]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 3], x[ 7]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 4], x[ 6]);
+ word3_muladd(&w0, &w2, &w1, x[ 5], x[ 5]);
+ z[10] = w1; w1 = 0;
+
+ word3_muladd_2(&w1, &w0, &w2, x[ 3], x[ 8]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 4], x[ 7]);
+ word3_muladd_2(&w1, &w0, &w2, x[ 5], x[ 6]);
+ z[11] = w2; w2 = 0;
+
+ word3_muladd_2(&w2, &w1, &w0, x[ 4], x[ 8]);
+ word3_muladd_2(&w2, &w1, &w0, x[ 5], x[ 7]);
+ word3_muladd(&w2, &w1, &w0, x[ 6], x[ 6]);
+ z[12] = w0; w0 = 0;
+
+ word3_muladd_2(&w0, &w2, &w1, x[ 5], x[ 8]);
+ word3_muladd_2(&w0, &w2, &w1, x[ 6], x[ 7]);
+ z[13] = w1; w1 = 0;
+
+ word3_muladd_2(&w1, &w0, &w2, x[ 6], x[ 8]);
+ word3_muladd(&w1, &w0, &w2, x[ 7], x[ 7]);
+ z[14] = w2; w2 = 0;
+
+ word3_muladd_2(&w2, &w1, &w0, x[ 7], x[ 8]);
+ z[15] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 8], x[ 8]);
+ z[16] = w1;
+ z[17] = w2;
+ }
+
+/*
+* Comba 9x9 Multiplication
+*/
+void bigint_comba_mul9(word z[18], const word x[9], const word y[9])
+ {
+ word w2 = 0, w1 = 0, w0 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 0], y[ 0]);
+ z[ 0] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 0], y[ 1]);
+ word3_muladd(&w0, &w2, &w1, x[ 1], y[ 0]);
+ z[ 1] = w1; w1 = 0;
+
+ word3_muladd(&w1, &w0, &w2, x[ 0], y[ 2]);
+ word3_muladd(&w1, &w0, &w2, x[ 1], y[ 1]);
+ word3_muladd(&w1, &w0, &w2, x[ 2], y[ 0]);
+ z[ 2] = w2; w2 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 0], y[ 3]);
+ word3_muladd(&w2, &w1, &w0, x[ 1], y[ 2]);
+ word3_muladd(&w2, &w1, &w0, x[ 2], y[ 1]);
+ word3_muladd(&w2, &w1, &w0, x[ 3], y[ 0]);
+ z[ 3] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 0], y[ 4]);
+ word3_muladd(&w0, &w2, &w1, x[ 1], y[ 3]);
+ word3_muladd(&w0, &w2, &w1, x[ 2], y[ 2]);
+ word3_muladd(&w0, &w2, &w1, x[ 3], y[ 1]);
+ word3_muladd(&w0, &w2, &w1, x[ 4], y[ 0]);
+ z[ 4] = w1; w1 = 0;
+
+ word3_muladd(&w1, &w0, &w2, x[ 0], y[ 5]);
+ word3_muladd(&w1, &w0, &w2, x[ 1], y[ 4]);
+ word3_muladd(&w1, &w0, &w2, x[ 2], y[ 3]);
+ word3_muladd(&w1, &w0, &w2, x[ 3], y[ 2]);
+ word3_muladd(&w1, &w0, &w2, x[ 4], y[ 1]);
+ word3_muladd(&w1, &w0, &w2, x[ 5], y[ 0]);
+ z[ 5] = w2; w2 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 0], y[ 6]);
+ word3_muladd(&w2, &w1, &w0, x[ 1], y[ 5]);
+ word3_muladd(&w2, &w1, &w0, x[ 2], y[ 4]);
+ word3_muladd(&w2, &w1, &w0, x[ 3], y[ 3]);
+ word3_muladd(&w2, &w1, &w0, x[ 4], y[ 2]);
+ word3_muladd(&w2, &w1, &w0, x[ 5], y[ 1]);
+ word3_muladd(&w2, &w1, &w0, x[ 6], y[ 0]);
+ z[ 6] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 0], y[ 7]);
+ word3_muladd(&w0, &w2, &w1, x[ 1], y[ 6]);
+ word3_muladd(&w0, &w2, &w1, x[ 2], y[ 5]);
+ word3_muladd(&w0, &w2, &w1, x[ 3], y[ 4]);
+ word3_muladd(&w0, &w2, &w1, x[ 4], y[ 3]);
+ word3_muladd(&w0, &w2, &w1, x[ 5], y[ 2]);
+ word3_muladd(&w0, &w2, &w1, x[ 6], y[ 1]);
+ word3_muladd(&w0, &w2, &w1, x[ 7], y[ 0]);
+ z[ 7] = w1; w1 = 0;
+
+ word3_muladd(&w1, &w0, &w2, x[ 0], y[ 8]);
+ word3_muladd(&w1, &w0, &w2, x[ 1], y[ 7]);
+ word3_muladd(&w1, &w0, &w2, x[ 2], y[ 6]);
+ word3_muladd(&w1, &w0, &w2, x[ 3], y[ 5]);
+ word3_muladd(&w1, &w0, &w2, x[ 4], y[ 4]);
+ word3_muladd(&w1, &w0, &w2, x[ 5], y[ 3]);
+ word3_muladd(&w1, &w0, &w2, x[ 6], y[ 2]);
+ word3_muladd(&w1, &w0, &w2, x[ 7], y[ 1]);
+ word3_muladd(&w1, &w0, &w2, x[ 8], y[ 0]);
+ z[ 8] = w2; w2 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 1], y[ 8]);
+ word3_muladd(&w2, &w1, &w0, x[ 2], y[ 7]);
+ word3_muladd(&w2, &w1, &w0, x[ 3], y[ 6]);
+ word3_muladd(&w2, &w1, &w0, x[ 4], y[ 5]);
+ word3_muladd(&w2, &w1, &w0, x[ 5], y[ 4]);
+ word3_muladd(&w2, &w1, &w0, x[ 6], y[ 3]);
+ word3_muladd(&w2, &w1, &w0, x[ 7], y[ 2]);
+ word3_muladd(&w2, &w1, &w0, x[ 8], y[ 1]);
+ z[ 9] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 2], y[ 8]);
+ word3_muladd(&w0, &w2, &w1, x[ 3], y[ 7]);
+ word3_muladd(&w0, &w2, &w1, x[ 4], y[ 6]);
+ word3_muladd(&w0, &w2, &w1, x[ 5], y[ 5]);
+ word3_muladd(&w0, &w2, &w1, x[ 6], y[ 4]);
+ word3_muladd(&w0, &w2, &w1, x[ 7], y[ 3]);
+ word3_muladd(&w0, &w2, &w1, x[ 8], y[ 2]);
+ z[10] = w1; w1 = 0;
+
+ word3_muladd(&w1, &w0, &w2, x[ 3], y[ 8]);
+ word3_muladd(&w1, &w0, &w2, x[ 4], y[ 7]);
+ word3_muladd(&w1, &w0, &w2, x[ 5], y[ 6]);
+ word3_muladd(&w1, &w0, &w2, x[ 6], y[ 5]);
+ word3_muladd(&w1, &w0, &w2, x[ 7], y[ 4]);
+ word3_muladd(&w1, &w0, &w2, x[ 8], y[ 3]);
+ z[11] = w2; w2 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 4], y[ 8]);
+ word3_muladd(&w2, &w1, &w0, x[ 5], y[ 7]);
+ word3_muladd(&w2, &w1, &w0, x[ 6], y[ 6]);
+ word3_muladd(&w2, &w1, &w0, x[ 7], y[ 5]);
+ word3_muladd(&w2, &w1, &w0, x[ 8], y[ 4]);
+ z[12] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 5], y[ 8]);
+ word3_muladd(&w0, &w2, &w1, x[ 6], y[ 7]);
+ word3_muladd(&w0, &w2, &w1, x[ 7], y[ 6]);
+ word3_muladd(&w0, &w2, &w1, x[ 8], y[ 5]);
+ z[13] = w1; w1 = 0;
+
+ word3_muladd(&w1, &w0, &w2, x[ 6], y[ 8]);
+ word3_muladd(&w1, &w0, &w2, x[ 7], y[ 7]);
+ word3_muladd(&w1, &w0, &w2, x[ 8], y[ 6]);
+ z[14] = w2; w2 = 0;
+
+ word3_muladd(&w2, &w1, &w0, x[ 7], y[ 8]);
+ word3_muladd(&w2, &w1, &w0, x[ 8], y[ 7]);
+ z[15] = w0; w0 = 0;
+
+ word3_muladd(&w0, &w2, &w1, x[ 8], y[ 8]);
+ z[16] = w1;
+ z[17] = w2;
+ }
+
+/*
* Comba 16x16 Squaring
*/
void bigint_comba_sqr16(word z[32], const word x[16])
diff --git a/src/lib/math/mp/mp_core.h b/src/lib/math/mp/mp_core.h
index c25cb994f..64b29b869 100644
--- a/src/lib/math/mp/mp_core.h
+++ b/src/lib/math/mp/mp_core.h
@@ -151,11 +151,13 @@ word bigint_modop(word n1, word n0, word d);
void bigint_comba_mul4(word z[8], const word x[4], const word y[4]);
void bigint_comba_mul6(word z[12], const word x[6], const word y[6]);
void bigint_comba_mul8(word z[16], const word x[8], const word y[8]);
+void bigint_comba_mul9(word z[18], const word x[9], const word y[9]);
void bigint_comba_mul16(word z[32], const word x[16], const word y[16]);
void bigint_comba_sqr4(word out[8], const word in[4]);
void bigint_comba_sqr6(word out[12], const word in[6]);
void bigint_comba_sqr8(word out[16], const word in[8]);
+void bigint_comba_sqr9(word out[18], const word in[9]);
void bigint_comba_sqr16(word out[32], const word in[16]);
}
diff --git a/src/lib/math/mp/mp_karat.cpp b/src/lib/math/mp/mp_karat.cpp
index b549a05c8..62620f83d 100644
--- a/src/lib/math/mp/mp_karat.cpp
+++ b/src/lib/math/mp/mp_karat.cpp
@@ -237,6 +237,11 @@ void bigint_mul(word z[], size_t z_size, word workspace[],
{
bigint_comba_mul8(z, x, y);
}
+ else if(x_sw <= 9 && x_size >= 9 &&
+ y_sw <= 9 && y_size >= 9 && z_size >= 18)
+ {
+ bigint_comba_mul9(z, x, y);
+ }
else if(x_sw <= 16 && x_size >= 16 &&
y_sw <= 16 && y_size >= 16 && z_size >= 32)
{
@@ -281,6 +286,10 @@ void bigint_sqr(word z[], size_t z_size, word workspace[],
{
bigint_comba_sqr8(z, x);
}
+ else if(x_sw == 9 && x_size >= 9 && z_size >= 18)
+ {
+ bigint_comba_sqr9(z, x);
+ }
else if(x_sw <= 16 && x_size >= 16 && z_size >= 32)
{
bigint_comba_sqr16(z, x);