aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/math
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2018-03-02 17:24:03 -0500
committerJack Lloyd <[email protected]>2018-03-02 17:53:13 -0500
commit76ecd3fedb0476d92afc30c1c68c82b5eddbff71 (patch)
treed5b662cd9987e63b432619dc2ee78c34a992431b /src/lib/math
parent537833f1bef3757adb7d12b5f27a2fa1b8d994b1 (diff)
Implement product-scanning Montgomery reduction
Results in 10-20% improvement for DH and RSA, 5% for ECC curves that use Montgomery form.
Diffstat (limited to 'src/lib/math')
-rw-r--r--src/lib/math/mp/mp_asmi.h36
-rw-r--r--src/lib/math/mp/mp_monty.cpp82
2 files changed, 88 insertions, 30 deletions
diff --git a/src/lib/math/mp/mp_asmi.h b/src/lib/math/mp/mp_asmi.h
index a69d82a15..0cbce3053 100644
--- a/src/lib/math/mp/mp_asmi.h
+++ b/src/lib/math/mp/mp_asmi.h
@@ -750,6 +750,42 @@ inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y)
}
/*
+* 3-word addition
+* (w2,w1,w0) += x
+*/
+inline void word3_add(word* w2, word* w1, word* w0, word x)
+ {
+#if defined(BOTAN_MP_USE_X86_32_ASM)
+ asm(
+ ASM("addl %[x],%[w0]")
+ ASM("adcl $0,%[w1]")
+ ASM("adcl $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"r"(x), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#elif defined(BOTAN_MP_USE_X86_64_ASM)
+
+ asm(
+ ASM("addq %[x],%[w0]")
+ ASM("adcq $0,%[w1]")
+ ASM("adcq $0,%[w2]")
+
+ : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2)
+ : [x]"r"(x), "0"(*w0), "1"(*w1), "2"(*w2)
+ : "cc");
+
+#else
+ *w0 += x;
+ word c1 = (*w0 < x);
+ *w1 += c1;
+ word c2 = (*w1 < c1);
+ *w2 += c2;
+#endif
+ }
+
+/*
* Multiply-Add Accumulator
* (w2,w1,w0) += 2 * x * y
*/
diff --git a/src/lib/math/mp/mp_monty.cpp b/src/lib/math/mp/mp_monty.cpp
index e2f93a9a7..ba94d1528 100644
--- a/src/lib/math/mp/mp_monty.cpp
+++ b/src/lib/math/mp/mp_monty.cpp
@@ -8,7 +8,6 @@
*/
#include <botan/bigint.h>
-#include <botan/internal/mp_core.h>
#include <botan/internal/mp_madd.h>
#include <botan/internal/mp_asmi.h>
#include <botan/internal/ct_utils.h>
@@ -32,60 +31,83 @@ void bigint_monty_redc(word z[],
CT::poison(p, p_size);
CT::poison(ws, 2*(p_size+1));
- const size_t blocks_of_8 = p_size - (p_size % 8);
+ /*
+ Montgomery reduction - product scanning form
- for(size_t i = 0; i != p_size; ++i)
- {
- word* z_i = z + i;
+ https://www.iacr.org/archive/ches2005/006.pdf
+ https://eprint.iacr.org/2013/882.pdf
+ https://www.microsoft.com/en-us/research/wp-content/uploads/1996/01/j37acmon.pdf
+ */
+
+ word w2 = 0, w1 = 0, w0 = 0;
- const word y = z_i[0] * p_dash;
+ w0 = z[0];
- /*
- bigint_linmul3(ws, p, p_size, y);
- bigint_add2(z_i, z_size - i, ws, p_size+1);
- */
+ ws[0] = w0 * p_dash;
- word carry = 0;
+ word3_muladd(&w2, &w1, &w0, ws[0], p[0]);
+
+ w0 = w1;
+ w1 = w2;
+ w2 = 0;
+
+ for(size_t i = 1; i != p_size; ++i)
+ {
+ for(size_t j = 0; j < i; ++j)
+ {
+ word3_muladd(&w2, &w1, &w0, ws[j], p[i-j]);
+ }
- for(size_t j = 0; j != blocks_of_8; j += 8)
- carry = word8_madd3(z_i + j, p + j, y, carry);
+ word3_add(&w2, &w1, &w0, z[i]);
- for(size_t j = blocks_of_8; j != p_size; ++j)
- z_i[j] = word_madd3(p[j], y, z_i[j], &carry);
+ ws[i] = w0 * p_dash;
- word z_sum = z_i[p_size] + carry;
- carry = (z_sum < z_i[p_size]);
- z_i[p_size] = z_sum;
+ word3_muladd(&w2, &w1, &w0, ws[i], p[0]);
- for(size_t j = p_size + 1; j < z_size - i; ++j)
+ w0 = w1;
+ w1 = w2;
+ w2 = 0;
+ }
+
+ for(size_t i = p_size; i != 2*p_size; ++i)
+ {
+ for(size_t j = i - p_size + 1; j != p_size; ++j)
{
- z_i[j] += carry;
- carry = (z_i[j] < carry);
+ word3_muladd(&w2, &w1, &w0, ws[j], p[i-j]);
}
+
+ word3_add(&w2, &w1, &w0, z[i]);
+
+ ws[i-p_size] = w0;
+ w0 = w1;
+ w1 = w2;
+ w2 = 0;
}
+ word3_add(&w2, &w1, &w0, z[z_size-1]);
+
+ ws[p_size] = w0;
+ ws[p_size+1] = w1;
+
/*
* The result might need to be reduced mod p. To avoid a timing
* channel, always perform the subtraction. If in the compution
* of x - p a borrow is required then x was already < p.
*
- * x - p starts at ws[0] and is p_size+1 bytes long
- * x starts at ws[p_size+1] and is also p_size+1 bytes log
- * (that's the copy_mem)
+ * x starts at ws[0] and is p_size+1 bytes long.
+ * x - p starts at ws[p_size+1] and is also p_size+1 bytes log
*
* Select which address to copy from indexing off of the final
* borrow.
*/
+ // word borrow = bigint_sub3(ws + p_size + 1, ws, p_size + 1, p, p_size);
word borrow = 0;
for(size_t i = 0; i != p_size; ++i)
- ws[i] = word_sub(z[p_size + i], p[i], &borrow);
-
- ws[p_size] = word_sub(z[p_size+p_size], 0, &borrow);
-
- copy_mem(ws + p_size + 1, z + p_size, p_size + 1);
+ ws[p_size + 1 + i] = word_sub(ws[i], p[i], &borrow);
+ ws[2*p_size+1] = word_sub(ws[p_size], 0, &borrow);
- CT::conditional_copy_mem(borrow, z, ws + (p_size + 1), ws, (p_size + 1));
+ CT::conditional_copy_mem(borrow, z, ws, ws + (p_size + 1), (p_size + 1));
clear_mem(z + p_size + 1, z_size - p_size - 1);
CT::unpoison(z, z_size);