diff options
author | Jack Lloyd <[email protected]> | 2018-03-02 17:24:03 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2018-03-02 17:53:13 -0500 |
commit | 76ecd3fedb0476d92afc30c1c68c82b5eddbff71 (patch) | |
tree | d5b662cd9987e63b432619dc2ee78c34a992431b /src/lib/math | |
parent | 537833f1bef3757adb7d12b5f27a2fa1b8d994b1 (diff) |
Implement product-scanning Montgomery reduction
Results in 10-20% improvement for DH and RSA, 5% for ECC curves
that use Montgomery form.
Diffstat (limited to 'src/lib/math')
-rw-r--r-- | src/lib/math/mp/mp_asmi.h | 36 | ||||
-rw-r--r-- | src/lib/math/mp/mp_monty.cpp | 82 |
2 files changed, 88 insertions, 30 deletions
diff --git a/src/lib/math/mp/mp_asmi.h b/src/lib/math/mp/mp_asmi.h index a69d82a15..0cbce3053 100644 --- a/src/lib/math/mp/mp_asmi.h +++ b/src/lib/math/mp/mp_asmi.h @@ -750,6 +750,42 @@ inline void word3_muladd(word* w2, word* w1, word* w0, word x, word y) } /* +* 3-word addition +* (w2,w1,w0) += x +*/ +inline void word3_add(word* w2, word* w1, word* w0, word x) + { +#if defined(BOTAN_MP_USE_X86_32_ASM) + asm( + ASM("addl %[x],%[w0]") + ASM("adcl $0,%[w1]") + ASM("adcl $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"r"(x), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + +#elif defined(BOTAN_MP_USE_X86_64_ASM) + + asm( + ASM("addq %[x],%[w0]") + ASM("adcq $0,%[w1]") + ASM("adcq $0,%[w2]") + + : [w0]"=r"(*w0), [w1]"=r"(*w1), [w2]"=r"(*w2) + : [x]"r"(x), "0"(*w0), "1"(*w1), "2"(*w2) + : "cc"); + +#else + *w0 += x; + word c1 = (*w0 < x); + *w1 += c1; + word c2 = (*w1 < c1); + *w2 += c2; +#endif + } + +/* * Multiply-Add Accumulator * (w2,w1,w0) += 2 * x * y */ diff --git a/src/lib/math/mp/mp_monty.cpp b/src/lib/math/mp/mp_monty.cpp index e2f93a9a7..ba94d1528 100644 --- a/src/lib/math/mp/mp_monty.cpp +++ b/src/lib/math/mp/mp_monty.cpp @@ -8,7 +8,6 @@ */ #include <botan/bigint.h> -#include <botan/internal/mp_core.h> #include <botan/internal/mp_madd.h> #include <botan/internal/mp_asmi.h> #include <botan/internal/ct_utils.h> @@ -32,60 +31,83 @@ void bigint_monty_redc(word z[], CT::poison(p, p_size); CT::poison(ws, 2*(p_size+1)); - const size_t blocks_of_8 = p_size - (p_size % 8); + /* + Montgomery reduction - product scanning form - for(size_t i = 0; i != p_size; ++i) - { - word* z_i = z + i; + https://www.iacr.org/archive/ches2005/006.pdf + https://eprint.iacr.org/2013/882.pdf + https://www.microsoft.com/en-us/research/wp-content/uploads/1996/01/j37acmon.pdf + */ + + word w2 = 0, w1 = 0, w0 = 0; - const word y = z_i[0] * p_dash; + w0 = z[0]; - /* - bigint_linmul3(ws, p, p_size, y); - bigint_add2(z_i, z_size - i, ws, p_size+1); - */ + ws[0] = w0 * p_dash; - word carry = 0; + word3_muladd(&w2, &w1, &w0, ws[0], p[0]); + + w0 = w1; + w1 = w2; + w2 = 0; + + for(size_t i = 1; i != p_size; ++i) + { + for(size_t j = 0; j < i; ++j) + { + word3_muladd(&w2, &w1, &w0, ws[j], p[i-j]); + } - for(size_t j = 0; j != blocks_of_8; j += 8) - carry = word8_madd3(z_i + j, p + j, y, carry); + word3_add(&w2, &w1, &w0, z[i]); - for(size_t j = blocks_of_8; j != p_size; ++j) - z_i[j] = word_madd3(p[j], y, z_i[j], &carry); + ws[i] = w0 * p_dash; - word z_sum = z_i[p_size] + carry; - carry = (z_sum < z_i[p_size]); - z_i[p_size] = z_sum; + word3_muladd(&w2, &w1, &w0, ws[i], p[0]); - for(size_t j = p_size + 1; j < z_size - i; ++j) + w0 = w1; + w1 = w2; + w2 = 0; + } + + for(size_t i = p_size; i != 2*p_size; ++i) + { + for(size_t j = i - p_size + 1; j != p_size; ++j) { - z_i[j] += carry; - carry = (z_i[j] < carry); + word3_muladd(&w2, &w1, &w0, ws[j], p[i-j]); } + + word3_add(&w2, &w1, &w0, z[i]); + + ws[i-p_size] = w0; + w0 = w1; + w1 = w2; + w2 = 0; } + word3_add(&w2, &w1, &w0, z[z_size-1]); + + ws[p_size] = w0; + ws[p_size+1] = w1; + /* * The result might need to be reduced mod p. To avoid a timing * channel, always perform the subtraction. If in the compution * of x - p a borrow is required then x was already < p. * - * x - p starts at ws[0] and is p_size+1 bytes long - * x starts at ws[p_size+1] and is also p_size+1 bytes log - * (that's the copy_mem) + * x starts at ws[0] and is p_size+1 bytes long. + * x - p starts at ws[p_size+1] and is also p_size+1 bytes log * * Select which address to copy from indexing off of the final * borrow. */ + // word borrow = bigint_sub3(ws + p_size + 1, ws, p_size + 1, p, p_size); word borrow = 0; for(size_t i = 0; i != p_size; ++i) - ws[i] = word_sub(z[p_size + i], p[i], &borrow); - - ws[p_size] = word_sub(z[p_size+p_size], 0, &borrow); - - copy_mem(ws + p_size + 1, z + p_size, p_size + 1); + ws[p_size + 1 + i] = word_sub(ws[i], p[i], &borrow); + ws[2*p_size+1] = word_sub(ws[p_size], 0, &borrow); - CT::conditional_copy_mem(borrow, z, ws + (p_size + 1), ws, (p_size + 1)); + CT::conditional_copy_mem(borrow, z, ws, ws + (p_size + 1), (p_size + 1)); clear_mem(z + p_size + 1, z_size - p_size - 1); CT::unpoison(z, z_size); |