aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/mac/poly1305/poly1305.cpp
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2018-09-20 18:17:53 -0400
committer Jack Lloyd <[email protected]> 2018-09-20 18:18:34 -0400
commit 649c9301a7f48a7d28d70e693f1187ca3d8398f5 (patch)
tree 48d50a1dc724a70ff58bd4a39b7a8dd64f758a8e /src/lib/mac/poly1305/poly1305.cpp
parent 4cffc7c057716d8ee3eac500a323cbab97084bc8 (diff)
Optimization for Poly1305
Rearranging this code seems to let both GCC and Clang do a little better on the core loop: a 4-7% speedup, depending on buffer size, on my i7-6700k.
Diffstat (limited to 'src/lib/mac/poly1305/poly1305.cpp')
-rw-r--r-- src/lib/mac/poly1305/poly1305.cpp 41
1 files changed, 24 insertions, 17 deletions
diff --git a/src/lib/mac/poly1305/poly1305.cpp b/src/lib/mac/poly1305/poly1305.cpp
index b91222092..bdda3720c 100644
--- a/src/lib/mac/poly1305/poly1305.cpp
+++ b/src/lib/mac/poly1305/poly1305.cpp
@@ -49,6 +49,9 @@ void poly1305_blocks(secure_vector<uint64_t>& X, const uint8_t *m, size_t blocks
const uint64_t r1 = X[1];
const uint64_t r2 = X[2];
+ const uint64_t M44 = 0xfffffffffff;
+ const uint64_t M42 = 0x3ffffffffff;
+
uint64_t h0 = X[3+0];
uint64_t h1 = X[3+1];
uint64_t h2 = X[3+2];
@@ -56,27 +59,31 @@ void poly1305_blocks(secure_vector<uint64_t>& X, const uint8_t *m, size_t blocks
const uint64_t s1 = r1 * (5 << 2);
const uint64_t s2 = r2 * (5 << 2);
- while(blocks--)
+ for(size_t i = 0; i != blocks; ++i)
{
- /* h += m[i] */
const uint64_t t0 = load_le<uint64_t>(m, 0);
const uint64_t t1 = load_le<uint64_t>(m, 1);
- h0 += (( t0 ) & 0xfffffffffff);
- h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
- h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
-
- /* h *= r */
- uint128_t d0 = uint128_t(h0) * r0 + uint128_t(h1) * s2 + uint128_t(h2) * s1;
- uint128_t d1 = uint128_t(h0) * r1 + uint128_t(h1) * r0 + uint128_t(h2) * s2;
- uint128_t d2 = uint128_t(h0) * r2 + uint128_t(h1) * r1 + uint128_t(h2) * r0;
-
- /* (partial) h %= p */
- uint64_t c = carry_shift(d0, 44); h0 = d0 & 0xfffffffffff;
- d1 += c; c = carry_shift(d1, 44); h1 = d1 & 0xfffffffffff;
- d2 += c; c = carry_shift(d2, 42); h2 = d2 & 0x3ffffffffff;
- h0 += c * 5; c = carry_shift(h0, 44); h0 = h0 & 0xfffffffffff;
- h1 += c;
+ h0 += (( t0 ) & M44);
+ h1 += (((t0 >> 44) | (t1 << 20)) & M44);
+ h2 += (((t1 >> 24) ) & M42) | hibit;
+
+ const uint128_t d0 = uint128_t(h0) * r0 + uint128_t(h1) * s2 + uint128_t(h2) * s1;
+ const uint64_t c0 = carry_shift(d0, 44);
+
+ const uint128_t d1 = uint128_t(h0) * r1 + uint128_t(h1) * r0 + uint128_t(h2) * s2 + c0;
+ const uint64_t c1 = carry_shift(d1, 44);
+
+ const uint128_t d2 = uint128_t(h0) * r2 + uint128_t(h1) * r1 + uint128_t(h2) * r0 + c1;
+ const uint64_t c2 = carry_shift(d2, 42);
+
+ h0 = d0 & M44;
+ h1 = d1 & M44;
+ h2 = d2 & M42;
+
+ h0 += c2 * 5;
+ h1 += carry_shift(h0, 44);
+ h0 = h0 & M44;
m += 16;
}