Diffstat (limited to 'src/lib')
-rw-r--r--  src/lib/pubkey/curve25519/donna.cpp  821
-rw-r--r--  src/lib/utils/donna128.h               4
2 files changed, 414 insertions, 411 deletions
diff --git a/src/lib/pubkey/curve25519/donna.cpp b/src/lib/pubkey/curve25519/donna.cpp
index 6807d56f6..4305ea143 100644
--- a/src/lib/pubkey/curve25519/donna.cpp
+++ b/src/lib/pubkey/curve25519/donna.cpp
@@ -1,462 +1,465 @@
 /*
-* curve25519-donna-c64.c from github.com/agl/curve25519-donna
+* Based on curve25519-donna-c64.c from github.com/agl/curve25519-donna
 * revision 80ad9b9930c9baef5829dd2a235b6b7646d32a8e
+*
+* Further changes
+* (C) 2014,2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
 */

 /* Copyright 2008, Google Inc.
- * All rights reserved.
- *
- * Code released into the public domain.
- *
- * curve25519-donna: Curve25519 elliptic curve, public key function
- *
- * https://code.google.com/p/curve25519-donna/
- *
- * Adam Langley <[email protected]>
- *
- * Derived from public domain C code by Daniel J. Bernstein <[email protected]>
- *
- * More information about curve25519 can be found here
- * https://cr.yp.to/ecdh.html
- *
- * djb's sample implementation of curve25519 is written in a special assembly
- * language called qhasm and uses the floating point registers.
- *
- * This is, almost, a clean room reimplementation from the curve25519 paper. It
- * uses many of the tricks described therein. Only the crecip function is taken
- * from the sample implementation.
- */
+* All rights reserved.
+*
+* Code released into the public domain.
+*
+* curve25519-donna: Curve25519 elliptic curve, public key function
+*
+* https://code.google.com/p/curve25519-donna/
+*
+* Adam Langley <[email protected]>
+*
+* Derived from public domain C code by Daniel J. Bernstein <[email protected]>
+*
+* More information about curve25519 can be found here
+* https://cr.yp.to/ecdh.html
+*
+* djb's sample implementation of curve25519 is written in a special assembly
+* language called qhasm and uses the floating point registers.
+*
+* This is, almost, a clean room reimplementation from the curve25519 paper. It
+* uses many of the tricks described therein. Only the crecip function is taken
+* from the sample implementation.
+*/

 #include <botan/curve25519.h>
 #include <botan/mul128.h>
-#include <botan/internal/donna128.h>
 #include <botan/internal/ct_utils.h>
+#include <botan/internal/donna128.h>
 #include <botan/loadstor.h>

 namespace Botan {

-typedef uint8_t u8;
-typedef uint64_t limb;
-typedef limb felem[5];
-
-typedef struct
-  {
-  limb* x;
-  limb* z;
-  } fmonty_pair_t;
-
-typedef struct
-  {
-  fmonty_pair_t q;
-  fmonty_pair_t q_dash;
-  const limb* q_minus_q_dash;
-  } fmonty_in_t;
-
-typedef struct
-  {
-  fmonty_pair_t two_q;
-  fmonty_pair_t q_plus_q_dash;
-  } fmonty_out_t;
-
+namespace {

 #if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
 typedef donna128 uint128_t;
 #endif

-/* Sum two numbers: output += in */
-static inline void
-fsum(limb *output, const limb *in) {
-  output[0] += in[0];
-  output[1] += in[1];
-  output[2] += in[2];
-  output[3] += in[3];
-  output[4] += in[4];
-}
-
-/* Find the difference of two numbers: output = in - output
- * (note the order of the arguments!)
- *
- * Assumes that out[i] < 2**52
- * On return, out[i] < 2**55
- */
-static inline void
-fdifference_backwards(felem out, const felem in) {
-  /* 152 is 19 << 3 */
-  static const limb two54m152 = (static_cast<limb>(1) << 54) - 152;
-  static const limb two54m8 = (static_cast<limb>(1) << 54) - 8;
-
-  out[0] = in[0] + two54m152 - out[0];
-  out[1] = in[1] + two54m8 - out[1];
-  out[2] = in[2] + two54m8 - out[2];
-  out[3] = in[3] + two54m8 - out[3];
-  out[4] = in[4] + two54m8 - out[4];
-}
-
-/* Multiply a number by a scalar: output = in * scalar */
-static inline void
-fscalar_product(felem output, const felem in, const limb scalar) {
-  uint128_t a = uint128_t(in[0]) * scalar;
-  output[0] = a & 0x7ffffffffffff;
+struct fmonty_pair_t
+   {
+   uint64_t* x;
+   uint64_t* z;
+   };

-  a = uint128_t(in[1]) * scalar + carry_shift(a, 51);
-  output[1] = a & 0x7ffffffffffff;
+struct fmonty_in_t
+   {
+   fmonty_pair_t q;
+   fmonty_pair_t q_dash;
+   const uint64_t* q_minus_q_dash;
+   };

-  a = uint128_t(in[2]) * scalar + carry_shift(a, 51);
-  output[2] = a & 0x7ffffffffffff;
+struct fmonty_out_t
+   {
+   fmonty_pair_t two_q;
+   fmonty_pair_t q_plus_q_dash;
+   };

-  a = uint128_t(in[3]) * scalar + carry_shift(a, 51);
-  output[3] = a & 0x7ffffffffffff;
+/* Sum two numbers: output += in */
+inline void fsum(uint64_t out[5], const uint64_t in[5])
+   {
+   out[0] += in[0];
+   out[1] += in[1];
+   out[2] += in[2];
+   out[3] += in[3];
+   out[4] += in[4];
+   }
+
+/* Find the difference of two numbers: out = in - out
+* (note the order of the arguments!)
+*
+* Assumes that out[i] < 2**52
+* On return, out[i] < 2**55
+*/
+inline void fdifference_backwards(uint64_t out[5], const uint64_t in[5])
+   {
+   /* 152 is 19 << 3 */
+   const uint64_t two54m152 = (static_cast<uint64_t>(1) << 54) - 152;
+   const uint64_t two54m8 = (static_cast<uint64_t>(1) << 54) - 8;
+
+   out[0] = in[0] + two54m152 - out[0];
+   out[1] = in[1] + two54m8 - out[1];
+   out[2] = in[2] + two54m8 - out[2];
+   out[3] = in[3] + two54m8 - out[3];
+   out[4] = in[4] + two54m8 - out[4];
+   }
+
+/* Multiply a number by a scalar: out = in * scalar */
+inline void fscalar_product(uint64_t out[5], const uint64_t in[5], const uint64_t scalar)
+   {
+   uint128_t a = uint128_t(in[0]) * scalar;
+   out[0] = a & 0x7ffffffffffff;

-  a = uint128_t(in[4]) * scalar + carry_shift(a, 51);
-  output[4] = a & 0x7ffffffffffff;
+   a = uint128_t(in[1]) * scalar + carry_shift(a, 51);
+   out[1] = a & 0x7ffffffffffff;

-  output[0] += carry_shift(a, 51) * 19;
-}
+   a = uint128_t(in[2]) * scalar + carry_shift(a, 51);
+   out[2] = a & 0x7ffffffffffff;

-/* Multiply two numbers: output = in2 * in
- *
- * output must be distinct to both inputs. The inputs are reduced coefficient
- * form, the output is not.
- *
- * Assumes that in[i] < 2**55 and likewise for in2.
- * On return, output[i] < 2**52
- */
-static inline void
-fmul(felem output, const felem in2, const felem in) {
-  uint128_t t[5];
-  limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c;
-
-  r0 = in[0];
-  r1 = in[1];
-  r2 = in[2];
-  r3 = in[3];
-  r4 = in[4];
-
-  s0 = in2[0];
-  s1 = in2[1];
-  s2 = in2[2];
-  s3 = in2[3];
-  s4 = in2[4];
-
-  t[0] = uint128_t(r0) * s0;
-  t[1] = uint128_t(r0) * s1 + uint128_t(r1) * s0;
-  t[2] = uint128_t(r0) * s2 + uint128_t(r2) * s0 + uint128_t(r1) * s1;
-  t[3] = uint128_t(r0) * s3 + uint128_t(r3) * s0 + uint128_t(r1) * s2 + uint128_t(r2) * s1;
-  t[4] = uint128_t(r0) * s4 + uint128_t(r4) * s0 + uint128_t(r3) * s1 + uint128_t(r1) * s3 + uint128_t(r2) * s2;
-
-  r4 *= 19;
-  r1 *= 19;
-  r2 *= 19;
-  r3 *= 19;
-
-  t[0] += uint128_t(r4) * s1 + uint128_t(r1) * s4 + uint128_t(r2) * s3 + uint128_t(r3) * s2;
-  t[1] += uint128_t(r4) * s2 + uint128_t(r2) * s4 + uint128_t(r3) * s3;
-  t[2] += uint128_t(r4) * s3 + uint128_t(r3) * s4;
-  t[3] += uint128_t(r4) * s4;
-
-  r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51);
-  t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51);
-  t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51);
-  t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51);
-  t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51);
-  r0 += c * 19; c = carry_shift(r0, 51); r0 = r0 & 0x7ffffffffffff;
-  r1 += c; c = carry_shift(r1, 51); r1 = r1 & 0x7ffffffffffff;
-  r2 += c;
-
-  output[0] = r0;
-  output[1] = r1;
-  output[2] = r2;
-  output[3] = r3;
-  output[4] = r4;
-}
+   a = uint128_t(in[3]) * scalar + carry_shift(a, 51);
+   out[3] = a & 0x7ffffffffffff;

-static inline void fsquare_times(felem output, const felem in, limb count) {
-  uint128_t t[5];
-  limb r0,r1,r2,r3,r4,c;
-  limb d0,d1,d2,d4,d419;
-
-  r0 = in[0];
-  r1 = in[1];
-  r2 = in[2];
-  r3 = in[3];
-  r4 = in[4];
-
-  do {
-    d0 = r0 * 2;
-    d1 = r1 * 2;
-    d2 = r2 * 2 * 19;
-    d419 = r4 * 19;
-    d4 = d419 * 2;
-
-    t[0] = uint128_t(r0) * r0 + uint128_t(d4) * r1 + uint128_t(d2) * (r3     );
-    t[1] = uint128_t(d0) * r1 + uint128_t(d4) * r2 + uint128_t(r3) * (r3 * 19);
-    t[2] = uint128_t(d0) * r2 + uint128_t(r1) * r1 + uint128_t(d4) * (r3     );
-    t[3] = uint128_t(d0) * r3 + uint128_t(d1) * r2 + uint128_t(r4) * (d419   );
-    t[4] = uint128_t(d0) * r4 + uint128_t(d1) * r3 + uint128_t(r2) * (r2     );
-
-    r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51);
-    t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51);
-    t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51);
-    t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51);
-    t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51);
-    r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff;
-    r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff;
-    r2 += c;
-  } while(--count);
-
-  output[0] = r0;
-  output[1] = r1;
-  output[2] = r2;
-  output[3] = r3;
-  output[4] = r4;
-}
+   a = uint128_t(in[4]) * scalar + carry_shift(a, 51);
+   out[4] = a & 0x7ffffffffffff;

-/* Load a little-endian 64-bit number */
-static limb
-load_limb(const u8 *in) {
-  return load_le<uint64_t>(in, 0);
-}
+   out[0] += carry_shift(a, 51) * 19;
+   }

-static void
-store_limb(u8 *out, limb in) {
-  store_le(in, out);
-}
+/* Multiply two numbers: out = in2 * in
+*
+* out must be distinct from both inputs. The inputs are reduced coefficient
+* form, the output is not.
+*
+* Assumes that in[i] < 2**55 and likewise for in2.
+* On return, out[i] < 2**52
+*/
+inline void fmul(uint64_t out[5], const uint64_t in2[5], const uint64_t in[5])
+   {
+   uint128_t t[5];
+   uint64_t r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c;
+
+   r0 = in[0];
+   r1 = in[1];
+   r2 = in[2];
+   r3 = in[3];
+   r4 = in[4];
+
+   s0 = in2[0];
+   s1 = in2[1];
+   s2 = in2[2];
+   s3 = in2[3];
+   s4 = in2[4];
+
+   t[0] = uint128_t(r0) * s0;
+   t[1] = uint128_t(r0) * s1 + uint128_t(r1) * s0;
+   t[2] = uint128_t(r0) * s2 + uint128_t(r2) * s0 + uint128_t(r1) * s1;
+   t[3] = uint128_t(r0) * s3 + uint128_t(r3) * s0 + uint128_t(r1) * s2 + uint128_t(r2) * s1;
+   t[4] = uint128_t(r0) * s4 + uint128_t(r4) * s0 + uint128_t(r3) * s1 + uint128_t(r1) * s3 + uint128_t(r2) * s2;
+
+   r4 *= 19;
+   r1 *= 19;
+   r2 *= 19;
+   r3 *= 19;
+
+   t[0] += uint128_t(r4) * s1 + uint128_t(r1) * s4 + uint128_t(r2) * s3 + uint128_t(r3) * s2;
+   t[1] += uint128_t(r4) * s2 + uint128_t(r2) * s4 + uint128_t(r3) * s3;
+   t[2] += uint128_t(r4) * s3 + uint128_t(r3) * s4;
+   t[3] += uint128_t(r4) * s4;
+
+   r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51);
+   t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51);
+   t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51);
+   t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51);
+   t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51);
+   r0 += c * 19; c = carry_shift(r0, 51); r0 = r0 & 0x7ffffffffffff;
+   r1 += c; c = carry_shift(r1, 51); r1 = r1 & 0x7ffffffffffff;
+   r2 += c;
+
+   out[0] = r0;
+   out[1] = r1;
+   out[2] = r2;
+   out[3] = r3;
+   out[4] = r4;
+   }
+
+inline void fsquare_times(uint64_t out[5], const uint64_t in[5], size_t count)
+   {
+   uint64_t r0 = in[0];
+   uint64_t r1 = in[1];
+   uint64_t r2 = in[2];
+   uint64_t r3 = in[3];
+   uint64_t r4 = in[4];
+
+   for(size_t i = 0; i != count; ++i)
+      {
+      const uint64_t d0 = r0 * 2;
+      const uint64_t d1 = r1 * 2;
+      const uint64_t d2 = r2 * 2 * 19;
+      const uint64_t d419 = r4 * 19;
+      const uint64_t d4 = d419 * 2;
+
+      uint128_t t0 = uint128_t(r0) * r0 + uint128_t(d4) * r1 + uint128_t(d2) * (r3     );
+      uint128_t t1 = uint128_t(d0) * r1 + uint128_t(d4) * r2 + uint128_t(r3) * (r3 * 19);
+      uint128_t t2 = uint128_t(d0) * r2 + uint128_t(r1) * r1 + uint128_t(d4) * (r3     );
+      uint128_t t3 = uint128_t(d0) * r3 + uint128_t(d1) * r2 + uint128_t(r4) * (d419   );
+      uint128_t t4 = uint128_t(d0) * r4 + uint128_t(d1) * r3 + uint128_t(r2) * (r2     );
+
+      r0 = t0 & 0x7ffffffffffff; t1 += carry_shift(t0, 51);
+      r1 = t1 & 0x7ffffffffffff; t2 += carry_shift(t1, 51);
+      r2 = t2 & 0x7ffffffffffff; t3 += carry_shift(t2, 51);
+      r3 = t3 & 0x7ffffffffffff; t4 += carry_shift(t3, 51);
+      r4 = t4 & 0x7ffffffffffff;
+
+      uint64_t c = carry_shift(t4, 51);
+
+      r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff;
+      r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff;
+      r2 += c;
+      }
+
+   out[0] = r0;
+   out[1] = r1;
+   out[2] = r2;
+   out[3] = r3;
+   out[4] = r4;
+   }

 /* Take a little-endian, 32-byte number and expand it into polynomial form */
-static void
-fexpand(limb *output, const u8 *in) {
-  output[0] = load_limb(in) & 0x7ffffffffffff;
-  output[1] = (load_limb(in+6) >> 3) & 0x7ffffffffffff;
-  output[2] = (load_limb(in+12) >> 6) & 0x7ffffffffffff;
-  output[3] = (load_limb(in+19) >> 1) & 0x7ffffffffffff;
-  output[4] = (load_limb(in+24) >> 12) & 0x7ffffffffffff;
-}
+inline void fexpand(uint64_t *out, const uint8_t *in)
+   {
+   out[0] = load_le<uint64_t>(in, 0) & 0x7ffffffffffff;
+   out[1] = (load_le<uint64_t>(in+6, 0) >> 3) & 0x7ffffffffffff;
+   out[2] = (load_le<uint64_t>(in+12, 0) >> 6) & 0x7ffffffffffff;
+   out[3] = (load_le<uint64_t>(in+19, 0) >> 1) & 0x7ffffffffffff;
+   out[4] = (load_le<uint64_t>(in+24, 0) >> 12) & 0x7ffffffffffff;
+   }

 /* Take a fully reduced polynomial form number and contract it into a
- * little-endian, 32-byte array
- */
-static void
-fcontract(u8 *output, const felem input) {
-  uint128_t t[5];
-
-  t[0] = input[0];
-  t[1] = input[1];
-  t[2] = input[2];
-  t[3] = input[3];
-  t[4] = input[4];
-
-  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
-  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
-  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
-  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
-  t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
-
-  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
-  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
-  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
-  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
-  t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
-
-  /* now t is between 0 and 2^255-1, properly carried. */
-  /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
-
-  t[0] += 19;
-
-  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
-  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
-  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
-  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
-  t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
-
-  /* now between 19 and 2^255-1 in both cases, and offset by 19. */
-
-  t[0] += 0x8000000000000 - 19;
-  t[1] += 0x8000000000000 - 1;
-  t[2] += 0x8000000000000 - 1;
-  t[3] += 0x8000000000000 - 1;
-  t[4] += 0x8000000000000 - 1;
-
-  /* now between 2^255 and 2^256-20, and offset by 2^255. */
-
-  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
-  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
-  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
-  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
-  t[4] &= 0x7ffffffffffff;
-
-  store_limb(output, combine_lower(t[0], 0, t[1], 51));
-  store_limb(output+8, combine_lower(t[1], 13, t[2], 38));
-  store_limb(output+16, combine_lower(t[2], 26, t[3], 25));
-  store_limb(output+24, combine_lower(t[3], 39, t[4], 12));
-}
+* little-endian, 32-byte array
+*/
+inline void fcontract(uint8_t *out, const uint64_t input[5])
+   {
+   uint128_t t0 = input[0];
+   uint128_t t1 = input[1];
+   uint128_t t2 = input[2];
+   uint128_t t3 = input[3];
+   uint128_t t4 = input[4];
+
+   t1 += t0 >> 51; t0 &= 0x7ffffffffffff;
+   t2 += t1 >> 51; t1 &= 0x7ffffffffffff;
+   t3 += t2 >> 51; t2 &= 0x7ffffffffffff;
+   t4 += t3 >> 51; t3 &= 0x7ffffffffffff;
+   t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+
+   t1 += t0 >> 51; t0 &= 0x7ffffffffffff;
+   t2 += t1 >> 51; t1 &= 0x7ffffffffffff;
+   t3 += t2 >> 51; t2 &= 0x7ffffffffffff;
+   t4 += t3 >> 51; t3 &= 0x7ffffffffffff;
+   t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+
+   /* now t is between 0 and 2^255-1, properly carried. */
+   /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
+
+   t0 += 19;
+
+   t1 += t0 >> 51; t0 &= 0x7ffffffffffff;
+   t2 += t1 >> 51; t1 &= 0x7ffffffffffff;
+   t3 += t2 >> 51; t2 &= 0x7ffffffffffff;
+   t4 += t3 >> 51; t3 &= 0x7ffffffffffff;
+   t0 += (t4 >> 51) * 19; t4 &= 0x7ffffffffffff;
+
+   /* now between 19 and 2^255-1 in both cases, and offset by 19. */
+
+   t0 += 0x8000000000000 - 19;
+   t1 += 0x8000000000000 - 1;
+   t2 += 0x8000000000000 - 1;
+   t3 += 0x8000000000000 - 1;
+   t4 += 0x8000000000000 - 1;
+
+   /* now between 2^255 and 2^256-20, and offset by 2^255. */
+
+   t1 += t0 >> 51; t0 &= 0x7ffffffffffff;
+   t2 += t1 >> 51; t1 &= 0x7ffffffffffff;
+   t3 += t2 >> 51; t2 &= 0x7ffffffffffff;
+   t4 += t3 >> 51; t3 &= 0x7ffffffffffff;
+   t4 &= 0x7ffffffffffff;
+
+   store_le(out,
+            combine_lower(t0, 0, t1, 51),
+            combine_lower(t1, 13, t2, 38),
+            combine_lower(t2, 26, t3, 25),
+            combine_lower(t3, 39, t4, 12));
+   }

 /* Input: Q, Q', Q-Q'
- * Output: 2Q, Q+Q'
- *
- * result.two_q (2*Q): long form
- * result.q_plus_q_dash (Q + Q): long form
- * in.q: short form, destroyed
- * in.q_dash: short form, destroyed
- * in.q_minus_q_dash: short form, preserved
- */
-static void
+* Output: 2Q, Q+Q'
+*
+* result.two_q (2*Q): long form
+* result.q_plus_q_dash (Q + Q'): long form
+* in.q: short form, destroyed
+* in.q_dash: short form, destroyed
+* in.q_minus_q_dash: short form, preserved
+*/
+void
 fmonty(fmonty_out_t& result, fmonty_in_t& in)
-{
-  limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5],
-    zzprime[5], zzzprime[5];
-
-  copy_mem(origx, in.q.x, 5);
-  fsum(in.q.x, in.q.z);
-  fdifference_backwards(in.q.z, origx);  // does x - z
-
-  copy_mem(origxprime, in.q_dash.x, 5);
-  fsum(in.q_dash.x, in.q_dash.z);
-  fdifference_backwards(in.q_dash.z, origxprime);
-  fmul(xxprime, in.q_dash.x, in.q.z);
-  fmul(zzprime, in.q.x, in.q_dash.z);
-  copy_mem(origxprime, xxprime, 5);
-  fsum(xxprime, zzprime);
-  fdifference_backwards(zzprime, origxprime);
-  fsquare_times(result.q_plus_q_dash.x, xxprime, 1);
-  fsquare_times(zzzprime, zzprime, 1);
-  fmul(result.q_plus_q_dash.z, zzzprime, in.q_minus_q_dash);
-
-  fsquare_times(xx, in.q.x, 1);
-  fsquare_times(zz, in.q.z, 1);
-  fmul(result.two_q.x, xx, zz);
-  fdifference_backwards(zz, xx);  // does zz = xx - zz
-  fscalar_product(zzz, zz, 121665);
-  fsum(zzz, xx);
-  fmul(result.two_q.z, zz, zzz);
-}
+   {
+   uint64_t origx[5];
+   uint64_t origxprime[5];
+   uint64_t zzz[5];
+   uint64_t xx[5];
+   uint64_t zz[5];
+   uint64_t xxprime[5];
+   uint64_t zzprime[5];
+   uint64_t zzzprime[5];
+
+   copy_mem(origx, in.q.x, 5);
+   fsum(in.q.x, in.q.z);
+   fdifference_backwards(in.q.z, origx);  // does x - z
+
+   copy_mem(origxprime, in.q_dash.x, 5);
+   fsum(in.q_dash.x, in.q_dash.z);
+   fdifference_backwards(in.q_dash.z, origxprime);
+   fmul(xxprime, in.q_dash.x, in.q.z);
+   fmul(zzprime, in.q.x, in.q_dash.z);
+   copy_mem(origxprime, xxprime, 5);
+   fsum(xxprime, zzprime);
+   fdifference_backwards(zzprime, origxprime);
+   fsquare_times(result.q_plus_q_dash.x, xxprime, 1);
+   fsquare_times(zzzprime, zzprime, 1);
+   fmul(result.q_plus_q_dash.z, zzzprime, in.q_minus_q_dash);
+
+   fsquare_times(xx, in.q.x, 1);
+   fsquare_times(zz, in.q.z, 1);
+   fmul(result.two_q.x, xx, zz);
+   fdifference_backwards(zz, xx);  // does zz = xx - zz
+   fscalar_product(zzz, zz, 121665);
+   fsum(zzz, xx);
+   fmul(result.two_q.z, zz, zzz);
+   }

 // -----------------------------------------------------------------------------
-// Maybe swap the contents of two limb arrays (@a and @b), each @len elements
+// Maybe swap the contents of two uint64_t arrays (@a and @b), each @len elements
 // long. Perform the swap iff @swap is non-zero.
 //
 // This function performs the swap without leaking any side-channel
 // information.
 // -----------------------------------------------------------------------------
-static void
-swap_conditional(limb a[5], limb b[5], limb iswap) {
-  unsigned i;
-  const limb swap = static_cast<limb>(-iswap);
-
-  for (i = 0; i < 5; ++i) {
-    const limb x = swap & (a[i] ^ b[i]);
-    a[i] ^= x;
-    b[i] ^= x;
-  }
-}
+void swap_conditional(uint64_t a[5], uint64_t b[5], uint64_t iswap)
+   {
+   const uint64_t swap = static_cast<uint64_t>(-iswap);
+
+   for(size_t i = 0; i < 5; ++i)
+      {
+      const uint64_t x = swap & (a[i] ^ b[i]);
+      a[i] ^= x;
+      b[i] ^= x;
+      }
+   }

 /* Calculates nQ where Q is the x-coordinate of a point on the curve
- *
- * resultx/resultz: the x coordinate of the resulting curve point (short form)
- * n: a little endian, 32-byte number
- * q: a point of the curve (short form)
- */
-static void
-cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
-  limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0};
-  limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
-  limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1};
-  limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
-
-  unsigned i, j;
-
-  copy_mem(nqpqx, q, 5);
-
-  for (i = 0; i < 32; ++i) {
-    u8 byteval = n[31 - i];
-    for (j = 0; j < 8; ++j) {
-      const limb bit = byteval >> 7;
-
-      swap_conditional(nqx, nqpqx, bit);
-      swap_conditional(nqz, nqpqz, bit);
-
-      fmonty_out_t result { {nqx2, nqz2}, {nqpqx2, nqpqz2} };
-      fmonty_in_t in { { nqx, nqz }, { nqpqx, nqpqz }, q };
-      fmonty(result, in);
-      swap_conditional(nqx2, nqpqx2, bit);
-      swap_conditional(nqz2, nqpqz2, bit);
-
-      t = nqx;
-      nqx = nqx2;
-      nqx2 = t;
-      t = nqz;
-      nqz = nqz2;
-      nqz2 = t;
-      t = nqpqx;
-      nqpqx = nqpqx2;
-      nqpqx2 = t;
-      t = nqpqz;
-      nqpqz = nqpqz2;
-      nqpqz2 = t;
-
-      byteval <<= 1;
-    }
-  }
-
-  copy_mem(resultx, nqx, 5);
-  copy_mem(resultz, nqz, 5);
-}
+*
+* resultx/resultz: the x coordinate of the resulting curve point (short form)
+* n: a little endian, 32-byte number
+* q: a point of the curve (short form)
+*/
+void cmult(uint64_t *resultx, uint64_t *resultz, const uint8_t *n, const uint64_t *q)
+   {
+   uint64_t a[5] = {0};
+   uint64_t b[5] = {1};
+   uint64_t c[5] = {1};
+   uint64_t d[5] = {0};
+   uint64_t e[5] = {0};
+   uint64_t f[5] = {1};
+   uint64_t g[5] = {0};
+   uint64_t h[5] = {1};
+
+   uint64_t *nqpqx = a;
+   uint64_t *nqpqz = b;
+   uint64_t *nqx = c;
+   uint64_t *nqz = d;
+   uint64_t *nqpqx2 = e;
+   uint64_t *nqpqz2 = f;
+   uint64_t *nqx2 = g;
+   uint64_t *nqz2 = h;
+
+   copy_mem(nqpqx, q, 5);
+
+   for(size_t i = 0; i < 32; ++i)
+      {
+      for(size_t j = 0; j < 8; ++j)
+         {
+         const uint64_t bit = (n[31 - i] >> (7-j)) & 1;
+
+         swap_conditional(nqx, nqpqx, bit);
+         swap_conditional(nqz, nqpqz, bit);
+
+         fmonty_out_t result { {nqx2, nqz2}, {nqpqx2, nqpqz2} };
+         fmonty_in_t in { { nqx, nqz }, { nqpqx, nqpqz }, q };
+         fmonty(result, in);
+         swap_conditional(nqx2, nqpqx2, bit);
+         swap_conditional(nqz2, nqpqz2, bit);
+
+         std::swap(nqx, nqx2);
+         std::swap(nqz, nqz2);
+         std::swap(nqpqx, nqpqx2);
+         std::swap(nqpqz, nqpqz2);
+         }
+      }
+
+   copy_mem(resultx, nqx, 5);
+   copy_mem(resultz, nqz, 5);
+   }

 // -----------------------------------------------------------------------------
 // Shamelessly copied from djb's code, tightened a little
 // -----------------------------------------------------------------------------
-static void
-crecip(felem out, const felem z) {
-  felem a,t0,b,c;
-
-  /* 2 */ fsquare_times(a, z, 1); // a = 2
-  /* 8 */ fsquare_times(t0, a, 2);
-  /* 9 */ fmul(b, t0, z); // b = 9
-  /* 11 */ fmul(a, b, a); // a = 11
-  /* 22 */ fsquare_times(t0, a, 1);
-  /* 2^5 - 2^0 = 31 */ fmul(b, t0, b);
-  /* 2^10 - 2^5 */ fsquare_times(t0, b, 5);
-  /* 2^10 - 2^0 */ fmul(b, t0, b);
-  /* 2^20 - 2^10 */ fsquare_times(t0, b, 10);
-  /* 2^20 - 2^0 */ fmul(c, t0, b);
-  /* 2^40 - 2^20 */ fsquare_times(t0, c, 20);
-  /* 2^40 - 2^0 */ fmul(t0, t0, c);
-  /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10);
-  /* 2^50 - 2^0 */ fmul(b, t0, b);
-  /* 2^100 - 2^50 */ fsquare_times(t0, b, 50);
-  /* 2^100 - 2^0 */ fmul(c, t0, b);
-  /* 2^200 - 2^100 */ fsquare_times(t0, c, 100);
-  /* 2^200 - 2^0 */ fmul(t0, t0, c);
-  /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50);
-  /* 2^250 - 2^0 */ fmul(t0, t0, b);
-  /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5);
-  /* 2^255 - 21 */ fmul(out, t0, a);
-}
+void crecip(uint64_t out[5], const uint64_t z[5])
+   {
+   uint64_t a[5];
+   uint64_t b[5];
+   uint64_t c[5];
+   uint64_t t0[5];
+
+   /* 2 */ fsquare_times(a, z, 1); // a = 2
+   /* 8 */ fsquare_times(t0, a, 2);
+   /* 9 */ fmul(b, t0, z); // b = 9
+   /* 11 */ fmul(a, b, a); // a = 11
+   /* 22 */ fsquare_times(t0, a, 1);
+   /* 2^5 - 2^0 = 31 */ fmul(b, t0, b);
+   /* 2^10 - 2^5 */ fsquare_times(t0, b, 5);
+   /* 2^10 - 2^0 */ fmul(b, t0, b);
+   /* 2^20 - 2^10 */ fsquare_times(t0, b, 10);
+   /* 2^20 - 2^0 */ fmul(c, t0, b);
+   /* 2^40 - 2^20 */ fsquare_times(t0, c, 20);
+   /* 2^40 - 2^0 */ fmul(t0, t0, c);
+   /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10);
+   /* 2^50 - 2^0 */ fmul(b, t0, b);
+   /* 2^100 - 2^50 */ fsquare_times(t0, b, 50);
+   /* 2^100 - 2^0 */ fmul(c, t0, b);
+   /* 2^200 - 2^100 */ fsquare_times(t0, c, 100);
+   /* 2^200 - 2^0 */ fmul(t0, t0, c);
+   /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50);
+   /* 2^250 - 2^0 */ fmul(t0, t0, b);
+   /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5);
+   /* 2^255 - 21 */ fmul(out, t0, a);
+   }
+
+}

 void
-curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
-
-  CT::poison(secret, 32);
-  CT::poison(basepoint, 32);
-
-  limb bp[5], x[5], z[5], zmone[5];
-  uint8_t e[32];
-  int i;
-
-  for (i = 0;i < 32;++i) e[i] = secret[i];
-  e[0] &= 248;
-  e[31] &= 127;
-  e[31] |= 64;
-
-  fexpand(bp, basepoint);
-  cmult(x, z, e, bp);
-  crecip(zmone, z);
-  fmul(z, x, zmone);
-  fcontract(mypublic, z);
-
-  CT::unpoison(secret, 32);
-  CT::unpoison(basepoint, 32);
-  CT::unpoison(mypublic, 32);
-}
+curve25519_donna(uint8_t *mypublic, const uint8_t *secret, const uint8_t *basepoint)
+   {
+   CT::poison(secret, 32);
+   CT::poison(basepoint, 32);
+
+   uint64_t bp[5], x[5], z[5], zmone[5];
+   uint8_t e[32];
+
+   copy_mem(e, secret, 32);
+   e[ 0] &= 248;
+   e[31] &= 127;
+   e[31] |= 64;
+
+   fexpand(bp, basepoint);
+   cmult(x, z, e, bp);
+   crecip(zmone, z);
+   fmul(z, x, zmone);
+   fcontract(mypublic, z);
+
+   CT::unpoison(secret, 32);
+   CT::unpoison(basepoint, 32);
+   CT::unpoison(mypublic, 32);
+   }

 }
diff --git a/src/lib/utils/donna128.h b/src/lib/utils/donna128.h
index 53e0a085b..feaf11d05 100644
--- a/src/lib/utils/donna128.h
+++ b/src/lib/utils/donna128.h
@@ -114,7 +114,7 @@ inline uint64_t carry_shift(const donna128& a, size_t shift)
    }

 inline uint64_t combine_lower(const donna128& a, size_t s1,
-                               const donna128& b, size_t s2)
+                              const donna128& b, size_t s2)
    {
    donna128 z = (a >> s1) | (b << s2);
    return z.lo();
@@ -127,7 +127,7 @@ inline uint64_t carry_shift(const uint128_t a, size_t shift)
    }

 inline uint64_t combine_lower(const uint128_t a, size_t s1,
-                               const uint128_t b, size_t s2)
+                              const uint128_t b, size_t s2)
    {
    return static_cast<uint64_t>((a >> s1) | (b << s2));
    }
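
A few notes on the arithmetic above, with small standalone sketches; these are illustrations only, not part of the patch. The code keeps field elements modulo p = 2^255 - 19 as five 64-bit limbs of 51 bits each, so a 128-bit product of two limbs leaves headroom for several additions, and carry_shift(a, 51) extracts everything above the low 51 bits. Because 2^255 = 19 (mod p), a carry out of the top limb folds back into limb 0 multiplied by 19, which is why fmul and fscalar_product end with a "carry * 19" step:

#include <cstdint>
#include <cstdio>

int main()
   {
   const uint64_t MASK51 = 0x7ffffffffffff; // 2^51 - 1

   // Suppose the top limb t4 has overflowed its 51 bits slightly:
   const uint64_t t4 = (UINT64_C(1) << 51) + 4;

   const uint64_t carry = t4 >> 51;   // sits at weight 2^(4*51 + 51) = 2^255
   const uint64_t fold  = carry * 19; // 2^255 = 19 (mod 2^255 - 19)

   printf("t4 keeps %llu, carry %llu folds to +%llu in limb 0\n",
          static_cast<unsigned long long>(t4 & MASK51),
          static_cast<unsigned long long>(carry),
          static_cast<unsigned long long>(fold));
   return 0;
   }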
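The donna128.h helpers touched at the end exist so the same code compiles with or without a native 128-bit type: carry_shift(a, s) yields the low 64 bits of a >> s, and combine_lower(a, s1, b, s2) yields the low 64 bits of (a >> s1) | (b << s2). fcontract uses the latter to repack five 51-bit limbs into four little-endian 64-bit words. A sketch of the semantics against native 128-bit arithmetic (the _ref name is hypothetical, and GCC/Clang __int128 support is assumed):

#include <cstdint>
#include <cassert>

// Reference behaviour of combine_lower for a native 128-bit type.
static uint64_t combine_lower_ref(unsigned __int128 a, size_t s1,
                                  unsigned __int128 b, size_t s2)
   {
   return static_cast<uint64_t>((a >> s1) | (b << s2));
   }

int main()
   {
   // First output word of fcontract: all 51 bits of t0, plus the low
   // 13 bits of t1 shifted into the top of the 64-bit word.
   const uint64_t t0 = 0x7ffffffffffff; // 51 one-bits
   const uint64_t t1 = 0x1fff;          // 13 one-bits

   assert(combine_lower_ref(t0, 0, t1, 51) == UINT64_MAX);
   return 0;
   }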
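swap_conditional is the standard branch-free conditional swap: negating a 0/1 flag in unsigned arithmetic yields an all-zeros or all-ones mask, and XOR-masking then swaps the arrays without any secret-dependent branch or memory access. The same trick in isolation (an illustrative cswap, not the patch's function):

#include <cstdint>
#include <cassert>

void cswap(uint64_t a[5], uint64_t b[5], uint64_t iswap)
   {
   const uint64_t mask = static_cast<uint64_t>(0) - iswap; // 0 or all-ones

   for(size_t i = 0; i != 5; ++i)
      {
      const uint64_t x = mask & (a[i] ^ b[i]);
      a[i] ^= x; // a ^ (a ^ b) == b when mask is all-ones
      b[i] ^= x;
      }
   }

int main()
   {
   uint64_t a[5] = { 1, 2, 3, 4, 5 };
   uint64_t b[5] = { 6, 7, 8, 9, 10 };

   cswap(a, b, 1); // swaps
   assert(a[0] == 6 && b[0] == 1);

   cswap(a, b, 0); // leaves both untouched
   assert(a[0] == 6 && b[0] == 1);
   return 0;
   }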
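One functional cleanup in cmult: the old loop shifted a copied byte (byteval <<= 1) while the new one indexes each bit directly, but both walk the scalar most-significant-bit first, from byte 31 down to byte 0, so the Montgomery ladder does identical work for every bit. A sketch of that traversal order (illustration only):

#include <cstdint>
#include <cstdio>

int main()
   {
   uint8_t n[32] = { 0 }; // little-endian scalar
   n[0] = 9;              // low byte, processed last by the ladder

   // Same indexing as the new cmult: i walks bytes from the top,
   // j walks bits from the top of each byte.
   for(size_t i = 0; i != 32; ++i)
      for(size_t j = 0; j != 8; ++j)
         {
         const uint64_t bit = (n[31 - i] >> (7 - j)) & 1;
         if(i == 31) // show only the final (lowest) byte
            printf("%d", static_cast<int>(bit));
         }

   printf("\n"); // prints 00001001, i.e. 9 MSB-first
   return 0;
   }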
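Finally, a usage sketch. curve25519_donna clamps the secret internally (the e[0] &= 248 block), so the caller supplies 32 raw bytes, and deriving a public key means evaluating the base point u = 9. This assumes curve25519_donna is declared in botan/curve25519.h, as its inclusion above suggests; in application code Botan's higher-level key classes are the supported interface:

#include <botan/curve25519.h>
#include <cstdint>
#include <cstdio>

int main()
   {
   uint8_t secret[32] = { 1 };    // use 32 bytes from a CSPRNG in practice
   uint8_t basepoint[32] = { 9 }; // the standard X25519 generator
   uint8_t pubkey[32];

   // pubkey = X25519 scalar multiplication of basepoint by secret
   Botan::curve25519_donna(pubkey, secret, basepoint);

   for(size_t i = 0; i != 32; ++i)
      printf("%02x", pubkey[i]);
   printf("\n");
   return 0;
   }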