author | Jack Lloyd <[email protected]> | 2017-10-13 12:08:30 -0400
---|---|---
committer | Jack Lloyd <[email protected]> | 2017-10-13 12:16:39 -0400
commit | 577828a93755549f0e9d8413488e3e4485c67263 (patch) |
tree | dbb1d6284914e0aa89212bfd33016e1a1a2c45c5 /src/lib/modes/aead/gcm/pmull |
parent | 742420b4b631d6d9139fe5f63ca5650f4fb56b9d (diff) |
Optimize GCM
Allow clmul to process multiple blocks per call; this gives a slight speedup there,
though it is still far behind optimum.
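For context, the multi-block interface amounts to iterating the standard GHASH update recurrence, X = (X xor M_i) * H, once per 16-byte block, so the state can stay in registers across blocks instead of being stored and reloaded on every call. A minimal sketch of that recurrence; `gf128_mul` is an assumed stand-in for any correct GF(2^128) multiply (for instance the reference model shown after the diff below), not a Botan function:

```cpp
#include <cstdint>
#include <cstddef>

// Assumed helper, not a Botan API: multiply x in place by h in GF(2^128).
void gf128_mul(uint8_t x[16], const uint8_t h[16]);

// The recurrence the multi-block clmul path implements: fold each
// 16-byte block into the running state, then multiply by H.
void ghash_blocks(uint8_t X[16], const uint8_t H[16],
                  const uint8_t input[], size_t blocks)
   {
   for(size_t i = 0; i != blocks; ++i)
      {
      for(size_t j = 0; j != 16; ++j)
         X[j] ^= input[16*i + j]; // X ^= M_i
      gf128_mul(X, H);           // X = (X ^ M_i) * H
      }
   }
```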
Precompute a table of multiples of H; this is 3-4x faster on systems without clmul
(and still uses no secret-dependent table indexes).
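A table-driven multiply can stay constant-time by reading every table entry on every multiply and folding each one in under a mask derived from the corresponding secret bit, so table indexes are always public. A sketch of one plausible shape for this, assuming a 128-entry table T[i] = H * x^i in the GHASH bit ordering; the commit's actual table layout may differ:

```cpp
#include <cstdint>
#include <cstddef>
#include <cstring>

// Multiply V in place by x in GF(2^128), GHASH bit ordering: shift the
// 128-bit string right by one bit, reducing by 0xE1 (the GCM polynomial).
static void gf128_mul_x(uint8_t V[16])
   {
   const uint8_t carry = V[15] & 1;
   for(size_t j = 15; j != 0; --j)
      V[j] = static_cast<uint8_t>((V[j] >> 1) | (V[j-1] << 7));
   V[0] >>= 1;
   if(carry)
      V[0] ^= 0xE1;
   }

// Key setup, run once per key: T[i] = H * x^i. The loop index is public.
void build_table(uint8_t T[128][16], const uint8_t H[16])
   {
   std::memcpy(T[0], H, 16);
   for(size_t i = 1; i != 128; ++i)
      {
      std::memcpy(T[i], T[i-1], 16);
      gf128_mul_x(T[i]);
      }
   }

// Constant-time X = X * H: every entry is read on every call, and each
// is folded in under a mask from the matching (secret) bit of X, so no
// memory access depends on secret data.
void gcm_mul_tabled(uint8_t X[16], const uint8_t T[128][16])
   {
   uint8_t Z[16] = { 0 };
   for(size_t i = 0; i != 128; ++i)
      {
      const uint8_t bit = (X[i / 8] >> (7 - i % 8)) & 1;
      const uint8_t mask = static_cast<uint8_t>(0 - bit);
      for(size_t j = 0; j != 16; ++j)
         Z[j] ^= mask & T[i][j];
      }
   std::memcpy(X, Z, 16);
   }
```

The per-bit shifting of the hash key is hoisted into key setup, which is presumably where the 3-4x win over a bit-at-a-time multiply comes from, while the access pattern stays data-independent.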
Refactor GMAC to not derive from GHASH
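The GMAC change replaces inheritance with composition. A purely illustrative sketch of the resulting shape (these are not Botan's actual class definitions): GMAC holds a GHASH member and forwards to it, so GHASH's interface is no longer part of GMAC's public surface.

```cpp
#include <cstdint>
#include <cstddef>

// Illustrative stand-ins only, not Botan's real classes.
class GHASH_sketch
   {
   public:
      void update(const uint8_t[], size_t) { /* absorb 16-byte blocks */ }
      void final(uint8_t[16]) { /* produce the authentication tag */ }
   };

class GMAC_sketch
   {
   public:
      // GMAC forwards to its GHASH rather than being a GHASH.
      void update(const uint8_t in[], size_t len) { m_ghash.update(in, len); }
      void final(uint8_t mac[16]) { m_ghash.final(mac); }
   private:
      GHASH_sketch m_ghash; // has-a, not is-a
   };
```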
Diffstat (limited to 'src/lib/modes/aead/gcm/pmull')
-rw-r--r-- | src/lib/modes/aead/gcm/pmull/pmull.cpp | 109
-rw-r--r-- | src/lib/modes/aead/gcm/pmull/pmull.h | 3
2 files changed, 62 insertions, 50 deletions
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
index 54e841650..12d6ff7d1 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ b/src/lib/modes/aead/gcm/pmull/pmull.cpp
@@ -10,62 +10,73 @@
 namespace Botan {

 BOTAN_FUNC_ISA("+crypto")
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks)
    {
    /*
    * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
    */
-   const uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
+   uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
    const uint64x2_t b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(H+8)), vrev64_u8(vld1_u8(H))));

-   uint64x2_t T0, T1, T2, T3, T4, T5;
-
-   T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
-   T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
-   T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
-   T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
-
-   T1 = veorq_u64(T1, T2);
-   T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
-   T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
-   T0 = veorq_u64(T0, T2);
-   T3 = veorq_u64(T3, T1);
-
-   T4 = vshrq_n_u64(T0, 31);
-   T0 = vshlq_n_u64(T0, 1);
-
-   T5 = vshrq_n_u64(T3, 31);
-   T3 = vshlq_n_u64(T3, 1);
-
-   T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
-   T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
-   T0 = vorrq_u64(T0, T4);
-   T3 = vorrq_u64(T3, T5);
-   T3 = vorrq_u64(T3, T2);
-
-   T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
-   T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
-   T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
-
-   T4 = veorq_u64(T4, T5);
-   T4 = veorq_u64(T4, T2);
-   T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
-   T3 = veorq_u64(T3, T5);
-   T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
-   T0 = veorq_u64(T0, T4);
-   T3 = veorq_u64(T3, T0);
-
-   T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
-   T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
-   T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
-   T3 = veorq_u64(T3, T1);
-   T3 = veorq_u64(T3, T2);
-   T3 = veorq_u64(T3, T4);
-
-   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(T3))));
-   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(T3))));
+   for(size_t i = 0; i != blocks; ++i)
+      {
+      const uint64x2_t m64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(input+8)), vrev64_u8(vld1_u8(input))));
+      input += 16;
+
+      a64 = veorq_u64(a64, m64);
+
+      uint64x2_t T0, T1, T2, T3, T4, T5;
+
+      T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
+      T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
+      T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
+      T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
+
+      T1 = veorq_u64(T1, T2);
+      T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
+      T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
+      T0 = veorq_u64(T0, T2);
+      T3 = veorq_u64(T3, T1);
+
+      T4 = vshrq_n_u64(T0, 31);
+      T0 = vshlq_n_u64(T0, 1);
+
+      T5 = vshrq_n_u64(T3, 31);
+      T3 = vshlq_n_u64(T3, 1);
+
+      T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
+      T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
+      T0 = vorrq_u64(T0, T4);
+      T3 = vorrq_u64(T3, T5);
+      T3 = vorrq_u64(T3, T2);
+
+      T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
+      T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
+      T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
+
+      T4 = veorq_u64(T4, T5);
+      T4 = veorq_u64(T4, T2);
+      T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
+      T3 = veorq_u64(T3, T5);
+      T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
+      T0 = veorq_u64(T0, T4);
+      T3 = veorq_u64(T3, T0);
+
+      T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
+      T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
+      T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
+      T3 = veorq_u64(T3, T1);
+      T3 = veorq_u64(T3, T2);
+      T3 = veorq_u64(T3, T4);
+
+      a64 = T3;
+      }
+
+   vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(a64))));
+   vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(a64))));
    }

 }
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
index 4ddcc8f27..638b845cd 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ b/src/lib/modes/aead/gcm/pmull/pmull.h
@@ -12,7 +12,8 @@
 namespace Botan {

-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+                        const uint8_t input[], size_t blocks);

 }
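For readers untangling the intrinsics: the loads arrange both operands into a convenient word order for vmull_p64, the four multiplies build the 256-bit carry-less product from 64x64-bit halves, and the long shift/XOR tail (left by 1 with carries moved across lanes, then the 31/30/25 and 1/2/7 shift groups) folds that product back modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1, in the style of the well-known carry-less multiplication white papers. A plain byte-oriented reference for the same GF(2^128) product, using the textbook right-shift formulation of NIST SP 800-38D (an independent reference model, not code from this commit):

```cpp
#include <cstdint>
#include <cstddef>

// z = x * y in GF(2^128) with GHASH's bit ordering and the reduction
// constant R = 0xE1 || 0^120, per NIST SP 800-38D, Algorithm 1.
// Reference model for testing and understanding only; deliberately
// simple and NOT constant-time (it branches on secret bits).
void gcm_mul_ref(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
   {
   uint8_t Z[16] = { 0 };
   uint8_t V[16];
   for(size_t i = 0; i != 16; ++i)
      V[i] = y[i];

   for(size_t i = 0; i != 128; ++i)
      {
      // If bit i of x (numbered MSB-first within each byte) is set, Z ^= V
      if((x[i / 8] >> (7 - i % 8)) & 1)
         {
         for(size_t j = 0; j != 16; ++j)
            Z[j] ^= V[j];
         }

      // V = V * x: shift right one bit, reduce if a bit falls off the end
      const uint8_t carry = V[15] & 1;
      for(size_t j = 15; j != 0; --j)
         V[j] = static_cast<uint8_t>((V[j] >> 1) | (V[j-1] << 7));
      V[0] >>= 1;
      if(carry)
         V[0] ^= 0xE1;
      }

   for(size_t i = 0; i != 16; ++i)
      z[i] = Z[i];
   }
```

The new multi-block entry point corresponds to calling such a multiply once per block after XORing the block into x, which is exactly what the loop in pmull.cpp above does with the state held in a NEON register.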