author     Jack Lloyd <[email protected]>   2017-10-13 12:08:30 -0400
committer  Jack Lloyd <[email protected]>   2017-10-13 12:16:39 -0400
commit     577828a93755549f0e9d8413488e3e4485c67263 (patch)
tree       dbb1d6284914e0aa89212bfd33016e1a1a2c45c5 /src/lib/modes
parent     742420b4b631d6d9139fe5f63ca5650f4fb56b9d (diff)
Optimize GCM
Allow processing multiple blocks per call in the clmul code; a slight speedup there, though still far behind optimum. Precompute a table of multiples of H; 3-4x faster on systems without clmul (and still no secret indexes). Refactor GMAC to not derive from GHASH.
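
For context on both changes: GHASH absorbs each 16-byte block C_i with an XOR-then-multiply in GF(2^128),

\[ X_i = (X_{i-1} \oplus C_i) \cdot H \pmod{x^{128} + x^7 + x^2 + x + 1} \]

so handing the multiply routine a run of blocks keeps X live across iterations instead of round-tripping it through memory. The new table stores the 128 shifted multiples H \cdot x^k, which turns the field multiplication into mask-and-XOR accumulation:

\[ X \cdot H = \bigoplus_{k=0}^{127} X_{[k]} \, (H \cdot x^{k}) \]

where X_{[k]} is bit k of X. Every term is selected by a data-independent mask rather than by a secret-indexed load, which is how the table preserves the no-secret-indexes property the message refers to.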
Diffstat (limited to 'src/lib/modes')
-rw-r--r--  src/lib/modes/aead/gcm/clmul/clmul.cpp  | 113
-rw-r--r--  src/lib/modes/aead/gcm/clmul/clmul.h    |   3
-rw-r--r--  src/lib/modes/aead/gcm/gcm.cpp          | 111
-rw-r--r--  src/lib/modes/aead/gcm/gcm.h            |  27
-rw-r--r--  src/lib/modes/aead/gcm/pmull/pmull.cpp  | 109
-rw-r--r--  src/lib/modes/aead/gcm/pmull/pmull.h    |   3
6 files changed, 215 insertions, 151 deletions
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.cpp b/src/lib/modes/aead/gcm/clmul/clmul.cpp
index ed3473b4e..33378d833 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.cpp
+++ b/src/lib/modes/aead/gcm/clmul/clmul.cpp
@@ -12,67 +12,76 @@
namespace Botan {
BOTAN_FUNC_ISA("pclmul,ssse3")
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+ const uint8_t input[], size_t blocks)
{
/*
* Algorithms 1 and 5 from Intel's CLMUL guide
*/
const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ const __m128i b = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H)), BSWAP_MASK);
+
__m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
- __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H));
+ a = _mm_shuffle_epi8(a, BSWAP_MASK);
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input) + i);
+ m = _mm_shuffle_epi8(m, BSWAP_MASK);
+
+ a = _mm_xor_si128(a, m);
+
+ __m128i T0, T1, T2, T3, T4, T5;
+
+ T0 = _mm_clmulepi64_si128(a, b, 0x00);
+ T1 = _mm_clmulepi64_si128(a, b, 0x01);
+ T2 = _mm_clmulepi64_si128(a, b, 0x10);
+ T3 = _mm_clmulepi64_si128(a, b, 0x11);
+
+ T1 = _mm_xor_si128(T1, T2);
+ T2 = _mm_slli_si128(T1, 8);
+ T1 = _mm_srli_si128(T1, 8);
+ T0 = _mm_xor_si128(T0, T2);
+ T3 = _mm_xor_si128(T3, T1);
+
+ T4 = _mm_srli_epi32(T0, 31);
+ T0 = _mm_slli_epi32(T0, 1);
+
+ T5 = _mm_srli_epi32(T3, 31);
+ T3 = _mm_slli_epi32(T3, 1);
+
+ T2 = _mm_srli_si128(T4, 12);
+ T5 = _mm_slli_si128(T5, 4);
+ T4 = _mm_slli_si128(T4, 4);
+ T0 = _mm_or_si128(T0, T4);
+ T3 = _mm_or_si128(T3, T5);
+ T3 = _mm_or_si128(T3, T2);
+
+ T4 = _mm_slli_epi32(T0, 31);
+ T5 = _mm_slli_epi32(T0, 30);
+ T2 = _mm_slli_epi32(T0, 25);
+
+ T4 = _mm_xor_si128(T4, T5);
+ T4 = _mm_xor_si128(T4, T2);
+ T5 = _mm_srli_si128(T4, 4);
+ T3 = _mm_xor_si128(T3, T5);
+ T4 = _mm_slli_si128(T4, 12);
+ T0 = _mm_xor_si128(T0, T4);
+ T3 = _mm_xor_si128(T3, T0);
+
+ T4 = _mm_srli_epi32(T0, 1);
+ T1 = _mm_srli_epi32(T0, 2);
+ T2 = _mm_srli_epi32(T0, 7);
+ T3 = _mm_xor_si128(T3, T1);
+ T3 = _mm_xor_si128(T3, T2);
+ T3 = _mm_xor_si128(T3, T4);
+
+ a = T3;
+ }
a = _mm_shuffle_epi8(a, BSWAP_MASK);
- b = _mm_shuffle_epi8(b, BSWAP_MASK);
-
- __m128i T0, T1, T2, T3, T4, T5;
-
- T0 = _mm_clmulepi64_si128(a, b, 0x00);
- T1 = _mm_clmulepi64_si128(a, b, 0x01);
- T2 = _mm_clmulepi64_si128(a, b, 0x10);
- T3 = _mm_clmulepi64_si128(a, b, 0x11);
-
- T1 = _mm_xor_si128(T1, T2);
- T2 = _mm_slli_si128(T1, 8);
- T1 = _mm_srli_si128(T1, 8);
- T0 = _mm_xor_si128(T0, T2);
- T3 = _mm_xor_si128(T3, T1);
-
- T4 = _mm_srli_epi32(T0, 31);
- T0 = _mm_slli_epi32(T0, 1);
-
- T5 = _mm_srli_epi32(T3, 31);
- T3 = _mm_slli_epi32(T3, 1);
-
- T2 = _mm_srli_si128(T4, 12);
- T5 = _mm_slli_si128(T5, 4);
- T4 = _mm_slli_si128(T4, 4);
- T0 = _mm_or_si128(T0, T4);
- T3 = _mm_or_si128(T3, T5);
- T3 = _mm_or_si128(T3, T2);
-
- T4 = _mm_slli_epi32(T0, 31);
- T5 = _mm_slli_epi32(T0, 30);
- T2 = _mm_slli_epi32(T0, 25);
-
- T4 = _mm_xor_si128(T4, T5);
- T4 = _mm_xor_si128(T4, T2);
- T5 = _mm_srli_si128(T4, 4);
- T3 = _mm_xor_si128(T3, T5);
- T4 = _mm_slli_si128(T4, 12);
- T0 = _mm_xor_si128(T0, T4);
- T3 = _mm_xor_si128(T3, T0);
-
- T4 = _mm_srli_epi32(T0, 1);
- T1 = _mm_srli_epi32(T0, 2);
- T2 = _mm_srli_epi32(T0, 7);
- T3 = _mm_xor_si128(T3, T1);
- T3 = _mm_xor_si128(T3, T2);
- T3 = _mm_xor_si128(T3, T4);
-
- T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);
-
- _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
}
}
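
The loop above builds the 128x128-bit carryless product from four 64x64-bit halves (the _mm_clmulepi64_si128 calls), then the shift/XOR tail reduces the 256-bit result modulo the GCM polynomial. As a portable reference for what a single pclmulqdq computes, here is a scalar sketch; u128 and clmul64 are hypothetical names for illustration, not Botan functions:

#include <cstdint>

struct u128 { uint64_t lo, hi; };

// Carryless (XOR-accumulate) product of two 64-bit values, producing
// the 128-bit result of one pclmulqdq instruction
u128 clmul64(uint64_t a, uint64_t b)
   {
   u128 r = { 0, 0 };
   for(int i = 0; i != 64; ++i)
      {
      if((b >> i) & 1)
         {
         r.lo ^= a << i;
         if(i > 0)
            r.hi ^= a >> (64 - i); // bits of a << i that spill past bit 63
         }
      }
   return r;
   }

The branch on bits of b makes this sketch illustrative only; a constant-time software fallback selects with masks instead, as in the gcm.cpp hunk below.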
diff --git a/src/lib/modes/aead/gcm/clmul/clmul.h b/src/lib/modes/aead/gcm/clmul/clmul.h
index b47c73f27..d68e021d2 100644
--- a/src/lib/modes/aead/gcm/clmul/clmul.h
+++ b/src/lib/modes/aead/gcm/clmul/clmul.h
@@ -12,7 +12,8 @@
namespace Botan {
-void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_clmul(uint8_t x[16], const uint8_t H[16],
+ const uint8_t input[], size_t blocks);
}
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index bb832245e..9079aa039 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -24,52 +24,52 @@ namespace Botan {
static const size_t GCM_BS = 16;
-void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
+void GHASH::gcm_multiply(secure_vector<uint8_t>& x,
+ const uint8_t input[],
+ size_t blocks)
{
+ if(blocks == 0)
+ return;
+
#if defined(BOTAN_HAS_GCM_CLMUL)
if(CPUID::has_clmul())
- return gcm_multiply_clmul(x.data(), m_H.data());
+ return gcm_multiply_clmul(x.data(), m_H.data(), input, blocks);
#elif defined(BOTAN_HAS_GCM_PMULL)
if(CPUID::has_arm_pmull())
- return gcm_multiply_pmull(x.data(), m_H.data());
+ return gcm_multiply_pmull(x.data(), m_H.data(), input, blocks);
#endif
- static const uint64_t R = 0xE100000000000000;
-
- uint64_t H[2] = {
- load_be<uint64_t>(m_H.data(), 0),
- load_be<uint64_t>(m_H.data(), 1)
- };
-
- uint64_t Z[2] = { 0, 0 };
-
- CT::poison(H, 2);
- CT::poison(Z, 2);
CT::poison(x.data(), x.size());
// SSE2 might be useful here
- for(size_t i = 0; i != 2; ++i)
- {
- const uint64_t X = load_be<uint64_t>(x.data(), i);
+ const uint64_t ones = 0xFFFFFFFFFFFFFFFF;
- uint64_t mask = 0x8000000000000000;
- for(size_t j = 0; j != 64; ++j)
- {
- const uint64_t XMASK = CT::expand_mask<uint64_t>(X & mask);
- mask >>= 1;
- Z[0] ^= H[0] & XMASK;
- Z[1] ^= H[1] & XMASK;
+ uint64_t X0 = load_be<uint64_t>(x.data(), 0);
+ uint64_t X1 = load_be<uint64_t>(x.data(), 1);
- // GCM's bit ops are reversed so we carry out of the bottom
- const uint64_t carry = R & CT::expand_mask<uint64_t>(H[1] & 1);
+ for(size_t b = 0; b != blocks; ++b)
+ {
+ X0 ^= load_be<uint64_t>(input, 2*b);
+ X1 ^= load_be<uint64_t>(input, 2*b+1);
+
+ uint64_t Z[2] = { 0, 0 };
- H[1] = (H[1] >> 1) | (H[0] << 63);
- H[0] = (H[0] >> 1) ^ carry;
+ for(size_t i = 0; i != 64; ++i)
+ {
+ const uint64_t X0MASK = ones * ((X0 >> (63-i)) & 1);
+ const uint64_t X1MASK = ones * ((X1 >> (63-i)) & 1);
+ Z[0] ^= m_HM[4*i ] & X0MASK;
+ Z[1] ^= m_HM[4*i+1] & X0MASK;
+ Z[0] ^= m_HM[4*i+2] & X1MASK;
+ Z[1] ^= m_HM[4*i+3] & X1MASK;
}
+
+ X0 = Z[0];
+ X1 = Z[1];
}
- store_be<uint64_t>(x.data(), Z[0], Z[1]);
+ store_be<uint64_t>(x.data(), X0, X1);
CT::unpoison(x.data(), x.size());
}
@@ -80,16 +80,20 @@ void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
This assumes if less than block size input then we're just on the
final block and should pad with zeros
*/
- while(length)
- {
- const size_t to_proc = std::min(length, GCM_BS);
- xor_buf(ghash.data(), input, to_proc);
+ const size_t full_blocks = length / GCM_BS;
+ const size_t final_bytes = length - (full_blocks * GCM_BS);
- gcm_multiply(ghash);
+ if(full_blocks > 0)
+ {
+ gcm_multiply(ghash, input, full_blocks);
+ }
- input += to_proc;
- length -= to_proc;
+ if(final_bytes)
+ {
+ secure_vector<uint8_t> last_block(GCM_BS);
+ copy_mem(last_block.data(), input + full_blocks * GCM_BS, final_bytes);
+ gcm_multiply(ghash, last_block.data(), 1);
}
}
@@ -99,6 +103,32 @@ void GHASH::key_schedule(const uint8_t key[], size_t length)
m_H_ad.resize(GCM_BS);
m_ad_len = 0;
m_text_len = 0;
+
+ uint64_t H0 = load_be<uint64_t>(m_H.data(), 0);
+ uint64_t H1 = load_be<uint64_t>(m_H.data(), 1);
+
+ const uint64_t R = 0xE100000000000000;
+
+ m_HM.resize(256);
+
+ // precompute the multiples of H
+ for(size_t i = 0; i != 2; ++i)
+ {
+ for(size_t j = 0; j != 64; ++j)
+ {
+ /*
+ we interleave H^1, H^65, H^2, H^66, ...
+ to make indexing nicer in the multiplication code
+ */
+ m_HM[4*j+2*i] = H0;
+ m_HM[4*j+2*i+1] = H1;
+
+ // GCM's bit ops are reversed so we carry out of the bottom
+ const uint64_t carry = R * (H1 & 1);
+ H1 = (H1 >> 1) | (H0 << 63);
+ H0 = (H0 >> 1) ^ carry;
+ }
+ }
}
void GHASH::start(const uint8_t nonce[], size_t len)
@@ -115,12 +145,17 @@ void GHASH::set_associated_data(const uint8_t input[], size_t length)
m_ad_len = length;
}
-void GHASH::update(const uint8_t input[], size_t length)
+void GHASH::update_associated_data(const uint8_t ad[], size_t length)
{
BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
+ m_ad_len += length;
+ ghash_update(m_ghash, ad, length);
+ }
+void GHASH::update(const uint8_t input[], size_t length)
+ {
+ BOTAN_ASSERT(m_ghash.size() == GCM_BS, "Key was set");
m_text_len += length;
-
ghash_update(m_ghash, input, length);
}
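
Taken together, the two gcm.cpp hunks split the work: key_schedule expands H into the 256-word m_HM table, and gcm_multiply consumes it using only public (loop-counter) table indexes. A minimal standalone sketch of the same construction; the free-function names ghash_table_init and ghash_table_mul are hypothetical, the logic mirrors the diff:

#include <cstdint>
#include <cstddef>

// Expand H (as two big-endian 64-bit halves) into a table of the 128
// shifted multiples H*x^j, interleaved as in GHASH::key_schedule above
void ghash_table_init(uint64_t HM[256], uint64_t H0, uint64_t H1)
   {
   const uint64_t R = 0xE100000000000000;

   for(size_t i = 0; i != 2; ++i)
      {
      for(size_t j = 0; j != 64; ++j)
         {
         HM[4*j + 2*i]     = H0;
         HM[4*j + 2*i + 1] = H1;

         // GCM's bit order is reflected, so multiplying by x shifts
         // right, with the carry folded back in at the top via R
         const uint64_t carry = R * (H1 & 1);
         H1 = (H1 >> 1) | (H0 << 63);
         H0 = (H0 >> 1) ^ carry;
         }
      }
   }

// One block step X = X * H. The table indexes are loop counters
// (public); only the XOR masks depend on the data, so no secret-indexed
// loads occur
void ghash_table_mul(uint64_t& X0, uint64_t& X1, const uint64_t HM[256])
   {
   uint64_t Z0 = 0, Z1 = 0;

   for(size_t i = 0; i != 64; ++i)
      {
      const uint64_t X0_mask = 0 - ((X0 >> (63 - i)) & 1); // all-ones iff bit set
      const uint64_t X1_mask = 0 - ((X1 >> (63 - i)) & 1);

      Z0 ^= HM[4*i]     & X0_mask;
      Z1 ^= HM[4*i + 1] & X0_mask;
      Z0 ^= HM[4*i + 2] & X1_mask;
      Z1 ^= HM[4*i + 3] & X1_mask;
      }

   X0 = Z0;
   X1 = Z1;
   }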
diff --git a/src/lib/modes/aead/gcm/gcm.h b/src/lib/modes/aead/gcm/gcm.h
index deedfdade..eac2add93 100644
--- a/src/lib/modes/aead/gcm/gcm.h
+++ b/src/lib/modes/aead/gcm/gcm.h
@@ -111,9 +111,10 @@ class BOTAN_PUBLIC_API(2,0) GCM_Decryption final : public GCM_Mode
/**
* GCM's GHASH
-* Maybe a Transform?
+* This is not intended for general use, but is exposed to allow
+* shared code between GCM and GMAC
*/
-class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
+class BOTAN_PUBLIC_API(2,0) GHASH final : public SymmetricAlgorithm
{
public:
void set_associated_data(const uint8_t ad[], size_t ad_len);
@@ -127,6 +128,11 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
*/
void update(const uint8_t in[], size_t len);
+ /*
+ * Incremental update of associated data
+ */
+ void update_associated_data(const uint8_t ad[], size_t len);
+
secure_vector<uint8_t> final();
Key_Length_Specification key_spec() const override
@@ -137,24 +143,25 @@ class BOTAN_PUBLIC_API(2,0) GHASH : public SymmetricAlgorithm
void reset();
std::string name() const override { return "GHASH"; }
- protected:
+
void ghash_update(secure_vector<uint8_t>& x,
const uint8_t input[], size_t input_len);
void add_final_block(secure_vector<uint8_t>& x,
size_t ad_len, size_t pt_len);
-
- secure_vector<uint8_t> m_H;
- secure_vector<uint8_t> m_H_ad;
- secure_vector<uint8_t> m_ghash;
- size_t m_ad_len = 0;
-
private:
void key_schedule(const uint8_t key[], size_t key_len) override;
- void gcm_multiply(secure_vector<uint8_t>& x) const;
+ void gcm_multiply(secure_vector<uint8_t>& x,
+ const uint8_t input[],
+ size_t blocks);
+ secure_vector<uint8_t> m_H;
+ secure_vector<uint8_t> m_H_ad;
+ secure_vector<uint8_t> m_ghash;
secure_vector<uint8_t> m_nonce;
+ secure_vector<uint64_t> m_HM;
+ size_t m_ad_len = 0;
size_t m_text_len = 0;
};
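
The commit message also mentions refactoring GMAC to hold a GHASH rather than derive from it; that change lands outside src/lib/modes, so the call sequence below is only an assumption about how a GMAC-style caller might drive the new update_associated_data interface. It further assumes the caller has already computed H = E_K(0^128) and the 16-byte pre-counter block that start expects:

#include <botan/gcm.h>
#include <cstring>

// Hypothetical GMAC-style driver; not the actual GMAC refactor
void gmac_sketch(const uint8_t h_key[16],  // H = E_K(0^128), assumed precomputed
                 const uint8_t y0[16],     // pre-counter block, assumed precomputed
                 const uint8_t msg[], size_t msg_len,
                 uint8_t tag_out[16])
   {
   Botan::GHASH ghash;
   ghash.set_key(h_key, 16);
   ghash.start(y0, 16);
   ghash.update_associated_data(msg, msg_len); // MAC input counts as AD, not ciphertext
   const Botan::secure_vector<uint8_t> tag = ghash.final();
   std::memcpy(tag_out, tag.data(), tag.size());
   }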
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.cpp b/src/lib/modes/aead/gcm/pmull/pmull.cpp
index 54e841650..12d6ff7d1 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.cpp
+++ b/src/lib/modes/aead/gcm/pmull/pmull.cpp
@@ -10,62 +10,73 @@
namespace Botan {
BOTAN_FUNC_ISA("+crypto")
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+ const uint8_t input[], size_t blocks)
{
/*
* Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
*/
- const uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
+ uint64x2_t a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(x+8)), vrev64_u8(vld1_u8(x))));
const uint64x2_t b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(H+8)), vrev64_u8(vld1_u8(H))));
- uint64x2_t T0, T1, T2, T3, T4, T5;
-
- T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
- T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
- T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
- T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
-
- T1 = veorq_u64(T1, T2);
- T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
- T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
- T0 = veorq_u64(T0, T2);
- T3 = veorq_u64(T3, T1);
-
- T4 = vshrq_n_u64(T0, 31);
- T0 = vshlq_n_u64(T0, 1);
-
- T5 = vshrq_n_u64(T3, 31);
- T3 = vshlq_n_u64(T3, 1);
-
- T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
- T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
- T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
- T0 = vorrq_u64(T0, T4);
- T3 = vorrq_u64(T3, T5);
- T3 = vorrq_u64(T3, T2);
-
- T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
- T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
- T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
-
- T4 = veorq_u64(T4, T5);
- T4 = veorq_u64(T4, T2);
- T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
- T3 = veorq_u64(T3, T5);
- T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
- T0 = veorq_u64(T0, T4);
- T3 = veorq_u64(T3, T0);
-
- T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
- T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
- T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
- T3 = veorq_u64(T3, T1);
- T3 = veorq_u64(T3, T2);
- T3 = veorq_u64(T3, T4);
-
- vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(T3))));
- vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(T3))));
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ const uint64x2_t m64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8(vld1_u8(input+8)), vrev64_u8(vld1_u8(input))));
+ input += 16;
+
+ a64 = veorq_u64(a64, m64);
+
+ uint64x2_t T0, T1, T2, T3, T4, T5;
+
+ T0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 0));
+ T1 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 0));
+ T2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 0), vgetq_lane_u64(b64, 1));
+ T3 = (uint64x2_t)vmull_p64(vgetq_lane_u64(a64, 1), vgetq_lane_u64(b64, 1));
+
+ T1 = veorq_u64(T1, T2);
+ T2 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T1), 8));
+ T1 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T1), vdupq_n_u8(0), 8));
+ T0 = veorq_u64(T0, T2);
+ T3 = veorq_u64(T3, T1);
+
+ T4 = vshrq_n_u64(T0, 31);
+ T0 = vshlq_n_u64(T0, 1);
+
+ T5 = vshrq_n_u64(T3, 31);
+ T3 = vshlq_n_u64(T3, 1);
+
+ T2 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 12));
+ T5 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T5), 12));
+ T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 12));
+ T0 = vorrq_u64(T0, T4);
+ T3 = vorrq_u64(T3, T5);
+ T3 = vorrq_u64(T3, T2);
+
+ T4 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 31));
+ T5 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 30));
+ T2 = vreinterpretq_u64_u32(vshlq_n_u32(vreinterpretq_u32_u64(T0), 25));
+
+ T4 = veorq_u64(T4, T5);
+ T4 = veorq_u64(T4, T2);
+ T5 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(T4), vdupq_n_u8(0), 4));
+ T3 = veorq_u64(T3, T5);
+ T4 = vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(T4), 4));
+ T0 = veorq_u64(T0, T4);
+ T3 = veorq_u64(T3, T0);
+
+ T4 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 1));
+ T1 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 2));
+ T2 = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_u64(T0), 7));
+ T3 = veorq_u64(T3, T1);
+ T3 = veorq_u64(T3, T2);
+ T3 = veorq_u64(T3, T4);
+
+ a64 = T3;
+ }
+
+ vst1_u8(x+0, vrev64_u8(vreinterpret_u8_u64(vget_high_u64(a64))));
+ vst1_u8(x+8, vrev64_u8(vreinterpret_u8_u64(vget_low_u64(a64))));
}
}
diff --git a/src/lib/modes/aead/gcm/pmull/pmull.h b/src/lib/modes/aead/gcm/pmull/pmull.h
index 4ddcc8f27..638b845cd 100644
--- a/src/lib/modes/aead/gcm/pmull/pmull.h
+++ b/src/lib/modes/aead/gcm/pmull/pmull.h
@@ -12,7 +12,8 @@
namespace Botan {
-void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16],
+ const uint8_t input[], size_t blocks);
}