Merge GH #2099 Misc performance tweaks

author: Jack Lloyd <[email protected]> 2019-09-07 10:33:11 -0400
committer: Jack Lloyd <[email protected]> 2019-09-07 10:33:11 -0400
commit: e75392bc03ff597138328b05093b499e77aeea4a (patch)
tree: 3885338f029cb35704e99c06df5bf883ed538eb6 /src
parent: ce9af808c11b6a2e033c42711f1b1dfeeb31ac4b (diff)
parent: b5dfa2c612ab83fd68b084f86f003a7fc4c36b50 (diff)
3 files changed, 67 insertions, 68 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index b7e82876c..10e1e5c26 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -105,17 +105,15 @@ const SIMD_4x32 rcon[10] = {
    SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
 };
 
-const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
-const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
-const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B);
-
 inline SIMD_4x32 low_nibs(SIMD_4x32 x)
    {
+   const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
    return lo_nibs_mask & x;
    }
 
 inline SIMD_4x32 high_nibs(SIMD_4x32 x)
    {
+   const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
    return (hi_nibs_mask & x).shr<4>();
    }
 
@@ -418,7 +416,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
    const SIMD_4x32 srx(sr[round_no % 4]);
 
-   SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0);
+   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
    SIMD_4x32 t2 = t;
    t = shuffle(t, mc_forward0);
    t2 = t ^ t2 ^ shuffle(t, mc_forward0);
@@ -461,7 +459,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
 
    k = shuffle(k, sr[round_no % 4]);
-   k ^= xor_5B;
+   k ^= SIMD_4x32::splat_u8(0x5B);
    return aes_schedule_transform(k, out_tr1, out_tr2);
    }
 
@@ -470,7 +468,7 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
    const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
    const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
 
-   k ^= xor_5B;
+   k ^= SIMD_4x32::splat_u8(0x5B);
    return aes_schedule_transform(k, deskew1, deskew2);
    }
 
@@ -478,20 +476,17 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
    {
    SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
    smeared ^= shift_elems_left<2>(smeared);
-   smeared ^= xor_5B;
-
-   SIMD_4x32 t = high_nibs(input1);
-   input1 = low_nibs(input1);
+   smeared ^= SIMD_4x32::splat_u8(0x5B);
 
-   SIMD_4x32 t2 = shuffle(k_inv2, input1);
+   const SIMD_4x32 Bh = high_nibs(input1);
+   SIMD_4x32 Bl = low_nibs(input1);
 
-   input1 ^= t;
+   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
 
-   SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
-   SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1);
+   Bl ^= Bh;
 
-   SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3);
-   SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+   SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 
    return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
    }
diff --git a/src/lib/stream/ctr/ctr.cpp b/src/lib/stream/ctr/ctr.cpp
index f1d8d97b8..ca187ea47 100644
--- a/src/lib/stream/ctr/ctr.cpp
+++ b/src/lib/stream/ctr/ctr.cpp
@@ -141,31 +141,27 @@ void CTR_BE::add_counter(const uint64_t counter)
 
    if(ctr_size == 4)
       {
-      size_t off = (BS - 4);
-      uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
+      const size_t off = (BS - 4);
+      const uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(low32, &m_counter[off]);
-         off += BS;
-         low32 += 1;
+         store_be(uint32_t(low32 + i), &m_counter[i*BS+off]);
          }
       }
    else if(ctr_size == 8)
       {
-      size_t off = (BS - 8);
-      uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
+      const size_t off = (BS - 8);
+      const uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(low64, &m_counter[off]);
-         off += BS;
-         low64 += 1;
+         store_be(uint64_t(low64 + i), &m_counter[i*BS+off]);
          }
       }
    else if(ctr_size == 16)
       {
-      size_t off = (BS - 16);
+      const size_t off = (BS - 16);
       uint64_t b0 = load_be<uint64_t>(&m_counter[off], 0);
       uint64_t b1 = load_be<uint64_t>(&m_counter[off], 1);
       b1 += counter;
@@ -173,9 +169,8 @@ void CTR_BE::add_counter(const uint64_t counter)
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(b0, &m_counter[off]);
-         store_be(b1, &m_counter[off+8]);
-         off += BS;
+         store_be(b0, &m_counter[i*BS+off]);
+         store_be(b1, &m_counter[i*BS+off+8]);
          b1 += 1;
          b0 += (b1 == 0); // carry
          }
diff --git a/src/lib/utils/mem_ops.h b/src/lib/utils/mem_ops.h
index 31a1efcc9..569cb409b 100644
--- a/src/lib/utils/mem_ops.h
+++ b/src/lib/utils/mem_ops.h
@@ -127,19 +127,24 @@ template<typename T> inline void copy_mem(T* out, const T* in, size_t n)
       }
    }
 
-template<typename T> inline void typecast_copy(uint8_t out[], T in)
+template<typename T> inline void typecast_copy(uint8_t out[], T in[], size_t N)
    {
-   std::memcpy(out, &in, sizeof(T));
+   std::memcpy(out, in, sizeof(T)*N);
    }
 
-template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
    {
-   std::memcpy(&out, in, sizeof(T));
+   std::memcpy(out, in, sizeof(T)*N);
    }
 
-template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
+template<typename T> inline void typecast_copy(uint8_t out[], T in)
    {
-   std::memcpy(out, in, sizeof(T)*N);
+   typecast_copy(out, &in, 1);
+   }
+
+template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+   {
+   typecast_copy(&out, in, 1);
    }
 
 /**
@@ -203,28 +208,27 @@ inline void xor_buf(uint8_t out[],
                     const uint8_t in[],
                     size_t length)
    {
-   while(length >= 16)
+   const size_t blocks = length - (length % 32);
+
+   for(size_t i = 0; i != blocks; i += 32)
       {
-      uint64_t x0, x1, y0, y1;
-
-      typecast_copy(x0, in);
-      typecast_copy(x1, in + 8);
-      typecast_copy(y0, out);
-      typecast_copy(y1, out + 8);
-
-      y0 ^= x0;
-      y1 ^= x1;
-      typecast_copy(out, y0);
-      typecast_copy(out + 8, y1);
-      out += 16; in += 16; length -= 16;
+      uint64_t x[4];
+      uint64_t y[4];
+
+      typecast_copy(x, out + i, 4);
+      typecast_copy(y, in + i, 4);
+
+      x[0] ^= y[0];
+      x[1] ^= y[1];
+      x[2] ^= y[2];
+      x[3] ^= y[3];
+
+      typecast_copy(out + i, x, 4);
       }
 
-   while(length > 0)
+   for(size_t i = blocks; i != length; ++i)
       {
-      out[0] ^= in[0];
-      out += 1;
-      in += 1;
-      length -= 1;
+      out[i] ^= in[i];
       }
    }
 
@@ -240,23 +244,28 @@ inline void xor_buf(uint8_t out[],
                     const uint8_t in2[],
                     size_t length)
    {
-   while(length >= 16)
+   const size_t blocks = length - (length % 32);
+
+   for(size_t i = 0; i != blocks; i += 32)
       {
-      uint64_t x0, x1, y0, y1;
-      typecast_copy(x0, in);
-      typecast_copy(x1, in + 8);
-      typecast_copy(y0, in2);
-      typecast_copy(y1, in2 + 8);
-
-      x0 ^= y0;
-      x1 ^= y1;
-      typecast_copy(out, x0);
-      typecast_copy(out + 8, x1);
-      out += 16; in += 16; in2 += 16; length -= 16;
+      uint64_t x[4];
+      uint64_t y[4];
+
+      typecast_copy(x, in + i, 4);
+      typecast_copy(y, in2 + i, 4);
+
+      x[0] ^= y[0];
+      x[1] ^= y[1];
+      x[2] ^= y[2];
+      x[3] ^= y[3];
+
+      typecast_copy(out + i, x, 4);
       }
 
-   for(size_t i = 0; i != length; ++i)
+   for(size_t i = blocks; i != length; ++i)
+      {
       out[i] = in[i] ^ in2[i];
+      }
    }
 
 template<typename Alloc, typename Alloc2>
author	Jack Lloyd <[email protected]>	2019-09-07 10:33:11 -0400
committer	Jack Lloyd <[email protected]>	2019-09-07 10:33:11 -0400
commit	e75392bc03ff597138328b05093b499e77aeea4a (patch)
tree	3885338f029cb35704e99c06df5bf883ed538eb6 /src
parent	ce9af808c11b6a2e033c42711f1b1dfeeb31ac4b (diff)
parent	b5dfa2c612ab83fd68b084f86f003a7fc4c36b50 (diff)