about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2019-09-07 10:33:11 -0400
committer Jack Lloyd <[email protected]> 2019-09-07 10:33:11 -0400
commit e75392bc03ff597138328b05093b499e77aeea4a (patch)
tree 3885338f029cb35704e99c06df5bf883ed538eb6 /src
parent ce9af808c11b6a2e033c42711f1b1dfeeb31ac4b (diff)
parent b5dfa2c612ab83fd68b084f86f003a7fc4c36b50 (diff)
Merge GH #2099 Misc performance tweaks
Diffstat (limited to 'src')
-rw-r--r-- src/lib/block/aes/aes_vperm/aes_vperm.cpp 29
-rw-r--r-- src/lib/stream/ctr/ctr.cpp 23
-rw-r--r-- src/lib/utils/mem_ops.h 83
3 files changed, 67 insertions, 68 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index b7e82876c..10e1e5c26 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -105,17 +105,15 @@ const SIMD_4x32 rcon[10] = {
SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
};
-const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
-const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
-const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B);
-
inline SIMD_4x32 low_nibs(SIMD_4x32 x)
{
+ const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
return lo_nibs_mask & x;
}
inline SIMD_4x32 high_nibs(SIMD_4x32 x)
{
+ const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
return (hi_nibs_mask & x).shr<4>();
}
@@ -418,7 +416,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
const SIMD_4x32 srx(sr[round_no % 4]);
- SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0);
+ SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
SIMD_4x32 t2 = t;
t = shuffle(t, mc_forward0);
t2 = t ^ t2 ^ shuffle(t, mc_forward0);
@@ -461,7 +459,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
k = shuffle(k, sr[round_no % 4]);
- k ^= xor_5B;
+ k ^= SIMD_4x32::splat_u8(0x5B);
return aes_schedule_transform(k, out_tr1, out_tr2);
}
@@ -470,7 +468,7 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
- k ^= xor_5B;
+ k ^= SIMD_4x32::splat_u8(0x5B);
return aes_schedule_transform(k, deskew1, deskew2);
}
@@ -478,20 +476,17 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
{
SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
smeared ^= shift_elems_left<2>(smeared);
- smeared ^= xor_5B;
-
- SIMD_4x32 t = high_nibs(input1);
- input1 = low_nibs(input1);
+ smeared ^= SIMD_4x32::splat_u8(0x5B);
- SIMD_4x32 t2 = shuffle(k_inv2, input1);
+ const SIMD_4x32 Bh = high_nibs(input1);
+ SIMD_4x32 Bl = low_nibs(input1);
- input1 ^= t;
+ const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
- SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
- SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1);
+ Bl ^= Bh;
- SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3);
- SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+ SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
}
diff --git a/src/lib/stream/ctr/ctr.cpp b/src/lib/stream/ctr/ctr.cpp
index f1d8d97b8..ca187ea47 100644
--- a/src/lib/stream/ctr/ctr.cpp
+++ b/src/lib/stream/ctr/ctr.cpp
@@ -141,31 +141,27 @@ void CTR_BE::add_counter(const uint64_t counter)
if(ctr_size == 4)
{
- size_t off = (BS - 4);
- uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
+ const size_t off = (BS - 4);
+ const uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
for(size_t i = 0; i != ctr_blocks; ++i)
{
- store_be(low32, &m_counter[off]);
- off += BS;
- low32 += 1;
+ store_be(uint32_t(low32 + i), &m_counter[i*BS+off]);
}
}
else if(ctr_size == 8)
{
- size_t off = (BS - 8);
- uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
+ const size_t off = (BS - 8);
+ const uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
for(size_t i = 0; i != ctr_blocks; ++i)
{
- store_be(low64, &m_counter[off]);
- off += BS;
- low64 += 1;
+ store_be(uint64_t(low64 + i), &m_counter[i*BS+off]);
}
}
else if(ctr_size == 16)
{
- size_t off = (BS - 16);
+ const size_t off = (BS - 16);
uint64_t b0 = load_be<uint64_t>(&m_counter[off], 0);
uint64_t b1 = load_be<uint64_t>(&m_counter[off], 1);
b1 += counter;
@@ -173,9 +169,8 @@ void CTR_BE::add_counter(const uint64_t counter)
for(size_t i = 0; i != ctr_blocks; ++i)
{
- store_be(b0, &m_counter[off]);
- store_be(b1, &m_counter[off+8]);
- off += BS;
+ store_be(b0, &m_counter[i*BS+off]);
+ store_be(b1, &m_counter[i*BS+off+8]);
b1 += 1;
b0 += (b1 == 0); // carry
}
diff --git a/src/lib/utils/mem_ops.h b/src/lib/utils/mem_ops.h
index 31a1efcc9..569cb409b 100644
--- a/src/lib/utils/mem_ops.h
+++ b/src/lib/utils/mem_ops.h
@@ -127,19 +127,24 @@ template<typename T> inline void copy_mem(T* out, const T* in, size_t n)
}
}
-template<typename T> inline void typecast_copy(uint8_t out[], T in)
+template<typename T> inline void typecast_copy(uint8_t out[], T in[], size_t N)
{
- std::memcpy(out, &in, sizeof(T));
+ std::memcpy(out, in, sizeof(T)*N);
}
-template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
{
- std::memcpy(&out, in, sizeof(T));
+ std::memcpy(out, in, sizeof(T)*N);
}
-template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
+template<typename T> inline void typecast_copy(uint8_t out[], T in)
{
- std::memcpy(out, in, sizeof(T)*N);
+ typecast_copy(out, &in, 1);
+ }
+
+template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+ {
+ typecast_copy(&out, in, 1);
}
/**
@@ -203,28 +208,27 @@ inline void xor_buf(uint8_t out[],
const uint8_t in[],
size_t length)
{
- while(length >= 16)
+ const size_t blocks = length - (length % 32);
+
+ for(size_t i = 0; i != blocks; i += 32)
{
- uint64_t x0, x1, y0, y1;
-
- typecast_copy(x0, in);
- typecast_copy(x1, in + 8);
- typecast_copy(y0, out);
- typecast_copy(y1, out + 8);
-
- y0 ^= x0;
- y1 ^= x1;
- typecast_copy(out, y0);
- typecast_copy(out + 8, y1);
- out += 16; in += 16; length -= 16;
+ uint64_t x[4];
+ uint64_t y[4];
+
+ typecast_copy(x, out + i, 4);
+ typecast_copy(y, in + i, 4);
+
+ x[0] ^= y[0];
+ x[1] ^= y[1];
+ x[2] ^= y[2];
+ x[3] ^= y[3];
+
+ typecast_copy(out + i, x, 4);
}
- while(length > 0)
+ for(size_t i = blocks; i != length; ++i)
{
- out[0] ^= in[0];
- out += 1;
- in += 1;
- length -= 1;
+ out[i] ^= in[i];
}
}
@@ -240,23 +244,28 @@ inline void xor_buf(uint8_t out[],
const uint8_t in2[],
size_t length)
{
- while(length >= 16)
+ const size_t blocks = length - (length % 32);
+
+ for(size_t i = 0; i != blocks; i += 32)
{
- uint64_t x0, x1, y0, y1;
- typecast_copy(x0, in);
- typecast_copy(x1, in + 8);
- typecast_copy(y0, in2);
- typecast_copy(y1, in2 + 8);
-
- x0 ^= y0;
- x1 ^= y1;
- typecast_copy(out, x0);
- typecast_copy(out + 8, x1);
- out += 16; in += 16; in2 += 16; length -= 16;
+ uint64_t x[4];
+ uint64_t y[4];
+
+ typecast_copy(x, in + i, 4);
+ typecast_copy(y, in2 + i, 4);
+
+ x[0] ^= y[0];
+ x[1] ^= y[1];
+ x[2] ^= y[2];
+ x[3] ^= y[3];
+
+ typecast_copy(out + i, x, 4);
}
- for(size_t i = 0; i != length; ++i)
+ for(size_t i = blocks; i != length; ++i)
+ {
out[i] = in[i] ^ in2[i];
+ }
}
template<typename Alloc, typename Alloc2>