diff options
author | Jack Lloyd <[email protected]> | 2019-09-07 10:33:11 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2019-09-07 10:33:11 -0400 |
commit | e75392bc03ff597138328b05093b499e77aeea4a (patch) | |
tree | 3885338f029cb35704e99c06df5bf883ed538eb6 /src | |
parent | ce9af808c11b6a2e033c42711f1b1dfeeb31ac4b (diff) | |
parent | b5dfa2c612ab83fd68b084f86f003a7fc4c36b50 (diff) |
Merge GH #2099 Misc performance tweaks
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/block/aes/aes_vperm/aes_vperm.cpp | 29 | ||||
-rw-r--r-- | src/lib/stream/ctr/ctr.cpp | 23 | ||||
-rw-r--r-- | src/lib/utils/mem_ops.h | 83 |
3 files changed, 67 insertions, 68 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index b7e82876c..10e1e5c26 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -105,17 +105,15 @@ const SIMD_4x32 rcon[10] = {
    SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
 };
 
-const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
-const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
-const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B);
-
 inline SIMD_4x32 low_nibs(SIMD_4x32 x)
    {
+   const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
    return lo_nibs_mask & x;
    }
 
 inline SIMD_4x32 high_nibs(SIMD_4x32 x)
    {
+   const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
    return (hi_nibs_mask & x).shr<4>();
    }
 
@@ -418,7 +416,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
    const SIMD_4x32 srx(sr[round_no % 4]);
 
-   SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0);
+   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
    SIMD_4x32 t2 = t;
    t = shuffle(t, mc_forward0);
    t2 = t ^ t2 ^ shuffle(t, mc_forward0);
@@ -461,7 +459,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
 
    k = shuffle(k, sr[round_no % 4]);
-   k ^= xor_5B;
+   k ^= SIMD_4x32::splat_u8(0x5B);
    return aes_schedule_transform(k, out_tr1, out_tr2);
    }
 
@@ -470,7 +468,7 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
    const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
    const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
 
-   k ^= xor_5B;
+   k ^= SIMD_4x32::splat_u8(0x5B);
    return aes_schedule_transform(k, deskew1, deskew2);
    }
 
@@ -478,20 +476,17 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
    {
    SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
    smeared ^= shift_elems_left<2>(smeared);
-   smeared ^= xor_5B;
-
-   SIMD_4x32 t = high_nibs(input1);
-   input1 = low_nibs(input1);
+   smeared ^= SIMD_4x32::splat_u8(0x5B);
 
-   SIMD_4x32 t2 = shuffle(k_inv2, input1);
+   const SIMD_4x32 Bh = high_nibs(input1);
+   SIMD_4x32 Bl = low_nibs(input1);
 
-   input1 ^= t;
+   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
 
-   SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
-   SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1);
+   Bl ^= Bh;
 
-   SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3);
-   SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+   SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 
    return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
    }
diff --git a/src/lib/stream/ctr/ctr.cpp b/src/lib/stream/ctr/ctr.cpp
index f1d8d97b8..ca187ea47 100644
--- a/src/lib/stream/ctr/ctr.cpp
+++ b/src/lib/stream/ctr/ctr.cpp
@@ -141,31 +141,27 @@ void CTR_BE::add_counter(const uint64_t counter)
 
    if(ctr_size == 4)
       {
-      size_t off = (BS - 4);
-      uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
+      const size_t off = (BS - 4);
+      const uint32_t low32 = static_cast<uint32_t>(counter + load_be<uint32_t>(&m_counter[off], 0));
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(low32, &m_counter[off]);
-         off += BS;
-         low32 += 1;
+         store_be(uint32_t(low32 + i), &m_counter[i*BS+off]);
          }
       }
    else if(ctr_size == 8)
       {
-      size_t off = (BS - 8);
-      uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
+      const size_t off = (BS - 8);
+      const uint64_t low64 = counter + load_be<uint64_t>(&m_counter[off], 0);
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(low64, &m_counter[off]);
-         off += BS;
-         low64 += 1;
+         store_be(uint64_t(low64 + i), &m_counter[i*BS+off]);
          }
       }
    else if(ctr_size == 16)
       {
-      size_t off = (BS - 16);
+      const size_t off = (BS - 16);
       uint64_t b0 = load_be<uint64_t>(&m_counter[off], 0);
       uint64_t b1 = load_be<uint64_t>(&m_counter[off], 1);
       b1 += counter;
@@ -173,9 +169,8 @@ void CTR_BE::add_counter(const uint64_t counter)
 
       for(size_t i = 0; i != ctr_blocks; ++i)
          {
-         store_be(b0, &m_counter[off]);
-         store_be(b1, &m_counter[off+8]);
-         off += BS;
+         store_be(b0, &m_counter[i*BS+off]);
+         store_be(b1, &m_counter[i*BS+off+8]);
          b1 += 1;
          b0 += (b1 == 0); // carry
          }
diff --git a/src/lib/utils/mem_ops.h b/src/lib/utils/mem_ops.h
index 31a1efcc9..569cb409b 100644
--- a/src/lib/utils/mem_ops.h
+++ b/src/lib/utils/mem_ops.h
@@ -127,19 +127,24 @@ template<typename T> inline void copy_mem(T* out, const T* in, size_t n)
       }
    }
 
-template<typename T> inline void typecast_copy(uint8_t out[], T in)
+template<typename T> inline void typecast_copy(uint8_t out[], T in[], size_t N)
    {
-   std::memcpy(out, &in, sizeof(T));
+   std::memcpy(out, in, sizeof(T)*N);
    }
 
-template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
    {
-   std::memcpy(&out, in, sizeof(T));
+   std::memcpy(out, in, sizeof(T)*N);
    }
 
-template<typename T> inline void typecast_copy(T out[], const uint8_t in[], size_t N)
+template<typename T> inline void typecast_copy(uint8_t out[], T in)
    {
-   std::memcpy(out, in, sizeof(T)*N);
+   typecast_copy(out, &in, 1);
+   }
+
+template<typename T> inline void typecast_copy(T& out, const uint8_t in[])
+   {
+   typecast_copy(&out, in, 1);
    }
 
 /**
@@ -203,28 +208,27 @@ inline void xor_buf(uint8_t out[],
                     const uint8_t in[],
                     size_t length)
    {
-   while(length >= 16)
+   const size_t blocks = length - (length % 32);
+
+   for(size_t i = 0; i != blocks; i += 32)
       {
-      uint64_t x0, x1, y0, y1;
-
-      typecast_copy(x0, in);
-      typecast_copy(x1, in + 8);
-      typecast_copy(y0, out);
-      typecast_copy(y1, out + 8);
-
-      y0 ^= x0;
-      y1 ^= x1;
-      typecast_copy(out, y0);
-      typecast_copy(out + 8, y1);
-      out += 16; in += 16; length -= 16;
+      uint64_t x[4];
+      uint64_t y[4];
+
+      typecast_copy(x, out + i, 4);
+      typecast_copy(y, in + i, 4);
+
+      x[0] ^= y[0];
+      x[1] ^= y[1];
+      x[2] ^= y[2];
+      x[3] ^= y[3];
+
+      typecast_copy(out + i, x, 4);
       }
 
-   while(length > 0)
+   for(size_t i = blocks; i != length; ++i)
       {
-      out[0] ^= in[0];
-      out += 1;
-      in += 1;
-      length -= 1;
+      out[i] ^= in[i];
       }
    }
 
@@ -240,23 +244,28 @@ inline void xor_buf(uint8_t out[],
                     const uint8_t in2[],
                     size_t length)
    {
-   while(length >= 16)
+   const size_t blocks = length - (length % 32);
+
+   for(size_t i = 0; i != blocks; i += 32)
       {
-      uint64_t x0, x1, y0, y1;
-      typecast_copy(x0, in);
-      typecast_copy(x1, in + 8);
-      typecast_copy(y0, in2);
-      typecast_copy(y1, in2 + 8);
-
-      x0 ^= y0;
-      x1 ^= y1;
-      typecast_copy(out, x0);
-      typecast_copy(out + 8, x1);
-      out += 16; in += 16; in2 += 16; length -= 16;
+      uint64_t x[4];
+      uint64_t y[4];
+
+      typecast_copy(x, in + i, 4);
+      typecast_copy(y, in2 + i, 4);
+
+      x[0] ^= y[0];
+      x[1] ^= y[1];
+      x[2] ^= y[2];
+      x[3] ^= y[3];
+
+      typecast_copy(out + i, x, 4);
       }
 
-   for(size_t i = 0; i != length; ++i)
+   for(size_t i = blocks; i != length; ++i)
+      {
       out[i] = in[i] ^ in2[i];
+      }
    }
 
 template<typename Alloc, typename Alloc2>