diff options
author | Jack Lloyd <[email protected]> | 2017-08-14 07:59:21 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-08-14 07:59:21 -0400 |
commit | 9ab8ec3de32cad721b6b52401be67c5219c9f77b (patch) | |
tree | 82ac3677c15ff2af322926b114a15bacb9cb50c6 | |
parent | b8a691fb0d44eb67886179810768ac9453963cea (diff) |
Add 2x unrolling for SHACAL2 on x86
-rw-r--r-- | src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp | 73 |
1 file changed, 71 insertions, 2 deletions
diff --git a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp index 5a346698a..a917955e3 100644 --- a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp +++ b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp @@ -17,13 +17,82 @@ require a different instruction void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const { - // TODO x4 unrolling - const __m128i BSWAP_MASK = _mm_set_epi64x(0x0C0D0E0F08090A0B, 0x0405060700010203); const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); + while(blocks >= 2) + { + __m128i B0_0 = _mm_loadu_si128(in_mm); + __m128i B0_1 = _mm_loadu_si128(in_mm+1); + __m128i B1_0 = _mm_loadu_si128(in_mm+2); + __m128i B1_1 = _mm_loadu_si128(in_mm+3); + + B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK); + B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK); + B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK); + B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK); + + B0_0 = _mm_shuffle_epi32(B0_0, 0xB1); // CDAB + B0_1 = _mm_shuffle_epi32(B0_1, 0x1B); // EFGH + B1_0 = _mm_shuffle_epi32(B1_0, 0xB1); // CDAB + B1_1 = _mm_shuffle_epi32(B1_1, 0x1B); // EFGH + + __m128i TMP = _mm_alignr_epi8(B0_0, B0_1, 8); // ABEF + B0_1 = _mm_blend_epi16(B0_1, B0_0, 0xF0); // CDGH + B0_0 = TMP; + + TMP = _mm_alignr_epi8(B1_0, B1_1, 8); // ABEF + B1_1 = _mm_blend_epi16(B1_1, B1_0, 0xF0); // CDGH + B1_0 = TMP; + + for(size_t i = 0; i != 8; ++i) + { + const __m128i RK0 = _mm_set_epi32(0,0,m_RK[8*i+1],m_RK[8*i+0]); + const __m128i RK1 = _mm_set_epi32(0,0,m_RK[8*i+3],m_RK[8*i+2]); + const __m128i RK2 = _mm_set_epi32(0,0,m_RK[8*i+5],m_RK[8*i+4]); + const __m128i RK3 = _mm_set_epi32(0,0,m_RK[8*i+7],m_RK[8*i+6]); + + B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK0); + B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK0); + + B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK1); + B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK1); + + B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK2); + B1_1 = 
_mm_sha256rnds2_epu32(B1_1, B1_0, RK2); + + B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK3); + B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3); + } + + TMP = _mm_shuffle_epi32(B0_0, 0x1B); // FEBA + B0_1 = _mm_shuffle_epi32(B0_1, 0xB1); // DCHG + B0_0 = _mm_blend_epi16(TMP, B0_1, 0xF0); // DCBA + B0_1 = _mm_alignr_epi8(B0_1, TMP, 8); // ABEF + + TMP = _mm_shuffle_epi32(B1_0, 0x1B); // FEBA + B1_1 = _mm_shuffle_epi32(B1_1, 0xB1); // DCHG + B1_0 = _mm_blend_epi16(TMP, B1_1, 0xF0); // DCBA + B1_1 = _mm_alignr_epi8(B1_1, TMP, 8); // ABEF + + B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK); + B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK); + B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK); + B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK); + + // Save state + _mm_storeu_si128(out_mm + 0, B0_0); + _mm_storeu_si128(out_mm + 1, B0_1); + _mm_storeu_si128(out_mm + 2, B1_0); + _mm_storeu_si128(out_mm + 3, B1_1); + + blocks -= 2; + in_mm += 4; + out_mm += 4; + } + while(blocks) { __m128i B0 = _mm_loadu_si128(in_mm); |