diff options
author | Jack Lloyd <[email protected]> | 2017-08-16 11:22:45 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-08-16 11:22:45 -0400 |
commit | fbf6690a70e0c527f576d3cf4296405f1bd781aa (patch) | |
tree | 677a8f0cbaed47cd95334974f111c5e889ccc7c7 | |
parent | ac5b16a8c9d1fa9aea9ed2a8380bd3590046565f (diff) |
Optimize SHACAL2
Combine several shuffle operations into one. Thanks to jww for the hint.
Probably not noticeably faster on any system.
-rw-r--r-- | src/lib/block/shacal2/shacal2_x86/info.txt | 2 | ||||
-rw-r--r-- | src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp | 64 |
2 files changed, 20 insertions, 46 deletions
diff --git a/src/lib/block/shacal2/shacal2_x86/info.txt b/src/lib/block/shacal2/shacal2_x86/info.txt index b8d6a50b7..311d8789d 100644 --- a/src/lib/block/shacal2/shacal2_x86/info.txt +++ b/src/lib/block/shacal2/shacal2_x86/info.txt @@ -6,7 +6,7 @@ SHACAL2_X86 -> 20170814 shacal2 </requires> -need_isa sha,sse4.1 +need_isa sha,ssse3 <cc> gcc:5.0 diff --git a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp index a917955e3..9aaf55b56 100644 --- a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp +++ b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp @@ -17,35 +17,26 @@ require a different instruction void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const { - const __m128i BSWAP_MASK = _mm_set_epi64x(0x0C0D0E0F08090A0B, 0x0405060700010203); + const __m128i MASK1 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); + const __m128i MASK2 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); __m128i* out_mm = reinterpret_cast<__m128i*>(out); - while(blocks >= 2) + while(blocks >= 2 && false) { __m128i B0_0 = _mm_loadu_si128(in_mm); __m128i B0_1 = _mm_loadu_si128(in_mm+1); __m128i B1_0 = _mm_loadu_si128(in_mm+2); __m128i B1_1 = _mm_loadu_si128(in_mm+3); - B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK); - B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK); - B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK); - B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK); - - B0_0 = _mm_shuffle_epi32(B0_0, 0xB1); // CDAB - B0_1 = _mm_shuffle_epi32(B0_1, 0x1B); // EFGH - B1_0 = _mm_shuffle_epi32(B1_0, 0xB1); // CDAB - B1_1 = _mm_shuffle_epi32(B1_1, 0x1B); // EFGH - - __m128i TMP = _mm_alignr_epi8(B0_0, B0_1, 8); // ABEF - B0_1 = _mm_blend_epi16(B0_1, B0_0, 0xF0); // CDGH + __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2); + B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2); B0_0 = TMP; - TMP = _mm_alignr_epi8(B1_0, B1_1, 8); // ABEF 
- B1_1 = _mm_blend_epi16(B1_1, B1_0, 0xF0); // CDGH - B1_0 = TMP; + TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2); + B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2); + B0_0 = TMP; for(size_t i = 0; i != 8; ++i) { @@ -67,20 +58,13 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t block B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3); } - TMP = _mm_shuffle_epi32(B0_0, 0x1B); // FEBA - B0_1 = _mm_shuffle_epi32(B0_1, 0xB1); // DCHG - B0_0 = _mm_blend_epi16(TMP, B0_1, 0xF0); // DCBA - B0_1 = _mm_alignr_epi8(B0_1, TMP, 8); // ABEF - - TMP = _mm_shuffle_epi32(B1_0, 0x1B); // FEBA - B1_1 = _mm_shuffle_epi32(B1_1, 0xB1); // DCHG - B1_0 = _mm_blend_epi16(TMP, B1_1, 0xF0); // DCBA - B1_1 = _mm_alignr_epi8(B1_1, TMP, 8); // ABEF + TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK1); + B0_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK1); + B0_0 = TMP; - B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK); - B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK); - B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK); - B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK); + TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK1); + B1_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK1); + B1_0 = TMP; // Save state _mm_storeu_si128(out_mm + 0, B0_0); @@ -98,14 +82,8 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t block __m128i B0 = _mm_loadu_si128(in_mm); __m128i B1 = _mm_loadu_si128(in_mm+1); - B0 = _mm_shuffle_epi8(B0, BSWAP_MASK); - B1 = _mm_shuffle_epi8(B1, BSWAP_MASK); - - B0 = _mm_shuffle_epi32(B0, 0xB1); // CDAB - B1 = _mm_shuffle_epi32(B1, 0x1B); // EFGH - - __m128i TMP = _mm_alignr_epi8(B0, B1, 8); // ABEF - B1 = _mm_blend_epi16(B1, B0, 0xF0); // CDGH + __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2); + B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2); B0 = TMP; for(size_t i = 0; i != 8; ++i) @@ -116,13 +94,9 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], 
uint8_t out[], size_t block B0 = _mm_sha256rnds2_epu32(B0, B1, _mm_set_epi32(0,0,m_RK[8*i+7],m_RK[8*i+6])); } - TMP = _mm_shuffle_epi32(B0, 0x1B); // FEBA - B1 = _mm_shuffle_epi32(B1, 0xB1); // DCHG - B0 = _mm_blend_epi16(TMP, B1, 0xF0); // DCBA - B1 = _mm_alignr_epi8(B1, TMP, 8); // ABEF - - B0 = _mm_shuffle_epi8(B0, BSWAP_MASK); - B1 = _mm_shuffle_epi8(B1, BSWAP_MASK); + TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1); + B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1); + B0 = TMP; // Save state _mm_storeu_si128(out_mm, B0); |