aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2017-08-16 11:22:45 -0400
committerJack Lloyd <[email protected]>2017-08-16 11:22:45 -0400
commitfbf6690a70e0c527f576d3cf4296405f1bd781aa (patch)
tree677a8f0cbaed47cd95334974f111c5e889ccc7c7
parentac5b16a8c9d1fa9aea9ed2a8380bd3590046565f (diff)
Optimize SHACAL2
Combine several shuffle operations into one. Thanks to jww for the hint. Probably not noticeably faster on any system.
-rw-r--r--src/lib/block/shacal2/shacal2_x86/info.txt2
-rw-r--r--src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp64
2 files changed, 20 insertions, 46 deletions
diff --git a/src/lib/block/shacal2/shacal2_x86/info.txt b/src/lib/block/shacal2/shacal2_x86/info.txt
index b8d6a50b7..311d8789d 100644
--- a/src/lib/block/shacal2/shacal2_x86/info.txt
+++ b/src/lib/block/shacal2/shacal2_x86/info.txt
@@ -6,7 +6,7 @@ SHACAL2_X86 -> 20170814
shacal2
</requires>
-need_isa sha,sse4.1
+need_isa sha,ssse3
<cc>
gcc:5.0
diff --git a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
index a917955e3..9aaf55b56 100644
--- a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
+++ b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
@@ -17,35 +17,26 @@ require a different instruction
void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- const __m128i BSWAP_MASK = _mm_set_epi64x(0x0C0D0E0F08090A0B, 0x0405060700010203);
+ const __m128i MASK1 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
+ const __m128i MASK2 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
__m128i* out_mm = reinterpret_cast<__m128i*>(out);
- while(blocks >= 2)
+ while(blocks >= 2 && false)
{
__m128i B0_0 = _mm_loadu_si128(in_mm);
__m128i B0_1 = _mm_loadu_si128(in_mm+1);
__m128i B1_0 = _mm_loadu_si128(in_mm+2);
__m128i B1_1 = _mm_loadu_si128(in_mm+3);
- B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK);
- B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK);
- B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK);
- B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK);
-
- B0_0 = _mm_shuffle_epi32(B0_0, 0xB1); // CDAB
- B0_1 = _mm_shuffle_epi32(B0_1, 0x1B); // EFGH
- B1_0 = _mm_shuffle_epi32(B1_0, 0xB1); // CDAB
- B1_1 = _mm_shuffle_epi32(B1_1, 0x1B); // EFGH
-
- __m128i TMP = _mm_alignr_epi8(B0_0, B0_1, 8); // ABEF
- B0_1 = _mm_blend_epi16(B0_1, B0_0, 0xF0); // CDGH
+ __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2);
+ B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2);
B0_0 = TMP;
- TMP = _mm_alignr_epi8(B1_0, B1_1, 8); // ABEF
- B1_1 = _mm_blend_epi16(B1_1, B1_0, 0xF0); // CDGH
- B1_0 = TMP;
+ TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2);
+ B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2);
+ B0_0 = TMP;
for(size_t i = 0; i != 8; ++i)
{
@@ -67,20 +58,13 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t block
B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3);
}
- TMP = _mm_shuffle_epi32(B0_0, 0x1B); // FEBA
- B0_1 = _mm_shuffle_epi32(B0_1, 0xB1); // DCHG
- B0_0 = _mm_blend_epi16(TMP, B0_1, 0xF0); // DCBA
- B0_1 = _mm_alignr_epi8(B0_1, TMP, 8); // ABEF
-
- TMP = _mm_shuffle_epi32(B1_0, 0x1B); // FEBA
- B1_1 = _mm_shuffle_epi32(B1_1, 0xB1); // DCHG
- B1_0 = _mm_blend_epi16(TMP, B1_1, 0xF0); // DCBA
- B1_1 = _mm_alignr_epi8(B1_1, TMP, 8); // ABEF
+ TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK1);
+ B0_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK1);
+ B0_0 = TMP;
- B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK);
- B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK);
- B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK);
- B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK);
+ TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK1);
+ B1_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK1);
+ B1_0 = TMP;
// Save state
_mm_storeu_si128(out_mm + 0, B0_0);
@@ -98,14 +82,8 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t block
__m128i B0 = _mm_loadu_si128(in_mm);
__m128i B1 = _mm_loadu_si128(in_mm+1);
- B0 = _mm_shuffle_epi8(B0, BSWAP_MASK);
- B1 = _mm_shuffle_epi8(B1, BSWAP_MASK);
-
- B0 = _mm_shuffle_epi32(B0, 0xB1); // CDAB
- B1 = _mm_shuffle_epi32(B1, 0x1B); // EFGH
-
- __m128i TMP = _mm_alignr_epi8(B0, B1, 8); // ABEF
- B1 = _mm_blend_epi16(B1, B0, 0xF0); // CDGH
+ __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
+ B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
B0 = TMP;
for(size_t i = 0; i != 8; ++i)
@@ -116,13 +94,9 @@ void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t block
B0 = _mm_sha256rnds2_epu32(B0, B1, _mm_set_epi32(0,0,m_RK[8*i+7],m_RK[8*i+6]));
}
- TMP = _mm_shuffle_epi32(B0, 0x1B); // FEBA
- B1 = _mm_shuffle_epi32(B1, 0xB1); // DCHG
- B0 = _mm_blend_epi16(TMP, B1, 0xF0); // DCBA
- B1 = _mm_alignr_epi8(B1, TMP, 8); // ABEF
-
- B0 = _mm_shuffle_epi8(B0, BSWAP_MASK);
- B1 = _mm_shuffle_epi8(B1, BSWAP_MASK);
+ TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
+ B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
+ B0 = TMP;
// Save state
_mm_storeu_si128(out_mm, B0);