about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jack Lloyd <[email protected]> 2017-08-14 07:59:21 -0400
committer: Jack Lloyd <[email protected]> 2017-08-14 07:59:21 -0400
commit: 9ab8ec3de32cad721b6b52401be67c5219c9f77b (patch)
tree: 82ac3677c15ff2af322926b114a15bacb9cb50c6
parent: b8a691fb0d44eb67886179810768ac9453963cea (diff)
Add 2x unrolling for SHACAL2 on x86
-rw-r--r-- src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp 73
1 files changed, 71 insertions, 2 deletions
diff --git a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
index 5a346698a..a917955e3 100644
--- a/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
+++ b/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
@@ -17,13 +17,82 @@ require a different instruction
void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const
{
- // TODO x4 unrolling
-
const __m128i BSWAP_MASK = _mm_set_epi64x(0x0C0D0E0F08090A0B, 0x0405060700010203);
const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
__m128i* out_mm = reinterpret_cast<__m128i*>(out);
+ while(blocks >= 2)
+ {
+ __m128i B0_0 = _mm_loadu_si128(in_mm);
+ __m128i B0_1 = _mm_loadu_si128(in_mm+1);
+ __m128i B1_0 = _mm_loadu_si128(in_mm+2);
+ __m128i B1_1 = _mm_loadu_si128(in_mm+3);
+
+ B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK);
+ B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK);
+ B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK);
+ B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK);
+
+ B0_0 = _mm_shuffle_epi32(B0_0, 0xB1); // CDAB
+ B0_1 = _mm_shuffle_epi32(B0_1, 0x1B); // EFGH
+ B1_0 = _mm_shuffle_epi32(B1_0, 0xB1); // CDAB
+ B1_1 = _mm_shuffle_epi32(B1_1, 0x1B); // EFGH
+
+ __m128i TMP = _mm_alignr_epi8(B0_0, B0_1, 8); // ABEF
+ B0_1 = _mm_blend_epi16(B0_1, B0_0, 0xF0); // CDGH
+ B0_0 = TMP;
+
+ TMP = _mm_alignr_epi8(B1_0, B1_1, 8); // ABEF
+ B1_1 = _mm_blend_epi16(B1_1, B1_0, 0xF0); // CDGH
+ B1_0 = TMP;
+
+ for(size_t i = 0; i != 8; ++i)
+ {
+ const __m128i RK0 = _mm_set_epi32(0,0,m_RK[8*i+1],m_RK[8*i+0]);
+ const __m128i RK1 = _mm_set_epi32(0,0,m_RK[8*i+3],m_RK[8*i+2]);
+ const __m128i RK2 = _mm_set_epi32(0,0,m_RK[8*i+5],m_RK[8*i+4]);
+ const __m128i RK3 = _mm_set_epi32(0,0,m_RK[8*i+7],m_RK[8*i+6]);
+
+ B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK0);
+ B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK0);
+
+ B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK1);
+ B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK1);
+
+ B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK2);
+ B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK2);
+
+ B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK3);
+ B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3);
+ }
+
+ TMP = _mm_shuffle_epi32(B0_0, 0x1B); // FEBA
+ B0_1 = _mm_shuffle_epi32(B0_1, 0xB1); // DCHG
+ B0_0 = _mm_blend_epi16(TMP, B0_1, 0xF0); // DCBA
+ B0_1 = _mm_alignr_epi8(B0_1, TMP, 8); // HGFE
+
+ TMP = _mm_shuffle_epi32(B1_0, 0x1B); // FEBA
+ B1_1 = _mm_shuffle_epi32(B1_1, 0xB1); // DCHG
+ B1_0 = _mm_blend_epi16(TMP, B1_1, 0xF0); // DCBA
+ B1_1 = _mm_alignr_epi8(B1_1, TMP, 8); // HGFE
+
+ B0_0 = _mm_shuffle_epi8(B0_0, BSWAP_MASK);
+ B0_1 = _mm_shuffle_epi8(B0_1, BSWAP_MASK);
+ B1_0 = _mm_shuffle_epi8(B1_0, BSWAP_MASK);
+ B1_1 = _mm_shuffle_epi8(B1_1, BSWAP_MASK);
+
+ // Save state
+ _mm_storeu_si128(out_mm + 0, B0_0);
+ _mm_storeu_si128(out_mm + 1, B0_1);
+ _mm_storeu_si128(out_mm + 2, B1_0);
+ _mm_storeu_si128(out_mm + 3, B1_1);
+
+ blocks -= 2;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
while(blocks)
{
__m128i B0 = _mm_loadu_si128(in_mm);