about summary refs log tree commit diff stats
path: root/src/lib/block
diff options
context:
space:
mode:
author: Jack Lloyd <[email protected]> 2020-12-21 14:37:37 -0500
committer: Jack Lloyd <[email protected]> 2020-12-21 14:37:37 -0500
commitb92b27a88725067baeeb81ad9f8e561fb4d10e0b (patch)
tree6503d9a1dc65665e74b49248129e00fdb8ca7420 /src/lib/block
parente80783e85c5bd3d1ba3151557de1f897c204a104 (diff)
Unroll by 2x
Diffstat (limited to 'src/lib/block')
-rw-r--r--  src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp  47
1 files changed, 45 insertions, 2 deletions
diff --git a/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp b/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp
index 656335b6c..be9b62cc6 100644
--- a/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp
+++ b/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp
@@ -19,6 +19,49 @@ void SHACAL2::armv8_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blo
const uint32_t* input32 = reinterpret_cast<const uint32_t*>(in);
uint32_t* output32 = reinterpret_cast<uint32_t*>(out);
+ while(blocks >= 2)
+ {
+ uint32x4_t B0_0 = vld1q_u32(input32 + 0);
+ uint32x4_t B0_1 = vld1q_u32(input32 + 4);
+ uint32x4_t B1_0 = vld1q_u32(input32 + 8);
+ uint32x4_t B1_1 = vld1q_u32(input32 + 12);
+
+ B0_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_0)));
+ B0_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_1)));
+ B1_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_0)));
+ B1_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_1)));
+
+ for(size_t i = 0; i != 8; ++i)
+ {
+ const auto RK0 = vld1q_u32(&m_RK[8*i]);
+ const auto RK1 = vld1q_u32(&m_RK[8*i+4]);
+
+ const auto T0_0 = vsha256hq_u32(B0_0, B0_1, RK0);
+ const auto T0_1 = vsha256h2q_u32(B0_1, B0_0, RK0);
+ const auto T1_0 = vsha256hq_u32(B1_0, B1_1, RK0);
+ const auto T1_1 = vsha256h2q_u32(B1_1, B1_0, RK0);
+
+ B0_0 = vsha256hq_u32(T0_0, T0_1, RK1);
+ B0_1 = vsha256h2q_u32(T0_1, T0_0, RK1);
+ B1_0 = vsha256hq_u32(T1_0, T1_1, RK1);
+ B1_1 = vsha256h2q_u32(T1_1, T1_0, RK1);
+ }
+
+ B0_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_0)));
+ B0_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_1)));
+ B1_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_0)));
+ B1_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_1)));
+
+ vst1q_u32(&output32[ 0], B0_0);
+ vst1q_u32(&output32[ 4], B0_1);
+ vst1q_u32(&output32[ 8], B1_0);
+ vst1q_u32(&output32[12], B1_1);
+
+ blocks -= 2;
+ input32 += 16;
+ output32 += 16;
+ }
+
while(blocks > 0)
{
uint32x4_t B0 = vld1q_u32(input32 + 0);
@@ -46,8 +89,8 @@ void SHACAL2::armv8_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blo
vst1q_u32(&output32[4], B1);
blocks--;
- input32 += 32/4;
- output32 += 32/4;
+ input32 += 8;
+ output32 += 8;
}
}