diff options
author | Jack Lloyd <[email protected]> | 2020-12-21 14:37:37 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2020-12-21 14:37:37 -0500 |
commit | b92b27a88725067baeeb81ad9f8e561fb4d10e0b (patch) | |
tree | 6503d9a1dc65665e74b49248129e00fdb8ca7420 /src/lib/block | |
parent | e80783e85c5bd3d1ba3151557de1f897c204a104 (diff) |
Unroll by 2x
Diffstat (limited to 'src/lib/block')
-rw-r--r-- | src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp | 47 |
1 files changed, 45 insertions, 2 deletions
diff --git a/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp b/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp index 656335b6c..be9b62cc6 100644 --- a/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp +++ b/src/lib/block/shacal2/shacal2_armv8/shacal2_arvm8.cpp @@ -19,6 +19,49 @@ void SHACAL2::armv8_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blo const uint32_t* input32 = reinterpret_cast<const uint32_t*>(in); uint32_t* output32 = reinterpret_cast<uint32_t*>(out); + while(blocks >= 2) + { + uint32x4_t B0_0 = vld1q_u32(input32 + 0); + uint32x4_t B0_1 = vld1q_u32(input32 + 4); + uint32x4_t B1_0 = vld1q_u32(input32 + 8); + uint32x4_t B1_1 = vld1q_u32(input32 + 12); + + B0_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_0))); + B0_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_1))); + B1_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_0))); + B1_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_1))); + + for(size_t i = 0; i != 8; ++i) + { + const auto RK0 = vld1q_u32(&m_RK[8*i]); + const auto RK1 = vld1q_u32(&m_RK[8*i+4]); + + const auto T0_0 = vsha256hq_u32(B0_0, B0_1, RK0); + const auto T0_1 = vsha256h2q_u32(B0_1, B0_0, RK0); + const auto T1_0 = vsha256hq_u32(B1_0, B1_1, RK0); + const auto T1_1 = vsha256h2q_u32(B1_1, B1_0, RK0); + + B0_0 = vsha256hq_u32(T0_0, T0_1, RK1); + B0_1 = vsha256h2q_u32(T0_1, T0_0, RK1); + B1_0 = vsha256hq_u32(T1_0, T1_1, RK1); + B1_1 = vsha256h2q_u32(T1_1, T1_0, RK1); + } + + B0_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_0))); + B0_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B0_1))); + B1_0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_0))); + B1_1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B1_1))); + + vst1q_u32(&output32[ 0], B0_0); + vst1q_u32(&output32[ 4], B0_1); + vst1q_u32(&output32[ 8], B1_0); + vst1q_u32(&output32[12], B1_1); + + blocks -= 2; + input32 += 16; + output32 += 16; + } + while(blocks > 0) { uint32x4_t B0 = vld1q_u32(input32 + 0); @@ -46,8 +89,8 @@ void SHACAL2::armv8_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blo vst1q_u32(&output32[4], B1); blocks--; - input32 += 32/4; - output32 += 32/4; + input32 += 8; + output32 += 8; } } |