diff options
-rw-r--r-- | src/lib/utils/simd/simd_32.h | 20 |
1 files changed, 19 insertions, 1 deletions
diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h index 51d1f1b6b..01fadc54a 100644 --- a/src/lib/utils/simd/simd_32.h +++ b/src/lib/utils/simd/simd_32.h @@ -585,15 +585,33 @@ class SIMD_4x32 final B2.m_vmx = vec_mergeh(T2, T3); B3.m_vmx = vec_mergel(T2, T3); #elif defined(BOTAN_SIMD_USE_NEON) + +#if defined(BOTAN_TARGET_ARCH_IS_ARM32) + + const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon); + const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon); + const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]); + const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]); + + B0.m_neon = O0.val[0]; + B1.m_neon = O0.val[1]; + B2.m_neon = O1.val[0]; + B3.m_neon = O1.val[1]; + +#elif defined(BOTAN_TARGET_ARCH_IS_ARM64) const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon); - const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon); const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon); + + const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon); const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon); B0.m_neon = vzip1q_u32(T0, T1); B1.m_neon = vzip2q_u32(T0, T1); + B2.m_neon = vzip1q_u32(T2, T3); B3.m_neon = vzip2q_u32(T2, T3); +#endif + #else // scalar SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]); |