diff options
author | Jack Lloyd <[email protected]> | 2017-01-29 17:32:12 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-01-29 17:32:48 -0500 |
commit | 0fac6866a7127b65cbbe6364c4678ff09c2d94a4 (patch) | |
tree | 1323b86aba6c4c99077a6f0f7715f1e443399108 /src/lib/utils | |
parent | 847dd4fca82f71dcf061e5cde4113a85ee1265ae (diff) |
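Add transpose for 32-bit NEON
ARM32 NEON has a different intrinsics API: it provides vzipq_u32 (returning a uint32x4x2_t pair) instead of the ARM64 vzip1q_u32/vzip2q_u32 intrinsics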
Diffstat (limited to 'src/lib/utils')
-rw-r--r-- | src/lib/utils/simd/simd_32.h | 20 |
1 files changed, 19 insertions, 1 deletions
diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h index 51d1f1b6b..01fadc54a 100644 --- a/src/lib/utils/simd/simd_32.h +++ b/src/lib/utils/simd/simd_32.h @@ -585,15 +585,33 @@ class SIMD_4x32 final B2.m_vmx = vec_mergeh(T2, T3); B3.m_vmx = vec_mergel(T2, T3); #elif defined(BOTAN_SIMD_USE_NEON) + +#if defined(BOTAN_TARGET_ARCH_IS_ARM32) + + const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon); + const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon); + const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]); + const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T3.val[1]); + + B0.m_neon = O0.val[0]; + B1.m_neon = O0.val[1]; + B2.m_neon = O1.val[0]; + B3.m_neon = O1.val[1]; + +#elif defined(BOTAN_TARGET_ARCH_IS_ARM64) const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon); - const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon); const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon); + + const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon); const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon); B0.m_neon = vzip1q_u32(T0, T1); B1.m_neon = vzip2q_u32(T0, T1); + B2.m_neon = vzip1q_u32(T2, T3); B3.m_neon = vzip2q_u32(T2, T3); +#endif + #else // scalar SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]); |