aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2017-01-29 17:32:12 -0500
committer Jack Lloyd <[email protected]> 2017-01-29 17:32:48 -0500
commit 0fac6866a7127b65cbbe6364c4678ff09c2d94a4 (patch)
tree 1323b86aba6c4c99077a6f0f7715f1e443399108 /src
parent 847dd4fca82f71dcf061e5cde4113a85ee1265ae (diff)
Transpose for 32-bit NEON
Different intrinsics API
Diffstat (limited to 'src')
-rw-r--r-- src/lib/utils/simd/simd_32.h 20
1 files changed, 19 insertions, 1 deletions
diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h
index 51d1f1b6b..01fadc54a 100644
--- a/src/lib/utils/simd/simd_32.h
+++ b/src/lib/utils/simd/simd_32.h
@@ -585,15 +585,33 @@ class SIMD_4x32 final
B2.m_vmx = vec_mergeh(T2, T3);
B3.m_vmx = vec_mergel(T2, T3);
#elif defined(BOTAN_SIMD_USE_NEON)
+
+#if defined(BOTAN_TARGET_ARCH_IS_ARM32)
+ // 32-bit NEON: vzipq_u32 returns both interleaved halves as a uint32x4x2_t (val[0], val[1])
+ const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon);
+ const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon);
+ const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
+ const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
+
+ B0.m_neon = O0.val[0];
+ B1.m_neon = O0.val[1];
+ B2.m_neon = O1.val[0];
+ B3.m_neon = O1.val[1];
+
+#elif defined(BOTAN_TARGET_ARCH_IS_ARM64)
const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon);
- const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon);
const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon);
+
+ const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon);
const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon);
B0.m_neon = vzip1q_u32(T0, T1);
B1.m_neon = vzip2q_u32(T0, T1);
+
B2.m_neon = vzip1q_u32(T2, T3);
B3.m_neon = vzip2q_u32(T2, T3);
+#endif
+
#else
// scalar
SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]);