diff options
author | Jack Lloyd <[email protected]> | 2017-01-27 20:44:25 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-01-29 17:32:47 -0500 |
commit | 2b848242fe4f6c7023e86d7e916c73af30fb9cf0 (patch) | |
tree | ab138d0a73230de92d123a0afce38c7e04453395 | |
parent | 3cf1917b4e0ab45f853f1fe7cb7faed342987dd9 (diff) |
Add support for NEON in SIMD_4x32
Tested on qemu-aarch64
-rwxr-xr-x | configure.py | 4 | ||||
-rw-r--r-- | src/build-data/arch/arm64.txt | 4 | ||||
-rw-r--r-- | src/build-data/cc/gcc.txt | 2 | ||||
-rw-r--r-- | src/lib/utils/cpuid.h | 11 | ||||
-rw-r--r-- | src/lib/utils/simd/simd_32.h | 532 | ||||
-rw-r--r-- | src/tests/test_simd.cpp | 161 | ||||
-rw-r--r-- | src/tests/tests.cpp | 7 | ||||
-rw-r--r-- | src/tests/tests.h | 6 |
8 files changed, 534 insertions, 193 deletions
diff --git a/configure.py b/configure.py index 2a3be5ce5..83fe6b053 100755 --- a/configure.py +++ b/configure.py @@ -261,7 +261,7 @@ def process_command_line(args): target_group.add_option('--without-os-features', action='append', metavar='FEAT', help='specify OS features to disable') - for isa_extn_name in ['SSE2', 'SSSE3', 'AVX2', 'AES-NI', 'AltiVec']: + for isa_extn_name in ['SSE2', 'SSSE3', 'AVX2', 'AES-NI', 'AltiVec', 'NEON']: isa_extn = isa_extn_name.lower() target_group.add_option('--disable-%s' % (isa_extn), @@ -1349,7 +1349,7 @@ def gen_makefile_lists(var, build_config, options, modules, cc, arch, osinfo): def simd_dependencies(): - for simd32_impl in ['sse2', 'altivec']: + for simd32_impl in ['sse2', 'altivec', 'neon']: if simd32_impl in arch.isa_extensions and cc.isa_flags_for(simd32_impl, arch.basename) is not None: return [simd32_impl] diff --git a/src/build-data/arch/arm64.txt b/src/build-data/arch/arm64.txt index 362cf88d3..f556c864e 100644 --- a/src/build-data/arch/arm64.txt +++ b/src/build-data/arch/arm64.txt @@ -10,3 +10,7 @@ aarch64 <submodels> armv8-a </submodels> + +<isa_extensions> +neon +</isa_extensions> diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index 0a53e15c1..08d089d03 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -67,6 +67,8 @@ rdrand -> "-mrdrnd" rdseed -> "-mrdseed" sha -> "-msha" altivec -> "-maltivec" + +neon -> "" </isa_flags> <mach_opt> diff --git a/src/lib/utils/cpuid.h b/src/lib/utils/cpuid.h index 98b5e14ce..f618ac35c 100644 --- a/src/lib/utils/cpuid.h +++ b/src/lib/utils/cpuid.h @@ -79,6 +79,15 @@ class BOTAN_DLL CPUID return g_little_endian; } + static bool is_big_endian() + { + /* + * We do not support PDP endian, so the endian is + * always either big or little. + */ + return is_little_endian() == false; + } + enum CPUID_bits : uint64_t { #if defined(BOTAN_TARGET_CPU_IS_X86_FAMILY) // These values have no relation to cpuid bitfields @@ -261,7 +270,7 @@ class BOTAN_DLL CPUID /* * Don't call this function, use CPUID::has_xxx above - * It should have been private. + * It is only exposed for the tests. */ static bool has_cpuid_bit(CPUID_bits elem) { diff --git a/src/lib/utils/simd/simd_32.h b/src/lib/utils/simd/simd_32.h index 2308da652..11ed709f6 100644 --- a/src/lib/utils/simd/simd_32.h +++ b/src/lib/utils/simd/simd_32.h @@ -1,6 +1,6 @@ /* * Lightweight wrappers for SIMD operations -* (C) 2009,2011,2016 Jack Lloyd +* (C) 2009,2011,2016,2017 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ @@ -11,6 +11,7 @@ #include <botan/types.h> #include <botan/loadstor.h> #include <botan/bswap.h> +#include <botan/cpuid.h> #if defined(BOTAN_TARGET_SUPPORTS_SSE2) #include <emmintrin.h> @@ -21,68 +22,106 @@ #undef vector #undef bool #define BOTAN_SIMD_USE_ALTIVEC -#endif -// TODO: NEON support +#elif defined(BOTAN_TARGET_SUPPORTS_NEON) + #include <arm_neon.h> + #define BOTAN_SIMD_USE_NEON +#endif namespace Botan { /** +* 4x32 bit SIMD register +* * This class is not a general purpose SIMD type, and only offers * instructions needed for evaluation of specific crypto primitives. * For example it does not currently have equality operators of any * kind. +* +* Implemented for SSE2, VMX (Altivec), and NEON. */ -class SIMD_4x32 +class SIMD_4x32 final { public: + SIMD_4x32(const SIMD_4x32& other) = default; + SIMD_4x32(SIMD_4x32&& other) = default; + SIMD_4x32& operator=(const SIMD_4x32& other) = default; + SIMD_4x32& operator=(SIMD_4x32&& other) = default; + + /** + * Zero initialize SIMD register with 4 32-bit elements + */ SIMD_4x32() // zero initialized { -#if defined(BOTAN_SIMD_USE_SSE2) || defined(BOTAN_SIMD_USE_ALTIVEC) - ::memset(&m_reg, 0, sizeof(m_reg)); +#if defined(BOTAN_SIMD_USE_SSE2) + ::memset(&m_sse, 0, sizeof(m_sse)); +#elif defined(BOTAN_SIMD_USE_ALTIVEC) + m_vmx = vec_splat_u32(0); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vdupq_n_u32(0); #else - ::memset(m_reg, 0, sizeof(m_reg)); + ::memset(m_scalar, 0, sizeof(m_scalar)); #endif } + /** + * Load SIMD register with 4 32-bit elements + */ explicit SIMD_4x32(const uint32_t B[4]) { #if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B)); + m_sse = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B)); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = (__vector unsigned int){B[0], B[1], B[2], B[3]}; + m_vmx = (__vector unsigned int){B[0], B[1], B[2], B[3]}; +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vld1q_u32(B); #else - m_reg[0] = B[0]; - m_reg[1] = B[1]; - m_reg[2] = B[2]; - m_reg[3] = B[3]; + m_scalar[0] = B[0]; + m_scalar[1] = B[1]; + m_scalar[2] = B[2]; + m_scalar[3] = B[3]; #endif } + /** + * Load SIMD register with 4 32-bit elements + */ SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) { #if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_set_epi32(B0, B1, B2, B3); + m_sse = _mm_set_epi32(B3, B2, B1, B0); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = (__vector unsigned int){B0, B1, B2, B3}; + m_vmx = (__vector unsigned int){B0, B1, B2, B3}; +#elif defined(BOTAN_SIMD_USE_NEON) + // Better way to do this? + const uint32_t B[4] = { B0, B1, B2, B3 }; + m_neon = vld1q_u32(B); #else - m_reg[0] = B0; - m_reg[1] = B1; - m_reg[2] = B2; - m_reg[3] = B3; + m_scalar[0] = B0; + m_scalar[1] = B1; + m_scalar[2] = B2; + m_scalar[3] = B3; #endif } + /** + * Load SIMD register with one 32-bit element repeated + */ static SIMD_4x32 splat(uint32_t B) { #if defined(BOTAN_SIMD_USE_SSE2) return SIMD_4x32(_mm_set1_epi32(B)); +#elif defined(BOTAN_SIMD_USE_ARM) + return SIMD_4x32(vdupq_n_u32(B)); #else return SIMD_4x32(B, B, B, B); #endif } + /** + * Load a SIMD register with little-endian convention + */ static SIMD_4x32 load_le(const void* in) { #if defined(BOTAN_SIMD_USE_SSE2) @@ -95,74 +134,121 @@ class SIMD_4x32 __vector unsigned char perm = vec_lvsl(0, in_32); -#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) - perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector -#endif + if(CPUID::is_big_endian()) + { + perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector + } R0 = vec_perm(R0, R1, perm); return SIMD_4x32(R0); +#elif defined(BOTAN_SIMD_USE_NEON) + + uint32_t in32[4]; + std::memcpy(in32, in, 16); + if(CPUID::is_big_endian()) + { + bswap_4(in32); + } + return SIMD_4x32(vld1q_u32(in32)); + #else SIMD_4x32 out; - Botan::load_le(out.m_reg, static_cast<const uint8_t*>(in), 4); + Botan::load_le(out.m_scalar, static_cast<const uint8_t*>(in), 4); return out; #endif } + /** + * Load a SIMD register with big-endian convention + */ static SIMD_4x32 load_be(const void* in) { #if defined(BOTAN_SIMD_USE_SSE2) + return load_le(in).bswap(); + #elif defined(BOTAN_SIMD_USE_ALTIVEC) - const uint32_t* in_32 = static_cast<const uint32_t*>(in); + const uint32_t* in_32 = static_cast<const uint32_t*>(in); __vector unsigned int R0 = vec_ld(0, in_32); __vector unsigned int R1 = vec_ld(12, in_32); - __vector unsigned char perm = vec_lvsl(0, in_32); -#if defined(BOTAN_TARGET_CPU_IS_LITTLE_ENDIAN) - perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector -#endif + if(CPUID::is_little_endian()) + { + perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector + } R0 = vec_perm(R0, R1, perm); - return SIMD_4x32(R0); +#elif defined(BOTAN_SIMD_USE_NEON) + + uint32_t in32[4]; + std::memcpy(in32, in, 16); + if(CPUID::is_little_endian()) + { + bswap_4(in32); + } + return SIMD_4x32(vld1q_u32(in32)); + #else SIMD_4x32 out; - Botan::load_be(out.m_reg, static_cast<const uint8_t*>(in), 4); + Botan::load_be(out.m_scalar, static_cast<const uint8_t*>(in), 4); return out; #endif } + /** + * Load a SIMD register with little-endian convention + */ void store_le(uint8_t out[]) const { #if defined(BOTAN_SIMD_USE_SSE2) - _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_reg); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_sse); + #elif defined(BOTAN_SIMD_USE_ALTIVEC) - __vector unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(nullptr)); -#if defined(BOTAN_TARGET_CPU_IS_BIG_ENDIAN) - perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector -#endif + __vector unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(nullptr)); + if(CPUID::is_big_endian()) + { + perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector + } union { __vector unsigned int V; uint32_t R[4]; } vec; - - vec.V = vec_perm(m_reg, m_reg, perm); - + vec.V = vec_perm(m_vmx, m_vmx, perm); Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); + +#elif defined(BOTAN_SIMD_USE_NEON) + + if(CPUID::is_big_endian()) + { + SIMD_4x32 swap = bswap(); + swap.store_be(out); + } + else + { + uint32_t out32[4] = { 0 }; + vst1q_u32(out32, m_neon); + copy_out_le(out, 16, out32); + } #else - Botan::store_le(out, m_reg[0], m_reg[1], m_reg[2], m_reg[3]); + Botan::store_le(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]); #endif } + /** + * Load a SIMD register with big-endian convention + */ void store_be(uint8_t out[]) const { #if defined(BOTAN_SIMD_USE_SSE2) + bswap().store_le(out); #elif defined(BOTAN_SIMD_USE_ALTIVEC) @@ -171,195 +257,225 @@ class SIMD_4x32 __vector unsigned int V; uint32_t R[4]; } vec; + vec.V = m_vmx; + Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); - vec.V = m_reg; +#elif defined(BOTAN_SIMD_USE_NEON) + + if(CPUID::is_little_endian()) + { + SIMD_4x32 swap = bswap(); + swap.store_le(out); + } + else + { + uint32_t out32[4] = { 0 }; + vst1q_u32(out32, m_neon); + copy_out_be(out, 16, out32); + } - Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]); #else - Botan::store_be(out, m_reg[0], m_reg[1], m_reg[2], m_reg[3]); + Botan::store_be(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]); #endif } + /** + * Rotate each element of SIMD register n bits left + */ void rotate_left(size_t rot) { #if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_or_si128(_mm_slli_epi32(m_reg, static_cast<int>(rot)), - _mm_srli_epi32(m_reg, static_cast<int>(32-rot))); + + m_sse = _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(rot)), + _mm_srli_epi32(m_sse, static_cast<int>(32-rot))); #elif defined(BOTAN_SIMD_USE_ALTIVEC) + const unsigned int r = static_cast<unsigned int>(rot); - m_reg = vec_rl(m_reg, (__vector unsigned int){r, r, r, r}); + m_vmx = vec_rl(m_vmx, (__vector unsigned int){r, r, r, r}); + +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(rot)), + vshrq_n_u32(m_neon, static_cast<int>(32-rot))); #else - m_reg[0] = Botan::rotate_left(m_reg[0], rot); - m_reg[1] = Botan::rotate_left(m_reg[1], rot); - m_reg[2] = Botan::rotate_left(m_reg[2], rot); - m_reg[3] = Botan::rotate_left(m_reg[3], rot); + m_scalar[0] = Botan::rotate_left(m_scalar[0], rot); + m_scalar[1] = Botan::rotate_left(m_scalar[1], rot); + m_scalar[2] = Botan::rotate_left(m_scalar[2], rot); + m_scalar[3] = Botan::rotate_left(m_scalar[3], rot); #endif } + /** + * Rotate each element of SIMD register n bits right + */ void rotate_right(size_t rot) { rotate_left(32 - rot); } - void operator+=(const SIMD_4x32& other) + /** + * Add elements of a SIMD vector + */ + SIMD_4x32 operator+(const SIMD_4x32& other) const { -#if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_add_epi32(m_reg, other.m_reg); -#elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = vec_add(m_reg, other.m_reg); -#else - m_reg[0] += other.m_reg[0]; - m_reg[1] += other.m_reg[1]; - m_reg[2] += other.m_reg[2]; - m_reg[3] += other.m_reg[3]; -#endif + SIMD_4x32 retval(*this); + retval += other; + return retval; } - SIMD_4x32 operator+(const SIMD_4x32& other) const + /** + * Subtract elements of a SIMD vector + */ + SIMD_4x32 operator-(const SIMD_4x32& other) const { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_add_epi32(m_reg, other.m_reg)); -#elif defined(BOTAN_SIMD_USE_ALTIVEC) - return SIMD_4x32(vec_add(m_reg, other.m_reg)); -#else - return SIMD_4x32(m_reg[0] + other.m_reg[0], - m_reg[1] + other.m_reg[1], - m_reg[2] + other.m_reg[2], - m_reg[3] + other.m_reg[3]); -#endif + SIMD_4x32 retval(*this); + retval -= other; + return retval; } - void operator-=(const SIMD_4x32& other) + /** + * XOR elements of a SIMD vector + */ + SIMD_4x32 operator^(const SIMD_4x32& other) const { -#if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_sub_epi32(m_reg, other.m_reg); -#elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = vec_sub(m_reg, other.m_reg); -#else - m_reg[0] -= other.m_reg[0]; - m_reg[1] -= other.m_reg[1]; - m_reg[2] -= other.m_reg[2]; - m_reg[3] -= other.m_reg[3]; -#endif + SIMD_4x32 retval(*this); + retval ^= other; + return retval; } - SIMD_4x32 operator-(const SIMD_4x32& other) const + /** + * Binary OR elements of a SIMD vector + */ + SIMD_4x32 operator|(const SIMD_4x32& other) const { -#if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_sub_epi32(m_reg, other.m_reg)); -#elif defined(BOTAN_SIMD_USE_ALTIVEC) - return SIMD_4x32(vec_sub(m_reg, other.m_reg)); -#else - return SIMD_4x32(m_reg[0] - other.m_reg[0], - m_reg[1] - other.m_reg[1], - m_reg[2] - other.m_reg[2], - m_reg[3] - other.m_reg[3]); -#endif + SIMD_4x32 retval(*this); + retval |= other; + return retval; } - void operator^=(const SIMD_4x32& other) + /** + * Binary AND elements of a SIMD vector + */ + SIMD_4x32 operator&(const SIMD_4x32& other) const { -#if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_xor_si128(m_reg, other.m_reg); + SIMD_4x32 retval(*this); + retval &= other; + return retval; + } + void operator+=(const SIMD_4x32& other) + { +#if defined(BOTAN_SIMD_USE_SSE2) + m_sse = _mm_add_epi32(m_sse, other.m_sse); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = vec_xor(m_reg, other.m_reg); + m_vmx = vec_add(m_vmx, other.m_vmx); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vaddq_u32(m_neon, other.m_neon); #else - m_reg[0] ^= other.m_reg[0]; - m_reg[1] ^= other.m_reg[1]; - m_reg[2] ^= other.m_reg[2]; - m_reg[3] ^= other.m_reg[3]; + m_scalar[0] += other.m_scalar[0]; + m_scalar[1] += other.m_scalar[1]; + m_scalar[2] += other.m_scalar[2]; + m_scalar[3] += other.m_scalar[3]; #endif } - SIMD_4x32 operator^(const SIMD_4x32& other) const + void operator-=(const SIMD_4x32& other) { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_xor_si128(m_reg, other.m_reg)); + m_sse = _mm_sub_epi32(m_sse, other.m_sse); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - return SIMD_4x32(vec_xor(m_reg, other.m_reg)); + m_vmx = vec_sub(m_vmx, other.m_vmx); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vsubq_u32(m_neon, other.m_neon); #else - return SIMD_4x32(m_reg[0] ^ other.m_reg[0], - m_reg[1] ^ other.m_reg[1], - m_reg[2] ^ other.m_reg[2], - m_reg[3] ^ other.m_reg[3]); + m_scalar[0] -= other.m_scalar[0]; + m_scalar[1] -= other.m_scalar[1]; + m_scalar[2] -= other.m_scalar[2]; + m_scalar[3] -= other.m_scalar[3]; #endif } - void operator|=(const SIMD_4x32& other) + void operator^=(const SIMD_4x32& other) { #if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_or_si128(m_reg, other.m_reg); + m_sse = _mm_xor_si128(m_sse, other.m_sse); + #elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = vec_or(m_reg, other.m_reg); + m_vmx = vec_xor(m_vmx, other.m_vmx); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = veorq_u32(m_neon, other.m_neon); #else - m_reg[0] |= other.m_reg[0]; - m_reg[1] |= other.m_reg[1]; - m_reg[2] |= other.m_reg[2]; - m_reg[3] |= other.m_reg[3]; + m_scalar[0] ^= other.m_scalar[0]; + m_scalar[1] ^= other.m_scalar[1]; + m_scalar[2] ^= other.m_scalar[2]; + m_scalar[3] ^= other.m_scalar[3]; #endif } - SIMD_4x32 operator&(const SIMD_4x32& other) + void operator|=(const SIMD_4x32& other) { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_and_si128(m_reg, other.m_reg)); - + m_sse = _mm_or_si128(m_sse, other.m_sse); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - return SIMD_4x32(vec_and(m_reg, other.m_reg)); + m_vmx = vec_or(m_vmx, other.m_vmx); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vorrq_u32(m_neon, other.m_neon); #else - return SIMD_4x32(m_reg[0] & other.m_reg[0], - m_reg[1] & other.m_reg[1], - m_reg[2] & other.m_reg[2], - m_reg[3] & other.m_reg[3]); + m_scalar[0] |= other.m_scalar[0]; + m_scalar[1] |= other.m_scalar[1]; + m_scalar[2] |= other.m_scalar[2]; + m_scalar[3] |= other.m_scalar[3]; #endif } void operator&=(const SIMD_4x32& other) { #if defined(BOTAN_SIMD_USE_SSE2) - m_reg = _mm_and_si128(m_reg, other.m_reg); + m_sse = _mm_and_si128(m_sse, other.m_sse); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - m_reg = vec_and(m_reg, other.m_reg); + m_vmx = vec_and(m_vmx, other.m_vmx); +#elif defined(BOTAN_SIMD_USE_NEON) + m_neon = vandq_u32(m_neon, other.m_neon); #else - m_reg[0] &= other.m_reg[0]; - m_reg[1] &= other.m_reg[1]; - m_reg[2] &= other.m_reg[2]; - m_reg[3] &= other.m_reg[3]; + m_scalar[0] &= other.m_scalar[0]; + m_scalar[1] &= other.m_scalar[1]; + m_scalar[2] &= other.m_scalar[2]; + m_scalar[3] &= other.m_scalar[3]; #endif } SIMD_4x32 operator<<(size_t shift) const { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_slli_epi32(m_reg, static_cast<int>(shift))); + return SIMD_4x32(_mm_slli_epi32(m_sse, static_cast<int>(shift))); #elif defined(BOTAN_SIMD_USE_ALTIVEC) const unsigned int s = static_cast<unsigned int>(shift); - return SIMD_4x32(vec_sl(m_reg, (__vector unsigned int){s, s, s, s})); + return SIMD_4x32(vec_sl(m_vmx, (__vector unsigned int){s, s, s, s})); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vshlq_n_u32(m_neon, static_cast<int>(shift))); #else - return SIMD_4x32(m_reg[0] << shift, - m_reg[1] << shift, - m_reg[2] << shift, - m_reg[3] << shift); + return SIMD_4x32(m_scalar[0] << shift, + m_scalar[1] << shift, + m_scalar[2] << shift, + m_scalar[3] << shift); #endif } SIMD_4x32 operator>>(size_t shift) const { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_srli_epi32(m_reg, static_cast<int>(shift))); + return SIMD_4x32(_mm_srli_epi32(m_sse, static_cast<int>(shift))); #elif defined(BOTAN_SIMD_USE_ALTIVEC) const unsigned int s = static_cast<unsigned int>(shift); - return SIMD_4x32(vec_sr(m_reg, (__vector unsigned int){s, s, s, s})); + return SIMD_4x32(vec_sr(m_vmx, (__vector unsigned int){s, s, s, s})); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vshrq_n_u32(m_neon, static_cast<int>(shift))); #else - return SIMD_4x32(m_reg[0] >> shift, - m_reg[1] >> shift, - m_reg[2] >> shift, - m_reg[3] >> shift); + return SIMD_4x32(m_scalar[0] >> shift, m_scalar[1] >> shift, + m_scalar[2] >> shift, m_scalar[3] >> shift); #endif } @@ -367,89 +483,120 @@ class SIMD_4x32 SIMD_4x32 operator~() const { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_xor_si128(m_reg, _mm_set1_epi32(0xFFFFFFFF))); + return SIMD_4x32(_mm_xor_si128(m_sse, _mm_set1_epi32(0xFFFFFFFF))); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - return SIMD_4x32(vec_nor(m_reg, m_reg)); + return SIMD_4x32(vec_nor(m_vmx, m_vmx)); +#elif defined(BOTAN_SIMD_USE_NEON) + return SIMD_4x32(vmvnq_u32(m_neon)); #else - return SIMD_4x32(~m_reg[0], - ~m_reg[1], - ~m_reg[2], - ~m_reg[3]); + return SIMD_4x32(~m_scalar[0], ~m_scalar[1], ~m_scalar[2], ~m_scalar[3]); #endif } // (~reg) & other - SIMD_4x32 andc(const SIMD_4x32& other) + SIMD_4x32 andc(const SIMD_4x32& other) const { #if defined(BOTAN_SIMD_USE_SSE2) - return SIMD_4x32(_mm_andnot_si128(m_reg, other.m_reg)); + return SIMD_4x32(_mm_andnot_si128(m_sse, other.m_sse)); #elif defined(BOTAN_SIMD_USE_ALTIVEC) /* AltiVec does arg1 & ~arg2 rather than SSE's ~arg1 & arg2 so swap the arguments */ - return SIMD_4x32(vec_andc(other.m_reg, m_reg)); + return SIMD_4x32(vec_andc(other.m_vmx, m_vmx)); +#elif defined(BOTAN_SIMD_USE_NEON) + // NEON is also a & ~b + return SIMD_4x32(vbicq_u32(other.m_neon, m_neon)); #else - return SIMD_4x32((~m_reg[0]) & other.m_reg[0], - (~m_reg[1]) & other.m_reg[1], - (~m_reg[2]) & other.m_reg[2], - (~m_reg[3]) & other.m_reg[3]); + return SIMD_4x32((~m_scalar[0]) & other.m_scalar[0], + (~m_scalar[1]) & other.m_scalar[1], + (~m_scalar[2]) & other.m_scalar[2], + (~m_scalar[3]) & other.m_scalar[3]); #endif } + /** + * Return copy *this with each word byte swapped + */ SIMD_4x32 bswap() const { #if defined(BOTAN_SIMD_USE_SSE2) - __m128i T = m_reg; + __m128i T = m_sse; T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); - - return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), - _mm_slli_epi16(T, 8))); + return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8))); #elif defined(BOTAN_SIMD_USE_ALTIVEC) __vector unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(nullptr)); - perm = vec_xor(perm, vec_splat_u8(3)); + return SIMD_4x32(vec_perm(m_vmx, m_vmx, perm)); - return SIMD_4x32(vec_perm(m_reg, m_reg, perm)); +#elif defined(BOTAN_SIMD_USE_NEON) + + //return SIMD_4x32(vrev64q_u32(m_neon)); + + // FIXME this is really slow + SIMD_4x32 ror8(m_neon); + ror8.rotate_right(8); + SIMD_4x32 rol8(m_neon); + rol8.rotate_left(8); + + SIMD_4x32 mask1 = SIMD_4x32::splat(0xFF00FF00); + SIMD_4x32 mask2 = SIMD_4x32::splat(0x00FF00FF); + return (ror8 & mask1) | (rol8 & mask2); #else - return SIMD_4x32(reverse_bytes(m_reg[0]), - reverse_bytes(m_reg[1]), - reverse_bytes(m_reg[2]), - reverse_bytes(m_reg[3])); + // scalar + return SIMD_4x32(reverse_bytes(m_scalar[0]), + reverse_bytes(m_scalar[1]), + reverse_bytes(m_scalar[2]), + reverse_bytes(m_scalar[3])); #endif } + /** + * 4x4 Transposition on SIMD registers + */ static void transpose(SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) { #if defined(BOTAN_SIMD_USE_SSE2) - __m128i T0 = _mm_unpacklo_epi32(B0.m_reg, B1.m_reg); - __m128i T1 = _mm_unpacklo_epi32(B2.m_reg, B3.m_reg); - __m128i T2 = _mm_unpackhi_epi32(B0.m_reg, B1.m_reg); - __m128i T3 = _mm_unpackhi_epi32(B2.m_reg, B3.m_reg); - B0.m_reg = _mm_unpacklo_epi64(T0, T1); - B1.m_reg = _mm_unpackhi_epi64(T0, T1); - B2.m_reg = _mm_unpacklo_epi64(T2, T3); - B3.m_reg = _mm_unpackhi_epi64(T2, T3); + const __m128i T0 = _mm_unpacklo_epi32(B0.m_sse, B1.m_sse); + const __m128i T1 = _mm_unpacklo_epi32(B2.m_sse, B3.m_sse); + const __m128i T2 = _mm_unpackhi_epi32(B0.m_sse, B1.m_sse); + const __m128i T3 = _mm_unpackhi_epi32(B2.m_sse, B3.m_sse); + + B0.m_sse = _mm_unpacklo_epi64(T0, T1); + B1.m_sse = _mm_unpackhi_epi64(T0, T1); + B2.m_sse = _mm_unpacklo_epi64(T2, T3); + B3.m_sse = _mm_unpackhi_epi64(T2, T3); #elif defined(BOTAN_SIMD_USE_ALTIVEC) - __vector unsigned int T0 = vec_mergeh(B0.m_reg, B2.m_reg); - __vector unsigned int T1 = vec_mergel(B0.m_reg, B2.m_reg); - __vector unsigned int T2 = vec_mergeh(B1.m_reg, B3.m_reg); - __vector unsigned int T3 = vec_mergel(B1.m_reg, B3.m_reg); - - B0.m_reg = vec_mergeh(T0, T2); - B1.m_reg = vec_mergel(T0, T2); - B2.m_reg = vec_mergeh(T1, T3); - B3.m_reg = vec_mergel(T1, T3); + const __vector unsigned int T0 = vec_mergeh(B0.m_vmx, B2.m_vmx); + const __vector unsigned int T1 = vec_mergeh(B1.m_vmx, B3.m_vmx); + const __vector unsigned int T2 = vec_mergel(B0.m_vmx, B2.m_vmx); + const __vector unsigned int T3 = vec_mergel(B1.m_vmx, B3.m_vmx); + + B0.m_vmx = vec_mergeh(T0, T1); + B1.m_vmx = vec_mergel(T0, T1); + B2.m_vmx = vec_mergeh(T2, T3); + B3.m_vmx = vec_mergel(T2, T3); +#elif defined(BOTAN_SIMD_USE_NEON) + const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon); + const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon); + const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon); + const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon); + + B0.m_neon = vzip1q_u32(T0, T1); + B1.m_neon = vzip2q_u32(T0, T1); + B2.m_neon = vzip1q_u32(T2, T3); + B3.m_neon = vzip2q_u32(T2, T3); #else - SIMD_4x32 T0(B0.m_reg[0], B1.m_reg[0], B2.m_reg[0], B3.m_reg[0]); - SIMD_4x32 T1(B0.m_reg[1], B1.m_reg[1], B2.m_reg[1], B3.m_reg[1]); - SIMD_4x32 T2(B0.m_reg[2], B1.m_reg[2], B2.m_reg[2], B3.m_reg[2]); - SIMD_4x32 T3(B0.m_reg[3], B1.m_reg[3], B2.m_reg[3], B3.m_reg[3]); + // scalar + SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]); + SIMD_4x32 T1(B0.m_scalar[1], B1.m_scalar[1], B2.m_scalar[1], B3.m_scalar[1]); + SIMD_4x32 T2(B0.m_scalar[2], B1.m_scalar[2], B2.m_scalar[2], B3.m_scalar[2]); + SIMD_4x32 T3(B0.m_scalar[3], B1.m_scalar[3], B2.m_scalar[3], B3.m_scalar[3]); B0 = T0; B1 = T1; @@ -459,18 +606,23 @@ class SIMD_4x32 } private: + #if defined(BOTAN_SIMD_USE_SSE2) - explicit SIMD_4x32(__m128i in) { m_reg = in; } + explicit SIMD_4x32(__m128i in) : m_sse(in) {} #elif defined(BOTAN_SIMD_USE_ALTIVEC) - explicit SIMD_4x32(__vector unsigned int input) { m_reg = input; } + explicit SIMD_4x32(__vector unsigned int in) : m_vmx(in) {} +#elif defined(BOTAN_SIMD_USE_NEON) + explicit SIMD_4x32(uint32x4_t in) : m_neon(in) {} #endif #if defined(BOTAN_SIMD_USE_SSE2) - __m128i m_reg; + __m128i m_sse; #elif defined(BOTAN_SIMD_USE_ALTIVEC) - __vector unsigned int m_reg; + __vector unsigned int m_vmx; +#elif defined(BOTAN_SIMD_USE_NEON) + uint32x4_t m_neon; #else - uint32_t m_reg[4]; + uint32_t m_scalar[4]; #endif }; diff --git a/src/tests/test_simd.cpp b/src/tests/test_simd.cpp new file mode 100644 index 000000000..1465d9269 --- /dev/null +++ b/src/tests/test_simd.cpp @@ -0,0 +1,161 @@ +/* +* (C) 2017 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include "tests.h" + +#if defined(BOTAN_HAS_SIMD_32) + #include <botan/internal/simd_32.h> + #include <botan/cpuid.h> +#endif + +namespace Botan_Tests { + +#if defined(BOTAN_HAS_SIMD_32) + +class SIMD_32_Tests : public Test + { + public: + std::vector<Test::Result> run() override + { + Test::Result result("SIMD_4x32"); + + if(Botan::CPUID::has_simd_32() == false) + { + result.test_note("Skipping SIMD_4x32 tests due to missing CPU support at runtime"); + return {result}; + } + + const uint32_t pat1 = 0xAABBCCDD; + const uint32_t pat2 = 0x87654321; + const uint32_t pat3 = 0x01234567; + const uint32_t pat4 = 0xC0D0E0F0; + + test_eq(result, "default init", Botan::SIMD_4x32(), 0, 0, 0, 0); + test_eq(result, "SIMD scalar constructor", Botan::SIMD_4x32(1,2,3,4), 1, 2, 3, 4); + + const Botan::SIMD_4x32 splat = Botan::SIMD_4x32::splat(pat1); + + test_eq(result, "splat", splat, pat1, pat1, pat1, pat1); + + const Botan::SIMD_4x32 input(pat1, pat2, pat3, pat4); + + Botan::SIMD_4x32 rol = input; + rol.rotate_left(3); + + test_eq(result, "rotate_left", rol, + Botan::rotate_left(pat1, 3), + Botan::rotate_left(pat2, 3), + Botan::rotate_left(pat3, 3), + Botan::rotate_left(pat4, 3)); + + Botan::SIMD_4x32 ror = input; + ror.rotate_right(9); + + test_eq(result, "rotate_right", ror, + Botan::rotate_right(pat1, 9), + Botan::rotate_right(pat2, 9), + Botan::rotate_right(pat3, 9), + Botan::rotate_right(pat4, 9)); + + Botan::SIMD_4x32 add = input + splat; + test_eq(result, "add +", add, pat1+pat1, pat2+pat1, pat3+pat1, pat4+pat1); + + add -= splat; + test_eq(result, "sub -=", add, pat1, pat2, pat3, pat4); + + add += splat; + test_eq(result, "add +=", add, pat1+pat1, pat2+pat1, pat3+pat1, pat4+pat1); + + test_eq(result, "xor", input ^ splat, 0, pat2^pat1, pat3^pat1, pat4^pat1); + test_eq(result, "or", input | splat, pat1, pat2 | pat1, pat3 | pat1, pat4 | pat1); + test_eq(result, "and", input & splat, pat1, pat2 & pat1, pat3 & pat1, pat4 & pat1); + + Botan::SIMD_4x32 blender = input; + blender |= splat; + test_eq(result, "|=", blender, pat1, pat2|pat1, pat3|pat1, pat4|pat1); + blender &= splat; + test_eq(result, "&=", blender, pat1, pat1, pat1, pat1); + blender ^= splat; + test_eq(result, "^=", blender, 0, 0, 0, 0); + + blender = ~blender; + test_eq(result, "~", blender, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); + + blender = blender >> 23; + test_eq(result, ">>", blender, 0x1FF, 0x1FF, 0x1FF, 0x1FF); + + blender = blender << 27; + test_eq(result, "<<", blender, 0xF8000000, 0xF8000000, 0xF8000000, 0xF8000000); + + blender = ~blender; + test_eq(result, "~", blender, 0x7FFFFFF, 0x7FFFFFF, 0x7FFFFFF, 0x7FFFFFF); + + blender = input.andc(~blender); + test_eq(result, "andc", blender, + ~pat1 & 0xF8000000, ~pat2 & 0xF8000000, + ~pat3 & 0xF8000000, ~pat4 & 0xF8000000); + + test_eq(result, "bswap", input.bswap(), + Botan::reverse_bytes(pat1), + Botan::reverse_bytes(pat2), + Botan::reverse_bytes(pat3), + Botan::reverse_bytes(pat4)); + + Botan::SIMD_4x32 t1(pat1, pat2, pat3, pat4); + Botan::SIMD_4x32 t2(pat1+1, pat2+1, pat3+1, pat4+1); + Botan::SIMD_4x32 t3(pat1+2, pat2+2, pat3+2, pat4+2); + Botan::SIMD_4x32 t4(pat1+3, pat2+3, pat3+3, pat4+3); + + Botan::SIMD_4x32::transpose(t1, t2, t3, t4); + + test_eq(result, "transpose t1", t1, pat1, pat1+1, pat1+2, pat1+3); + test_eq(result, "transpose t2", t2, pat2, pat2+1, pat2+2, pat2+3); + test_eq(result, "transpose t3", t3, pat3, pat3+1, pat3+2, pat3+3); + test_eq(result, "transpose t4", t4, pat4, pat4+1, pat4+2, pat4+3); + + return {result}; + } + + private: + void test_eq(Test::Result& result, const std::string& op, + const Botan::SIMD_4x32& simd, + uint32_t exp0, uint32_t exp1, uint32_t exp2, uint32_t exp3) + { + uint8_t mem_be[16]; + simd.store_be(mem_be); + + result.test_int_eq("SIMD_4x32 " + op + " elem0 BE", Botan::make_uint32(mem_be[ 0], mem_be[ 1], mem_be[ 2], mem_be[ 3]), exp0); + result.test_int_eq("SIMD_4x32 " + op + " elem1 BE", Botan::make_uint32(mem_be[ 4], mem_be[ 5], mem_be[ 6], mem_be[ 7]), exp1); + result.test_int_eq("SIMD_4x32 " + op + " elem2 BE", Botan::make_uint32(mem_be[ 8], mem_be[ 9], mem_be[10], mem_be[11]), exp2); + result.test_int_eq("SIMD_4x32 " + op + " elem3 BE", Botan::make_uint32(mem_be[12], mem_be[13], mem_be[14], mem_be[15]), exp3); + + // Check load_be+store_be results in same value + const Botan::SIMD_4x32 reloaded_be = Botan::SIMD_4x32::load_be(mem_be); + uint8_t mem_be2[16]; + reloaded_be.store_be(mem_be2); + result.test_eq(nullptr, "SIMD_4x32 load_be", mem_be, 16, mem_be2, 16); + + uint8_t mem_le[16]; + simd.store_le(mem_le); + + result.test_int_eq("SIMD_4x32 " + op + " elem0 LE", Botan::make_uint32(mem_le[ 3], mem_le[ 2], mem_le[ 1], mem_le[ 0]), exp0); + result.test_int_eq("SIMD_4x32 " + op + " elem1 LE", Botan::make_uint32(mem_le[ 7], mem_le[ 6], mem_le[ 5], mem_le[ 4]), exp1); + result.test_int_eq("SIMD_4x32 " + op + " elem2 LE", Botan::make_uint32(mem_le[11], mem_le[10], mem_le[ 9], mem_le[ 8]), exp2); + result.test_int_eq("SIMD_4x32 " + op + " elem3 LE", Botan::make_uint32(mem_le[15], mem_le[14], mem_le[13], mem_le[12]), exp3); + + // Check load_le+store_le results in same value + const Botan::SIMD_4x32 reloaded_le = Botan::SIMD_4x32::load_le(mem_le); + uint8_t mem_le2[16]; + reloaded_le.store_le(mem_le2); + result.test_eq(nullptr, "SIMD_4x32 load_le", mem_le, 16, mem_le2, 16); + } + + }; + +BOTAN_REGISTER_TEST("simd_32", SIMD_32_Tests); +#endif + +} diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 855e19f80..fa93397b2 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -799,6 +799,13 @@ std::vector<Botan::CPUID::CPUID_bits> map_cpuid_string(const std::string& tok) return {Botan::CPUID::CPUID_ALTIVEC_BIT}; #endif +#if defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) + if(tok == "neon" || tok == "simd") + return {Botan::CPUID::CPUID_ARM_NEON_BIT}; +#endif + + BOTAN_UNUSED(tok); + return {}; } diff --git a/src/tests/tests.h b/src/tests/tests.h index aa5208325..ed7008c34 100644 --- a/src/tests/tests.h +++ b/src/tests/tests.h @@ -187,6 +187,12 @@ class Test return test_eq(what, static_cast<size_t>(x), static_cast<size_t>(y)); } + template<typename I1, typename I2> + bool test_int_eq(const std::string& what, I1 x, I2 y) + { + return test_eq(what.c_str(), static_cast<size_t>(x), static_cast<size_t>(y)); + } + bool test_lt(const std::string& what, size_t produced, size_t expected); bool test_lte(const std::string& what, size_t produced, size_t expected); bool test_gte(const std::string& what, size_t produced, size_t expected); |