aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/build-data/botan.doxy.in1
-rw-r--r--src/build-data/policy/bsi.txt1
-rw-r--r--src/build-data/policy/modern.txt1
-rw-r--r--src/build-data/policy/nist.txt1
-rw-r--r--src/lib/block/serpent/serpent.cpp26
-rw-r--r--src/lib/block/serpent/serpent.h13
-rw-r--r--src/lib/block/serpent/serpent_avx2/info.txt9
-rw-r--r--src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp155
-rw-r--r--src/lib/utils/simd/simd_avx2/info.txt16
-rw-r--r--src/lib/utils/simd/simd_avx2/simd_avx2.h198
-rw-r--r--src/tests/data/block/serpent.vec2
11 files changed, 415 insertions, 8 deletions
diff --git a/src/build-data/botan.doxy.in b/src/build-data/botan.doxy.in
index ec6ec6626..c3261a7a9 100644
--- a/src/build-data/botan.doxy.in
+++ b/src/build-data/botan.doxy.in
@@ -160,6 +160,7 @@ PREDEFINED = BOTAN_HAS_AES_ARMV8 \
BOTAN_HAS_IDEA_SSE2 \
BOTAN_HAS_NOEKEON_SIMD \
BOTAN_HAS_SERPENT_SIMD \
+ BOTAN_HAS_SERPENT_AVX2 \
BOTAN_HAS_SHA1_SSE2 \
BOTAN_HAS_SHA2_32_X86 \
BOTAN_HAS_SHA2_32_X86_BMI2 \
diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt
index f152186ce..f9950c7c7 100644
--- a/src/build-data/policy/bsi.txt
+++ b/src/build-data/policy/bsi.txt
@@ -101,6 +101,7 @@ noekeon_simd
seed
serpent
serpent_simd
+serpent_avx2
shacal2
shacal2_x86
shacal2_simd
diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt
index c97b87e72..ae659087c 100644
--- a/src/build-data/policy/modern.txt
+++ b/src/build-data/policy/modern.txt
@@ -63,6 +63,7 @@ aes_ssse3
aes_armv8
aes_power8
serpent_simd
+serpent_avx2
threefish_512_avx2
chacha_sse2
diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt
index c76587e82..d9f698e31 100644
--- a/src/build-data/policy/nist.txt
+++ b/src/build-data/policy/nist.txt
@@ -104,6 +104,7 @@ noekeon_simd
seed
serpent
serpent_simd
+serpent_avx2
sm4
shacal2
shacal2_x86
diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp
index 39968e87e..d9001d19f 100644
--- a/src/lib/block/serpent/serpent.cpp
+++ b/src/lib/block/serpent/serpent.cpp
@@ -59,6 +59,19 @@ void Serpent::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
verify_key_set(m_round_key.empty() == false);
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+ if(CPUID::has_avx2())
+ {
+ while(blocks >= 8)
+ {
+ avx2_encrypt_8(in, out);
+ in += 8 * BLOCK_SIZE;
+ out += 8 * BLOCK_SIZE;
+ blocks -= 8;
+ }
+ }
+#endif
+
#if defined(BOTAN_HAS_SERPENT_SIMD)
if(CPUID::has_simd_32())
{
@@ -121,6 +134,19 @@ void Serpent::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
{
verify_key_set(m_round_key.empty() == false);
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+ if(CPUID::has_avx2())
+ {
+ while(blocks >= 8)
+ {
+ avx2_decrypt_8(in, out);
+ in += 8 * BLOCK_SIZE;
+ out += 8 * BLOCK_SIZE;
+ blocks -= 8;
+ }
+ }
+#endif
+
#if defined(BOTAN_HAS_SERPENT_SIMD)
if(CPUID::has_simd_32())
{
diff --git a/src/lib/block/serpent/serpent.h b/src/lib/block/serpent/serpent.h
index 4d23c9a01..641ee0b9c 100644
--- a/src/lib/block/serpent/serpent.h
+++ b/src/lib/block/serpent/serpent.h
@@ -30,18 +30,17 @@ class BOTAN_PUBLIC_API(2,0) Serpent final : public Block_Cipher_Fixed_Params<16,
size_t parallelism() const override { return 4; }
private:
+
#if defined(BOTAN_HAS_SERPENT_SIMD)
- /**
- * Encrypt 4 blocks in parallel using SSE2 or AltiVec
- */
void simd_encrypt_4(const uint8_t in[64], uint8_t out[64]) const;
-
- /**
- * Decrypt 4 blocks in parallel using SSE2 or AltiVec
- */
void simd_decrypt_4(const uint8_t in[64], uint8_t out[64]) const;
#endif
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+ void avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const;
+ void avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const;
+#endif
+
void key_schedule(const uint8_t key[], size_t length) override;
secure_vector<uint32_t> m_round_key;
diff --git a/src/lib/block/serpent/serpent_avx2/info.txt b/src/lib/block/serpent/serpent_avx2/info.txt
new file mode 100644
index 000000000..8225e63a3
--- /dev/null
+++ b/src/lib/block/serpent/serpent_avx2/info.txt
@@ -0,0 +1,9 @@
+<defines>
+SERPENT_AVX2 -> 20180824
+</defines>
+
+need_isa avx2
+
+<requires>
+simd_avx2
+</requires>
diff --git a/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp
new file mode 100644
index 000000000..4e4420d58
--- /dev/null
+++ b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp
@@ -0,0 +1,155 @@
+/*
+* (C) 2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/serpent.h>
+#include <botan/internal/serpent_sbox.h>
+#include <botan/internal/simd_avx2.h>
+
+namespace Botan {
+
+
+#define key_xor(round, B0, B1, B2, B3) \
+ do { \
+ B0 ^= SIMD_8x32::splat(m_round_key[4*round ]); \
+ B1 ^= SIMD_8x32::splat(m_round_key[4*round+1]); \
+ B2 ^= SIMD_8x32::splat(m_round_key[4*round+2]); \
+ B3 ^= SIMD_8x32::splat(m_round_key[4*round+3]); \
+ } while(0)
+
+/*
+* Serpent's linear transformations
+*/
+#define transform(B0, B1, B2, B3) \
+ do { \
+ B0 = B0.rotl<13>(); \
+ B2 = B2.rotl<3>(); \
+ B1 ^= B0 ^ B2; \
+ B3 ^= B2 ^ B0.shl<3>(); \
+ B1 = B1.rotl<1>(); \
+ B3 = B3.rotl<7>(); \
+ B0 ^= B1 ^ B3; \
+ B2 ^= B3 ^ B1.shl<7>(); \
+ B0 = B0.rotl<5>(); \
+ B2 = B2.rotl<22>(); \
+ } while(0)
+
+#define i_transform(B0, B1, B2, B3) \
+ do { \
+ B2 = B2.rotr<22>(); \
+ B0 = B0.rotr<5>(); \
+ B2 ^= B3 ^ B1.shl<7>(); \
+ B0 ^= B1 ^ B3; \
+ B3 = B3.rotr<7>(); \
+ B1 = B1.rotr<1>(); \
+ B3 ^= B2 ^ B0.shl<3>(); \
+ B1 ^= B0 ^ B2; \
+ B2 = B2.rotr<3>(); \
+ B0 = B0.rotr<13>(); \
+ } while(0)
+
+void Serpent::avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const
+ {
+ SIMD_8x32 B0 = SIMD_8x32::load_le(in);
+ SIMD_8x32 B1 = SIMD_8x32::load_le(in + 32);
+ SIMD_8x32 B2 = SIMD_8x32::load_le(in + 64);
+ SIMD_8x32 B3 = SIMD_8x32::load_le(in + 96);
+
+ SIMD_8x32::transpose(B0, B1, B2, B3);
+
+ key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(11,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(12,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(19,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(20,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(27,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(28,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
+
+ SIMD_8x32::transpose(B0, B1, B2, B3);
+ B0.store_le(out);
+ B1.store_le(out + 32);
+ B2.store_le(out + 64);
+ B3.store_le(out + 96);
+ }
+
+void Serpent::avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const
+ {
+ SIMD_8x32 B0 = SIMD_8x32::load_le(in);
+ SIMD_8x32 B1 = SIMD_8x32::load_le(in + 32);
+ SIMD_8x32 B2 = SIMD_8x32::load_le(in + 64);
+ SIMD_8x32 B3 = SIMD_8x32::load_le(in + 96);
+
+ SIMD_8x32::transpose(B0, B1, B2, B3);
+
+ key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
+
+ SIMD_8x32::transpose(B0, B1, B2, B3);
+
+ B0.store_le(out);
+ B1.store_le(out + 32);
+ B2.store_le(out + 64);
+ B3.store_le(out + 96);
+ }
+
+}
diff --git a/src/lib/utils/simd/simd_avx2/info.txt b/src/lib/utils/simd/simd_avx2/info.txt
new file mode 100644
index 000000000..e3d043a12
--- /dev/null
+++ b/src/lib/utils/simd/simd_avx2/info.txt
@@ -0,0 +1,16 @@
+<defines>
+SIMD_AVX2 -> 20180824
+</defines>
+
+need_isa avx2
+
+<header:internal>
+simd_avx2.h
+</header:internal>
+
+<cc>
+gcc
+clang
+msvc
+icc
+</cc>
diff --git a/src/lib/utils/simd/simd_avx2/simd_avx2.h b/src/lib/utils/simd/simd_avx2/simd_avx2.h
new file mode 100644
index 000000000..19f930854
--- /dev/null
+++ b/src/lib/utils/simd/simd_avx2/simd_avx2.h
@@ -0,0 +1,198 @@
+/*
+* (C) 2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_SIMD_AVX2_H_
+#define BOTAN_SIMD_AVX2_H_
+
+#include <botan/types.h>
+#include <immintrin.h>
+
+namespace Botan {
+
+class SIMD_8x32 final
+ {
+ public:
+
+ SIMD_8x32& operator=(const SIMD_8x32& other) = default;
+ SIMD_8x32(const SIMD_8x32& other) = default;
+
+#if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013)
+ SIMD_8x32& operator=(SIMD_8x32&& other) = default;
+ SIMD_8x32(SIMD_8x32&& other) = default;
+#endif
+
+ SIMD_8x32()
+ {
+ m_avx2 = _mm256_setzero_si256();
+ }
+
+ explicit SIMD_8x32(const uint32_t B[8])
+ {
+ m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B));
+ }
+
+ static SIMD_8x32 splat(uint32_t B)
+ {
+ return SIMD_8x32(_mm256_set1_epi32(B));
+ }
+
+ static SIMD_8x32 load_le(const uint8_t* in)
+ {
+ return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
+ }
+
+ static SIMD_8x32 load_be(const uint8_t* in)
+ {
+ return load_le(in).bswap();
+ }
+
+ void store_le(uint8_t out[]) const
+ {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2);
+ }
+
+ void store_be(uint8_t out[]) const
+ {
+ bswap().store_le(out);
+ }
+
+ template<size_t ROT>
+ SIMD_8x32 rotl() const
+ {
+ static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
+
+ return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
+ _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT))));
+ }
+
+ template<size_t ROT>
+ SIMD_8x32 rotr() const
+ {
+ return this->rotl<32-ROT>();
+ }
+
+ SIMD_8x32 operator+(const SIMD_8x32& other) const
+ {
+ SIMD_8x32 retval(*this);
+ retval += other;
+ return retval;
+ }
+
+ SIMD_8x32 operator-(const SIMD_8x32& other) const
+ {
+ SIMD_8x32 retval(*this);
+ retval -= other;
+ return retval;
+ }
+
+ SIMD_8x32 operator^(const SIMD_8x32& other) const
+ {
+ SIMD_8x32 retval(*this);
+ retval ^= other;
+ return retval;
+ }
+
+ SIMD_8x32 operator|(const SIMD_8x32& other) const
+ {
+ SIMD_8x32 retval(*this);
+ retval |= other;
+ return retval;
+ }
+
+ SIMD_8x32 operator&(const SIMD_8x32& other) const
+ {
+ SIMD_8x32 retval(*this);
+ retval &= other;
+ return retval;
+ }
+
+ void operator+=(const SIMD_8x32& other)
+ {
+ m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2);
+ }
+
+ void operator-=(const SIMD_8x32& other)
+ {
+ m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2);
+ }
+
+ void operator^=(const SIMD_8x32& other)
+ {
+ m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2);
+ }
+
+ void operator|=(const SIMD_8x32& other)
+ {
+ m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2);
+ }
+
+ void operator&=(const SIMD_8x32& other)
+ {
+ m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2);
+ }
+
+ template<int SHIFT> SIMD_8x32 shl() const
+ {
+ return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));
+ }
+
+ template<int SHIFT> SIMD_8x32 shr() const
+ {
+ return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));
+ }
+
+ SIMD_8x32 operator~() const
+ {
+ return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));
+ }
+
+ // (~reg) & other
+ SIMD_8x32 andc(const SIMD_8x32& other) const
+ {
+ return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
+ }
+
+ SIMD_8x32 bswap() const
+ {
+ const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0,
+ 7, 6, 5, 4,
+ 11, 10, 9, 8,
+ 15, 14, 13, 12,
+ 19, 18, 17, 16,
+ 23, 22, 21, 20,
+ 27, 26, 25, 24,
+ 31, 30, 29, 28 };
+
+ const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));
+
+ const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
+
+ return SIMD_8x32(output);
+ }
+
+ static void transpose(SIMD_8x32& B0, SIMD_8x32& B1,
+ SIMD_8x32& B2, SIMD_8x32& B3)
+ {
+ const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
+ const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
+ const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
+ const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);
+
+ B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
+ B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
+ B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
+ B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
+ }
+
+ private:
+ SIMD_8x32(__m256i x) : m_avx2(x) {}
+
+ __m256i m_avx2;
+ };
+
+}
+
+#endif
diff --git a/src/tests/data/block/serpent.vec b/src/tests/data/block/serpent.vec
index 9e6b9eb07..272519236 100644
--- a/src/tests/data/block/serpent.vec
+++ b/src/tests/data/block/serpent.vec
@@ -1,5 +1,5 @@
-#test cpuid simd
+#test cpuid simd avx2
[Serpent]
Key = 00000000000000000000000000000000