diff options
-rwxr-xr-x | configure.py | 59 | ||||
-rw-r--r-- | doc/todo.rst | 1 | ||||
-rw-r--r-- | src/build-data/botan.doxy.in | 1 | ||||
-rw-r--r-- | src/build-data/policy/bsi.txt | 1 | ||||
-rw-r--r-- | src/build-data/policy/modern.txt | 1 | ||||
-rw-r--r-- | src/build-data/policy/nist.txt | 1 | ||||
-rw-r--r-- | src/lib/block/serpent/serpent.cpp | 26 | ||||
-rw-r--r-- | src/lib/block/serpent/serpent.h | 13 | ||||
-rw-r--r-- | src/lib/block/serpent/serpent_avx2/info.txt | 9 | ||||
-rw-r--r-- | src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp | 155 | ||||
-rw-r--r-- | src/lib/utils/simd/simd_avx2/info.txt | 16 | ||||
-rw-r--r-- | src/lib/utils/simd/simd_avx2/simd_avx2.h | 198 | ||||
-rw-r--r-- | src/tests/data/block/serpent.vec | 2 |
13 files changed, 457 insertions, 26 deletions
diff --git a/configure.py b/configure.py index d31c8c20b..b03802e56 100755 --- a/configure.py +++ b/configure.py @@ -2360,16 +2360,10 @@ class AmalgamationHeader(object): self.included_already = set() self.all_std_includes = set() - encoding_kwords = {} - if sys.version_info[0] == 3: - encoding_kwords['encoding'] = 'utf8' - self.file_contents = {} for filepath in sorted(input_filepaths): try: - with open(filepath, **encoding_kwords) as f: - raw_content = f.readlines() - contents = AmalgamationGenerator.strip_header_goop(filepath, raw_content) + contents = AmalgamationGenerator.read_header(filepath) self.file_contents[os.path.basename(filepath)] = contents except IOError as e: logging.error('Error processing file %s for amalgamation: %s' % (filepath, e)) @@ -2447,6 +2441,15 @@ class AmalgamationGenerator(object): _header_guard_pattern = re.compile('^#define BOTAN_.*_H_$') @staticmethod + def read_header(filepath): + encoding_kwords = {} + if sys.version_info[0] == 3: + encoding_kwords['encoding'] = 'utf8' + with open(filepath, **encoding_kwords) as f: + raw_content = f.readlines() + return AmalgamationGenerator.strip_header_goop(filepath, raw_content) + + @staticmethod def strip_header_goop(header_name, header_lines): lines = copy.copy(header_lines) # defensive copy @@ -2513,16 +2516,32 @@ class AmalgamationGenerator(object): logging.info('Writing amalgamation header to %s' % (header_name)) pub_header_amalag.write_to_file(header_name, "BOTAN_AMALGAMATION_H_") - internal_headers = AmalgamationHeader(self._build_paths.internal_headers) + isa_headers = {} + internal_headers = [] + + def known_isa_header(hdr): + if hdr == 'simd_avx2.h': + return 'avx2' + return None + + for hdr in self._build_paths.internal_headers: + isa = known_isa_header(os.path.basename(hdr)) + if isa: + isa_headers[isa] = ''.join(AmalgamationGenerator.read_header(hdr)) + else: + internal_headers.append(hdr) + + internal_headers = AmalgamationHeader(internal_headers) header_int_name = '%s_internal.h' % (AmalgamationGenerator.filename_prefix) logging.info('Writing amalgamation header to %s' % (header_int_name)) internal_headers.write_to_file(header_int_name, "BOTAN_AMALGAMATION_INTERNAL_H_") header_files = [header_name, header_int_name] included_in_headers = pub_header_amalag.all_std_includes | internal_headers.all_std_includes - return header_files, included_in_headers + return header_files, included_in_headers, isa_headers - def _generate_sources(self, amalgamation_headers, included_in_headers): #pylint: disable=too-many-locals,too-many-branches + def _generate_sources(self, amalgamation_headers, included_in_headers, isa_headers): + #pylint: disable=too-many-locals,too-many-branches encoding_kwords = {} if sys.version_info[0] == 3: encoding_kwords['encoding'] = 'utf8' @@ -2541,6 +2560,14 @@ class AmalgamationGenerator(object): logging.info('Writing amalgamation source to %s' % (filepath)) amalgamation_files[target] = open(filepath, 'w', **encoding_kwords) + def gcc_isa(isa): + if isa == 'sse41': + return 'sse4.1' + elif isa == 'sse42': + return 'ssse4.2' + else: + return isa + for target, f in amalgamation_files.items(): AmalgamationHeader.write_banner(f) f.write('\n') @@ -2550,13 +2577,11 @@ class AmalgamationGenerator(object): for isa in self._isas_for_target(target): - if isa == 'sse41': - isa = 'sse4.1' - elif isa == 'sse42': - isa = 'ssse4.2' + if isa in isa_headers: + f.write(isa_headers[isa]) f.write('#if defined(__GNUG__) && !defined(__clang__)\n') - f.write('#pragma GCC target ("%s")\n' % (isa)) + f.write('#pragma GCC target ("%s")\n' % (gcc_isa(isa))) f.write('#endif\n') # target to include header map @@ -2588,8 +2613,8 @@ class AmalgamationGenerator(object): return set(amalgamation_sources.values()) def generate(self): - amalgamation_headers, included_in_headers = self._generate_headers() - amalgamation_sources = self._generate_sources(amalgamation_headers, included_in_headers) + amalgamation_headers, included_in_headers, isa_headers = self._generate_headers() + amalgamation_sources = self._generate_sources(amalgamation_headers, included_in_headers, isa_headers) return (sorted(amalgamation_sources), sorted(amalgamation_headers)) diff --git a/doc/todo.rst b/doc/todo.rst index e6b0a2df0..c7d41197a 100644 --- a/doc/todo.rst +++ b/doc/todo.rst @@ -15,7 +15,6 @@ Ciphers, Hashes, PBKDF * Compressed tables for AES * AES using vector permutes for NEON * Camellia using AES-NI -* Serpent using AVX2 or SSSE3/pshufb * ChaCha20 using AVX2, NEON * ASCON 1.2 (CAESAR) * NORX-64 3.0 (CAESAR) diff --git a/src/build-data/botan.doxy.in b/src/build-data/botan.doxy.in index ec6ec6626..c3261a7a9 100644 --- a/src/build-data/botan.doxy.in +++ b/src/build-data/botan.doxy.in @@ -160,6 +160,7 @@ PREDEFINED = BOTAN_HAS_AES_ARMV8 \ BOTAN_HAS_IDEA_SSE2 \ BOTAN_HAS_NOEKEON_SIMD \ BOTAN_HAS_SERPENT_SIMD \ + BOTAN_HAS_SERPENT_AVX2 \ BOTAN_HAS_SHA1_SSE2 \ BOTAN_HAS_SHA2_32_X86 \ BOTAN_HAS_SHA2_32_X86_BMI2 \ diff --git a/src/build-data/policy/bsi.txt b/src/build-data/policy/bsi.txt index f152186ce..f9950c7c7 100644 --- a/src/build-data/policy/bsi.txt +++ b/src/build-data/policy/bsi.txt @@ -101,6 +101,7 @@ noekeon_simd seed serpent serpent_simd +serpent_avx2 shacal2 shacal2_x86 shacal2_simd diff --git a/src/build-data/policy/modern.txt b/src/build-data/policy/modern.txt index c97b87e72..ae659087c 100644 --- a/src/build-data/policy/modern.txt +++ b/src/build-data/policy/modern.txt @@ -63,6 +63,7 @@ aes_ssse3 aes_armv8 aes_power8 serpent_simd +serpent_avx2 threefish_512_avx2 chacha_sse2 diff --git a/src/build-data/policy/nist.txt b/src/build-data/policy/nist.txt index c76587e82..d9f698e31 100644 --- a/src/build-data/policy/nist.txt +++ b/src/build-data/policy/nist.txt @@ -104,6 +104,7 @@ noekeon_simd seed serpent serpent_simd +serpent_avx2 sm4 shacal2 shacal2_x86 diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp index 39968e87e..d9001d19f 100644 --- a/src/lib/block/serpent/serpent.cpp +++ b/src/lib/block/serpent/serpent.cpp @@ -59,6 +59,19 @@ void Serpent::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_round_key.empty() == false); +#if defined(BOTAN_HAS_SERPENT_AVX2) + if(CPUID::has_avx2()) + { + while(blocks >= 8) + { + avx2_encrypt_8(in, out); + in += 8 * BLOCK_SIZE; + out += 8 * BLOCK_SIZE; + blocks -= 8; + } + } +#endif + #if defined(BOTAN_HAS_SERPENT_SIMD) if(CPUID::has_simd_32()) { @@ -121,6 +134,19 @@ void Serpent::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_round_key.empty() == false); +#if defined(BOTAN_HAS_SERPENT_AVX2) + if(CPUID::has_avx2()) + { + while(blocks >= 8) + { + avx2_decrypt_8(in, out); + in += 8 * BLOCK_SIZE; + out += 8 * BLOCK_SIZE; + blocks -= 8; + } + } +#endif + #if defined(BOTAN_HAS_SERPENT_SIMD) if(CPUID::has_simd_32()) { diff --git a/src/lib/block/serpent/serpent.h b/src/lib/block/serpent/serpent.h index 4d23c9a01..641ee0b9c 100644 --- a/src/lib/block/serpent/serpent.h +++ b/src/lib/block/serpent/serpent.h @@ -30,18 +30,17 @@ class BOTAN_PUBLIC_API(2,0) Serpent final : public Block_Cipher_Fixed_Params<16, size_t parallelism() const override { return 4; } private: + #if defined(BOTAN_HAS_SERPENT_SIMD) - /** - * Encrypt 4 blocks in parallel using SSE2 or AltiVec - */ void simd_encrypt_4(const uint8_t in[64], uint8_t out[64]) const; - - /** - * Decrypt 4 blocks in parallel using SSE2 or AltiVec - */ void simd_decrypt_4(const uint8_t in[64], uint8_t out[64]) const; #endif +#if defined(BOTAN_HAS_SERPENT_AVX2) + void avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const; + void avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const; +#endif + void key_schedule(const uint8_t key[], size_t length) override; secure_vector<uint32_t> m_round_key; diff --git a/src/lib/block/serpent/serpent_avx2/info.txt b/src/lib/block/serpent/serpent_avx2/info.txt new file mode 100644 index 000000000..8225e63a3 --- /dev/null +++ b/src/lib/block/serpent/serpent_avx2/info.txt @@ -0,0 +1,9 @@ +<defines> +SERPENT_AVX2 -> 20180824 +</defines> + +need_isa avx2 + +<requires> +simd_avx2 +</requires> diff --git a/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp new file mode 100644 index 000000000..4e4420d58 --- /dev/null +++ b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp @@ -0,0 +1,155 @@ +/* +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/serpent.h> +#include <botan/internal/serpent_sbox.h> +#include <botan/internal/simd_avx2.h> + +namespace Botan { + + +#define key_xor(round, B0, B1, B2, B3) \ + do { \ + B0 ^= SIMD_8x32::splat(m_round_key[4*round ]); \ + B1 ^= SIMD_8x32::splat(m_round_key[4*round+1]); \ + B2 ^= SIMD_8x32::splat(m_round_key[4*round+2]); \ + B3 ^= SIMD_8x32::splat(m_round_key[4*round+3]); \ + } while(0) + +/* +* Serpent's linear transformations +*/ +#define transform(B0, B1, B2, B3) \ + do { \ + B0 = B0.rotl<13>(); \ + B2 = B2.rotl<3>(); \ + B1 ^= B0 ^ B2; \ + B3 ^= B2 ^ B0.shl<3>(); \ + B1 = B1.rotl<1>(); \ + B3 = B3.rotl<7>(); \ + B0 ^= B1 ^ B3; \ + B2 ^= B3 ^ B1.shl<7>(); \ + B0 = B0.rotl<5>(); \ + B2 = B2.rotl<22>(); \ + } while(0) + +#define i_transform(B0, B1, B2, B3) \ + do { \ + B2 = B2.rotr<22>(); \ + B0 = B0.rotr<5>(); \ + B2 ^= B3 ^ B1.shl<7>(); \ + B0 ^= B1 ^ B3; \ + B3 = B3.rotr<7>(); \ + B1 = B1.rotr<1>(); \ + B3 ^= B2 ^ B0.shl<3>(); \ + B1 ^= B0 ^ B2; \ + B2 = B2.rotr<3>(); \ + B0 = B0.rotr<13>(); \ + } while(0) + +void Serpent::avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const + { + SIMD_8x32 B0 = SIMD_8x32::load_le(in); + SIMD_8x32 B1 = SIMD_8x32::load_le(in + 32); + SIMD_8x32 B2 = SIMD_8x32::load_le(in + 64); + SIMD_8x32 B3 = SIMD_8x32::load_le(in + 96); + + SIMD_8x32::transpose(B0, B1, B2, B3); + + key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(11,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(12,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(19,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(20,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(27,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(28,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); + + SIMD_8x32::transpose(B0, B1, B2, B3); + B0.store_le(out); + B1.store_le(out + 32); + B2.store_le(out + 64); + B3.store_le(out + 96); + } + +void Serpent::avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const + { + SIMD_8x32 B0 = SIMD_8x32::load_le(in); + SIMD_8x32 B1 = SIMD_8x32::load_le(in + 32); + SIMD_8x32 B2 = SIMD_8x32::load_le(in + 64); + SIMD_8x32 B3 = SIMD_8x32::load_le(in + 96); + + SIMD_8x32::transpose(B0, B1, B2, B3); + + key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); + + SIMD_8x32::transpose(B0, B1, B2, B3); + + B0.store_le(out); + B1.store_le(out + 32); + B2.store_le(out + 64); + B3.store_le(out + 96); + } + +} diff --git a/src/lib/utils/simd/simd_avx2/info.txt b/src/lib/utils/simd/simd_avx2/info.txt new file mode 100644 index 000000000..e3d043a12 --- /dev/null +++ b/src/lib/utils/simd/simd_avx2/info.txt @@ -0,0 +1,16 @@ +<defines> +SIMD_AVX2 -> 20180824 +</defines> + +need_isa avx2 + +<header:internal> +simd_avx2.h +</header:internal> + +<cc> +gcc +clang +msvc +icc +</cc> diff --git a/src/lib/utils/simd/simd_avx2/simd_avx2.h b/src/lib/utils/simd/simd_avx2/simd_avx2.h new file mode 100644 index 000000000..19f930854 --- /dev/null +++ b/src/lib/utils/simd/simd_avx2/simd_avx2.h @@ -0,0 +1,198 @@ +/* +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#ifndef BOTAN_SIMD_AVX2_H_ +#define BOTAN_SIMD_AVX2_H_ + +#include <botan/types.h> +#include <immintrin.h> + +namespace Botan { + +class SIMD_8x32 final + { + public: + + SIMD_8x32& operator=(const SIMD_8x32& other) = default; + SIMD_8x32(const SIMD_8x32& other) = default; + +#if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013) + SIMD_8x32& operator=(SIMD_8x32&& other) = default; + SIMD_8x32(SIMD_8x32&& other) = default; +#endif + + SIMD_8x32() + { + m_avx2 = _mm256_setzero_si256(); + } + + explicit SIMD_8x32(const uint32_t B[8]) + { + m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); + } + + static SIMD_8x32 splat(uint32_t B) + { + return SIMD_8x32(_mm256_set1_epi32(B)); + } + + static SIMD_8x32 load_le(const uint8_t* in) + { + return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); + } + + static SIMD_8x32 load_be(const uint8_t* in) + { + return load_le(in).bswap(); + } + + void store_le(uint8_t out[]) const + { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); + } + + void store_be(uint8_t out[]) const + { + bswap().store_le(out); + } + + template<size_t ROT> + SIMD_8x32 rotl() const + { + static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); + + return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), + _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); + } + + template<size_t ROT> + SIMD_8x32 rotr() const + { + return this->rotl<32-ROT>(); + } + + SIMD_8x32 operator+(const SIMD_8x32& other) const + { + SIMD_8x32 retval(*this); + retval += other; + return retval; + } + + SIMD_8x32 operator-(const SIMD_8x32& other) const + { + SIMD_8x32 retval(*this); + retval -= other; + return retval; + } + + SIMD_8x32 operator^(const SIMD_8x32& other) const + { + SIMD_8x32 retval(*this); + retval ^= other; + return retval; + } + + SIMD_8x32 operator|(const SIMD_8x32& other) const + { + SIMD_8x32 retval(*this); + retval |= other; + return retval; + } + + SIMD_8x32 operator&(const SIMD_8x32& other) const + { + SIMD_8x32 retval(*this); + retval &= other; + return retval; + } + + void operator+=(const SIMD_8x32& other) + { + m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); + } + + void operator-=(const SIMD_8x32& other) + { + m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); + } + + void operator^=(const SIMD_8x32& other) + { + m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); + } + + void operator|=(const SIMD_8x32& other) + { + m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); + } + + void operator&=(const SIMD_8x32& other) + { + m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); + } + + template<int SHIFT> SIMD_8x32 shl() const + { + return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); + } + + template<int SHIFT> SIMD_8x32 shr() const + { + return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); + } + + SIMD_8x32 operator~() const + { + return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); + } + + // (~reg) & other + SIMD_8x32 andc(const SIMD_8x32& other) const + { + return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); + } + + SIMD_8x32 bswap() const + { + const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 19, 18, 17, 16, + 23, 22, 21, 20, + 27, 26, 25, 24, + 31, 30, 29, 28 }; + + const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); + + const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); + + return SIMD_8x32(output); + } + + static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, + SIMD_8x32& B2, SIMD_8x32& B3) + { + const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); + const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); + const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); + const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); + + B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); + B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); + B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); + B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); + } + + private: + SIMD_8x32(__m256i x) : m_avx2(x) {} + + __m256i m_avx2; + }; + +} + +#endif diff --git a/src/tests/data/block/serpent.vec b/src/tests/data/block/serpent.vec index 9e6b9eb07..272519236 100644 --- a/src/tests/data/block/serpent.vec +++ b/src/tests/data/block/serpent.vec @@ -1,5 +1,5 @@ -#test cpuid simd +#test cpuid simd avx2 [Serpent] Key = 00000000000000000000000000000000 |