author | Jack Lloyd <[email protected]> | 2018-08-24 14:44:20 -0400
committer | Jack Lloyd <[email protected]> | 2018-08-24 14:44:20 -0400
commit | fb7849b2a2d48d950f327a5e8b5e89e1d49caf99 (patch)
tree | 94098a0d9ce4bf285f3c1ec5001275c56da70820 /src
parent | 162cf0b2cd41dcb866abff1bbf860ea253f89821 (diff)
WIP for Serpent AVX2
Diffstat (limited to 'src')
-rw-r--r-- | src/build-data/botan.doxy.in | 1
-rw-r--r-- | src/lib/block/serpent/serpent.cpp | 26
-rw-r--r-- | src/lib/block/serpent/serpent.h | 13
-rw-r--r-- | src/lib/block/serpent/serpent_avx2/info.txt | 12
-rw-r--r-- | src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp | 371

5 files changed, 416 insertions, 7 deletions
diff --git a/src/build-data/botan.doxy.in b/src/build-data/botan.doxy.in
index ec6ec6626..c3261a7a9 100644
--- a/src/build-data/botan.doxy.in
+++ b/src/build-data/botan.doxy.in
@@ -160,6 +160,7 @@ PREDEFINED = BOTAN_HAS_AES_ARMV8 \
                          BOTAN_HAS_IDEA_SSE2 \
                          BOTAN_HAS_NOEKEON_SIMD \
                          BOTAN_HAS_SERPENT_SIMD \
+                         BOTAN_HAS_SERPENT_AVX2 \
                          BOTAN_HAS_SHA1_SSE2 \
                          BOTAN_HAS_SHA2_32_X86 \
                          BOTAN_HAS_SHA2_32_X86_BMI2 \
diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp
index 39968e87e..ac95a8706 100644
--- a/src/lib/block/serpent/serpent.cpp
+++ b/src/lib/block/serpent/serpent.cpp
@@ -59,6 +59,19 @@ void Serpent::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
    verify_key_set(m_round_key.empty() == false);
 
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+   if(CPUID::has_avx2())
+      {
+      while(blocks >= 8)
+         {
+         avx2_encrypt_8(in, out);
+         in += 8 * BLOCK_SIZE;
+         out += 8 * BLOCK_SIZE;
+         blocks -= 8;
+         }
+      }
+#endif
+
 #if defined(BOTAN_HAS_SERPENT_SIMD)
    if(CPUID::has_simd_32())
       {
@@ -121,6 +134,19 @@ void Serpent::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
    verify_key_set(m_round_key.empty() == false);
 
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+   if(CPUID::has_avx2() && false)
+      {
+      while(blocks >= 8)
+         {
+         avx2_decrypt_8(in, out);
+         in += 8 * BLOCK_SIZE;
+         out += 8 * BLOCK_SIZE;
+         blocks -= 8;
+         }
+      }
+#endif
+
 #if defined(BOTAN_HAS_SERPENT_SIMD)
    if(CPUID::has_simd_32())
       {
diff --git a/src/lib/block/serpent/serpent.h b/src/lib/block/serpent/serpent.h
index 4d23c9a01..641ee0b9c 100644
--- a/src/lib/block/serpent/serpent.h
+++ b/src/lib/block/serpent/serpent.h
@@ -30,18 +30,17 @@ class BOTAN_PUBLIC_API(2,0) Serpent final : public Block_Cipher_Fixed_Params<16,
       size_t parallelism() const override { return 4; }
 
    private:
+
 #if defined(BOTAN_HAS_SERPENT_SIMD)
-      /**
-      * Encrypt 4 blocks in parallel using SSE2 or AltiVec
-      */
       void simd_encrypt_4(const uint8_t in[64], uint8_t out[64]) const;
-
-      /**
-      * Decrypt 4 blocks in parallel using SSE2 or AltiVec
-      */
       void simd_decrypt_4(const uint8_t in[64], uint8_t out[64]) const;
 #endif
 
+#if defined(BOTAN_HAS_SERPENT_AVX2)
+      void avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const;
+      void avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const;
+#endif
+
       void key_schedule(const uint8_t key[], size_t length) override;
 
       secure_vector<uint32_t> m_round_key;
diff --git a/src/lib/block/serpent/serpent_avx2/info.txt b/src/lib/block/serpent/serpent_avx2/info.txt
new file mode 100644
index 000000000..8c1ac19ef
--- /dev/null
+++ b/src/lib/block/serpent/serpent_avx2/info.txt
@@ -0,0 +1,12 @@
+<defines>
+SERPENT_AVX2 -> 20180824
+</defines>
+
+need_isa avx2
+
+<cc>
+gcc
+clang
+msvc
+icc
+</cc>
diff --git a/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp
new file mode 100644
index 000000000..88a5b4316
--- /dev/null
+++ b/src/lib/block/serpent/serpent_avx2/serpent_avx2.cpp
@@ -0,0 +1,371 @@
+/*
+* (C) 2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/serpent.h>
+#include <botan/internal/serpent_sbox.h>
+#include <botan/internal/simd_32.h>
+
+namespace Botan {
+
+namespace {
+
+class SIMD_8x32 final
+   {
+   public:
+
+      SIMD_8x32& operator=(const SIMD_8x32& other) = default;
+      SIMD_8x32(const SIMD_8x32& other) = default;
+
+#if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013)
+      SIMD_8x32& operator=(SIMD_8x32&& other) = default;
+      SIMD_8x32(SIMD_8x32&& other) = default;
+#endif
+
+      SIMD_8x32() : m_lo(), m_hi()
+         {
+         }
+
+      SIMD_8x32(const SIMD_4x32& l, const SIMD_4x32& h) : m_lo(l), m_hi(h)
+         {
+         }
+
+      /**
+      * Load SIMD register with 8 32-bit elements
+      */
+      explicit SIMD_8x32(const uint32_t B[8]) : m_lo(B), m_hi(B + 4)
+         {
+         }
+
+      /**
+      * Load SIMD register with 8 32-bit elements
+      */
+      SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3,
+                uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) :
+         m_lo(B0, B1, B2, B3),
+         m_hi(B4, B5, B6, B7)
+         {
+         }
+
+      /**
+      * Load SIMD register with one 32-bit element repeated
+      */
+      static SIMD_8x32 splat(uint32_t B)
+         {
+         SIMD_4x32 s = SIMD_4x32::splat(B);
+         return SIMD_8x32(s, s);
+         }
+
+      /**
+      * Load a SIMD register with little-endian convention
+      */
+      static SIMD_8x32 load_le(const uint8_t* in)
+         {
+         return SIMD_8x32(SIMD_4x32::load_le(in), SIMD_4x32::load_le(in + 16));
+         }
+
+      /**
+      * Load a SIMD register with big-endian convention
+      */
+      static SIMD_8x32 load_be(const uint8_t* in)
+         {
+         return SIMD_8x32(SIMD_4x32::load_be(in), SIMD_4x32::load_be(in + 16));
+         }
+
+      /**
+      * Store a SIMD register with little-endian convention
+      */
+      void store_le(uint8_t out[]) const
+         {
+         m_lo.store_le(out);
+         m_hi.store_le(out + 16);
+         }
+
+      /**
+      * Store a SIMD register with big-endian convention
+      */
+      void store_be(uint8_t out[]) const
+         {
+         m_lo.store_be(out);
+         m_hi.store_be(out + 16);
+         }
+
+      /**
+      * Left rotation by a compile time constant
+      */
+      template<size_t ROT>
+      SIMD_8x32 rotl() const
+         {
+         static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
+         return SIMD_8x32(m_lo.rotl<ROT>(), m_hi.rotl<ROT>());
+         }
+
+      /**
+      * Right rotation by a compile time constant
+      */
+      template<size_t ROT>
+      SIMD_8x32 rotr() const
+         {
+         return this->rotl<32-ROT>();
+         }
+
+      /**
+      * Add elements of a SIMD vector
+      */
+      SIMD_8x32 operator+(const SIMD_8x32& other) const
+         {
+         SIMD_8x32 retval(*this);
+         retval += other;
+         return retval;
+         }
+
+      /**
+      * Subtract elements of a SIMD vector
+      */
+      SIMD_8x32 operator-(const SIMD_8x32& other) const
+         {
+         SIMD_8x32 retval(*this);
+         retval -= other;
+         return retval;
+         }
+
+      /**
+      * XOR elements of a SIMD vector
+      */
+      SIMD_8x32 operator^(const SIMD_8x32& other) const
+         {
+         SIMD_8x32 retval(*this);
+         retval ^= other;
+         return retval;
+         }
+
+      /**
+      * Binary OR elements of a SIMD vector
+      */
+      SIMD_8x32 operator|(const SIMD_8x32& other) const
+         {
+         SIMD_8x32 retval(*this);
+         retval |= other;
+         return retval;
+         }
+
+      /**
+      * Binary AND elements of a SIMD vector
+      */
+      SIMD_8x32 operator&(const SIMD_8x32& other) const
+         {
+         SIMD_8x32 retval(*this);
+         retval &= other;
+         return retval;
+         }
+
+      void operator+=(const SIMD_8x32& other)
+         {
+         m_lo += other.m_lo;
+         m_hi += other.m_hi;
+         }
+
+      void operator-=(const SIMD_8x32& other)
+         {
+         m_lo -= other.m_lo;
+         m_hi -= other.m_hi;
+         }
+
+      void operator^=(const SIMD_8x32& other)
+         {
+         m_lo ^= other.m_lo;
+         m_hi ^= other.m_hi;
+         }
+
+      void operator|=(const SIMD_8x32& other)
+         {
+         m_lo |= other.m_lo;
+         m_hi |= other.m_hi;
+         }
+
+      void operator&=(const SIMD_8x32& other)
+         {
+         m_lo &= other.m_lo;
+         m_hi &= other.m_hi;
+         }
+
+      template<int SHIFT> SIMD_8x32 shl() const
+         {
+         return SIMD_8x32(m_lo.shl<SHIFT>(), m_hi.shl<SHIFT>());
+         }
+
+      template<int SHIFT> SIMD_8x32 shr() const
+         {
+         return SIMD_8x32(m_lo.shr<SHIFT>(), m_hi.shr<SHIFT>());
+         }
+
+      SIMD_8x32 operator~() const
+         {
+         return SIMD_8x32(~m_lo, ~m_hi);
+         }
+
+      // (~reg) & other
+      SIMD_8x32 andc(const SIMD_8x32& other) const
+         {
+         return SIMD_8x32(m_lo.andc(other.m_lo), m_hi.andc(other.m_hi));
+         }
+
+      /**
+      * Return a copy of *this with each word byte swapped
+      */
+      SIMD_8x32 bswap() const
+         {
+         return SIMD_8x32(m_lo.bswap(), m_hi.bswap());
+         }
+
+      static void transpose(SIMD_8x32& B0, SIMD_8x32& B1,
+                            SIMD_8x32& B2, SIMD_8x32& B3)
+         {
+         SIMD_4x32::transpose(B0.m_lo, B1.m_lo, B2.m_lo, B3.m_lo);
+         SIMD_4x32::transpose(B0.m_hi, B1.m_hi, B2.m_hi, B3.m_hi);
+         }
+   public:
+      SIMD_4x32 m_lo, m_hi;
+   };
+
+}
+
+#define key_xor(round, B0, B1, B2, B3)                 \
+   do {                                                \
+      B0 ^= SIMD_8x32::splat(m_round_key[4*round  ]);  \
+      B1 ^= SIMD_8x32::splat(m_round_key[4*round+1]);  \
+      B2 ^= SIMD_8x32::splat(m_round_key[4*round+2]);  \
+      B3 ^= SIMD_8x32::splat(m_round_key[4*round+3]);  \
+   } while(0)
+
+/*
+* Serpent's linear transformations
+*/
+#define transform(B0, B1, B2, B3)                      \
+   do {                                                \
+      B0 = B0.rotl<13>();                              \
+      B2 = B2.rotl<3>();                               \
+      B1 ^= B0 ^ B2;                                   \
+      B3 ^= B2 ^ B0.shl<3>();                          \
+      B1 = B1.rotl<1>();                               \
+      B3 = B3.rotl<7>();                               \
+      B0 ^= B1 ^ B3;                                   \
+      B2 ^= B3 ^ B1.shl<7>();                          \
+      B0 = B0.rotl<5>();                               \
+      B2 = B2.rotl<22>();                              \
+   } while(0)
+
+#define i_transform(B0, B1, B2, B3)                    \
+   do {                                                \
+      B2 = B2.rotr<22>();                              \
+      B0 = B0.rotr<5>();                               \
+      B2 ^= B3 ^ B1.shl<7>();                          \
+      B0 ^= B1 ^ B3;                                   \
+      B3 = B3.rotr<7>();                               \
+      B1 = B1.rotr<1>();                               \
+      B3 ^= B2 ^ B0.shl<3>();                          \
+      B1 ^= B0 ^ B2;                                   \
+      B2 = B2.rotr<3>();                               \
+      B0 = B0.rotr<13>();                              \
+   } while(0)
+
+void Serpent::avx2_encrypt_8(const uint8_t in[64], uint8_t out[64]) const
+   {
+   SIMD_32 B0 = SIMD_32::load_le(in);
+   SIMD_32 B1 = SIMD_32::load_le(in + 16);
+   SIMD_32 B2 = SIMD_32::load_le(in + 32);
+   SIMD_32 B3 = SIMD_32::load_le(in + 48);
+
+   SIMD_32 C0 = SIMD_32::load_le(in + 64);
+   SIMD_32 C1 = SIMD_32::load_le(in + 64 + 16);
+   SIMD_32 C2 = SIMD_32::load_le(in + 64 + 32);
+   SIMD_32 C3 = SIMD_32::load_le(in + 64 + 48);
+
+   SIMD_32::transpose(B0, B1, B2, B3);
+   SIMD_32::transpose(C0, C1, C2, C3);
+
+#if 0
+   SIMD_8x32 Z0 = SIMD_8x32::load_le(in);
+   SIMD_8x32 Z1 = SIMD_8x32::load_le(in + 32);
+   SIMD_8x32 Z2 = SIMD_8x32::load_le(in + 64);
+   SIMD_8x32 Z3 = SIMD_8x32::load_le(in + 96);
+
+   SIMD_8x32::transpose(Z0, Z1, Z2, Z3);
+   #else
+   SIMD_8x32 Z0(B0,C0);
+   SIMD_8x32 Z1(B1,C1);
+   SIMD_8x32 Z2(B2,C2);
+   SIMD_8x32 Z3(B3,C3);
+   #endif
+
+   key_xor( 0,Z0,Z1,Z2,Z3); SBoxE1(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 1,Z0,Z1,Z2,Z3); SBoxE2(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 2,Z0,Z1,Z2,Z3); SBoxE3(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 3,Z0,Z1,Z2,Z3); SBoxE4(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 4,Z0,Z1,Z2,Z3); SBoxE5(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 5,Z0,Z1,Z2,Z3); SBoxE6(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 6,Z0,Z1,Z2,Z3); SBoxE7(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 7,Z0,Z1,Z2,Z3); SBoxE8(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+
+
+   key_xor( 8,Z0,Z1,Z2,Z3); SBoxE1(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor( 9,Z0,Z1,Z2,Z3); SBoxE2(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(10,Z0,Z1,Z2,Z3); SBoxE3(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(11,Z0,Z1,Z2,Z3); SBoxE4(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(12,Z0,Z1,Z2,Z3); SBoxE5(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(13,Z0,Z1,Z2,Z3); SBoxE6(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(14,Z0,Z1,Z2,Z3); SBoxE7(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(15,Z0,Z1,Z2,Z3); SBoxE8(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+
+   key_xor(16,Z0,Z1,Z2,Z3); SBoxE1(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(17,Z0,Z1,Z2,Z3); SBoxE2(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(18,Z0,Z1,Z2,Z3); SBoxE3(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(19,Z0,Z1,Z2,Z3); SBoxE4(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(20,Z0,Z1,Z2,Z3); SBoxE5(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(21,Z0,Z1,Z2,Z3); SBoxE6(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(22,Z0,Z1,Z2,Z3); SBoxE7(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(23,Z0,Z1,Z2,Z3); SBoxE8(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+
+   key_xor(24,Z0,Z1,Z2,Z3); SBoxE1(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(25,Z0,Z1,Z2,Z3); SBoxE2(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(26,Z0,Z1,Z2,Z3); SBoxE3(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(27,Z0,Z1,Z2,Z3); SBoxE4(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(28,Z0,Z1,Z2,Z3); SBoxE5(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(29,Z0,Z1,Z2,Z3); SBoxE6(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(30,Z0,Z1,Z2,Z3); SBoxE7(Z0,Z1,Z2,Z3); transform(Z0,Z1,Z2,Z3);
+   key_xor(31,Z0,Z1,Z2,Z3); SBoxE8(Z0,Z1,Z2,Z3); key_xor(32,Z0,Z1,Z2,Z3);
+
+   B0 = Z0.m_lo;
+   B1 = Z1.m_lo;
+   B2 = Z2.m_lo;
+   B3 = Z3.m_lo;
+   C0 = Z0.m_hi;
+   C1 = Z1.m_hi;
+   C2 = Z2.m_hi;
+   C3 = Z3.m_hi;
+
+   SIMD_32::transpose(B0, B1, B2, B3);
+   SIMD_32::transpose(C0, C1, C2, C3);
+
+   B0.store_le(out);
+   B1.store_le(out + 16);
+   B2.store_le(out + 32);
+   B3.store_le(out + 48);
+
+   C0.store_le(out + 64);
+   C1.store_le(out + 16 + 64);
+   C2.store_le(out + 32 + 64);
+   C3.store_le(out + 48 + 64);
+   }
+
+/*
+* SIMD Serpent Decryption of 8 blocks in parallel
+*/
+void Serpent::avx2_decrypt_8(const uint8_t in[64], uint8_t out[64]) const
+   {
+
+   }
+
+}
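A note on the WIP above: the SIMD_8x32 class in this patch is just a pair of SIMD_4x32 values, so every operation still issues two 128-bit instructions; the in[64]/out[64] extents on the 8-block functions, the empty avx2_decrypt_8() and the `&& false` guard in decrypt_n() likewise mark unfinished parts. For orientation only, here is a minimal sketch of what a natively 256-bit version of the same interface could look like when written directly against AVX2 intrinsics. The class name SIMD_8x32_AVX2 and the chosen subset of methods are illustrative assumptions, not Botan API, and the translation unit would need AVX2 code generation enabled (e.g. -mavx2 or an equivalent per-function ISA attribute).

// Hypothetical sketch only: a 256-bit backed subset of the SIMD_8x32
// interface above, built on AVX2 intrinsics rather than two SIMD_4x32.
#include <immintrin.h>
#include <cstdint>

class SIMD_8x32_AVX2
   {
   public:
      explicit SIMD_8x32_AVX2(__m256i v) : m_v(v) {}

      // Broadcast one 32-bit word to all 8 lanes (used for the round key XOR)
      static SIMD_8x32_AVX2 splat(uint32_t x)
         {
         return SIMD_8x32_AVX2(_mm256_set1_epi32(static_cast<int>(x)));
         }

      // Load/store 8 little-endian 32-bit words, unaligned
      static SIMD_8x32_AVX2 load_le(const uint8_t* in)
         {
         return SIMD_8x32_AVX2(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
         }

      void store_le(uint8_t out[]) const
         {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_v);
         }

      // Rotate each 32-bit lane left by a compile-time constant
      template<size_t ROT>
      SIMD_8x32_AVX2 rotl() const
         {
         static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
         return SIMD_8x32_AVX2(_mm256_or_si256(_mm256_slli_epi32(m_v, ROT),
                                               _mm256_srli_epi32(m_v, 32 - ROT)));
         }

      // Logical left shift of each 32-bit lane
      template<int SHIFT> SIMD_8x32_AVX2 shl() const
         {
         return SIMD_8x32_AVX2(_mm256_slli_epi32(m_v, SHIFT));
         }

      SIMD_8x32_AVX2 operator^(const SIMD_8x32_AVX2& o) const
         {
         return SIMD_8x32_AVX2(_mm256_xor_si256(m_v, o.m_v));
         }

      void operator^=(const SIMD_8x32_AVX2& o)
         {
         m_v = _mm256_xor_si256(m_v, o.m_v);
         }

   private:
      __m256i m_v;
   };

With a single __m256i per value, each key_xor/transform step in avx2_encrypt_8 would touch all eight blocks per instruction and one 256-bit load or store would replace a pair of 128-bit ones; the piece this sketch leaves out is the 8-block transpose, which can be built from 32-bit unpack intrinsics plus a 128-bit lane permute.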