aboutsummaryrefslogtreecommitdiffstats
path: root/src/block/serpent_simd
diff options
context:
space:
mode:
Diffstat (limited to 'src/block/serpent_simd')
-rw-r--r--src/block/serpent_simd/info.txt8
-rw-r--r--src/block/serpent_simd/serp_simd.cpp206
-rw-r--r--src/block/serpent_simd/serp_simd.h29
-rw-r--r--src/block/serpent_simd/serp_simd_sbox.h426
4 files changed, 669 insertions, 0 deletions
diff --git a/src/block/serpent_simd/info.txt b/src/block/serpent_simd/info.txt
new file mode 100644
index 000000000..5d9115a4d
--- /dev/null
+++ b/src/block/serpent_simd/info.txt
@@ -0,0 +1,8 @@
+realname "Serpent (SIMD)"
+define SERPENT_SIMD
+
+<requires>
+serpent
+simd_32
+simd_engine
+</requires>
diff --git a/src/block/serpent_simd/serp_simd.cpp b/src/block/serpent_simd/serp_simd.cpp
new file mode 100644
index 000000000..b394b0c26
--- /dev/null
+++ b/src/block/serpent_simd/serp_simd.cpp
@@ -0,0 +1,206 @@
+/*
+* Serpent (SIMD)
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#include <botan/serp_simd.h>
+#include <botan/serp_simd_sbox.h>
+#include <botan/simd_32.h>
+#include <botan/loadstor.h>
+
+namespace Botan {
+
+namespace {
+
+#define key_xor(round, B0, B1, B2, B3) /* XOR the four words of round key `round` (keys[4*round .. 4*round+3]) into B0..B3; relies on `keys` being in scope at the call site; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B0 ^= SIMD_32(keys[4*(round) ]); \
+ B1 ^= SIMD_32(keys[4*(round)+1]); \
+ B2 ^= SIMD_32(keys[4*(round)+2]); \
+ B3 ^= SIMD_32(keys[4*(round)+3]); \
+ } while(0)
+
+/*
+* Serpent's linear transformation, forward direction: mixes B0..B3 in place (expands to one statement, the caller supplies the ';')
+*/
+#define transform(B0, B1, B2, B3) \
+ do { \
+ B0.rotate_left(13); \
+ B2.rotate_left(3); \
+ B1 ^= B0 ^ B2; \
+ B3 ^= B2 ^ (B0 << 3); \
+ B1.rotate_left(1); \
+ B3.rotate_left(7); \
+ B0 ^= B1 ^ B3; \
+ B2 ^= B3 ^ (B1 << 7); \
+ B0.rotate_left(5); \
+ B2.rotate_left(22); \
+ } while(0)
+
+#define i_transform(B0, B1, B2, B3) /* Inverse of transform(): the same steps undone in reverse order, with right rotations in place of left ones; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B2.rotate_right(22); \
+ B0.rotate_right(5); \
+ B2 ^= B3 ^ (B1 << 7); \
+ B0 ^= B1 ^ B3; \
+ B3.rotate_right(7); \
+ B1.rotate_right(1); \
+ B3 ^= B2 ^ (B0 << 3); \
+ B1 ^= B0 ^ B2; \
+ B2.rotate_right(3); \
+ B0.rotate_right(13); \
+ } while(0)
+
+/*
+* SIMD Serpent Encryption of 4 blocks in parallel: reads 64 bytes from in, writes 64 bytes to out, uses round keys 0..32 (4 words each, 132 words in total)
+*/
+void serpent_encrypt_4(const byte in[64],
+ byte out[64],
+ const u32bit keys[132])
+ {
+ SIMD_32 B0 = SIMD_32::load_le(in); // four loads at 16-byte offsets cover the whole 64-byte input
+ SIMD_32 B1 = SIMD_32::load_le(in + 16);
+ SIMD_32 B2 = SIMD_32::load_le(in + 32);
+ SIMD_32 B3 = SIMD_32::load_le(in + 48);
+ // NOTE(review): transpose presumably regroups the data so that Bi holds word i of every block - confirm in simd_32.h
+ SIMD_32::transpose(B0, B1, B2, B3);
+ // Rounds 0..30: XOR in round key r, apply S-box E((r mod 8)+1), then the linear transform
+ key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(11,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(12,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(19,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(20,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(27,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(28,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); // round 31 ends with round key 32 instead of the linear transform
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+ // The result is stored with the same 16-byte offsets that were used for loading
+ B0.store_le(out);
+ B1.store_le(out + 16);
+ B2.store_le(out + 32);
+ B3.store_le(out + 48);
+ }
+
+/*
+* SIMD Serpent Decryption of 4 blocks in parallel: reads 64 bytes from in, writes 64 bytes to out, walks round keys 32 down to 0 (4 words each, 132 words in total)
+*/
+void serpent_decrypt_4(const byte in[64],
+ byte out[64],
+ const u32bit keys[132])
+ {
+ SIMD_32 B0 = SIMD_32::load_le(in); // four loads at 16-byte offsets cover the whole 64-byte input
+ SIMD_32 B1 = SIMD_32::load_le(in + 16);
+ SIMD_32 B2 = SIMD_32::load_le(in + 32);
+ SIMD_32 B3 = SIMD_32::load_le(in + 48);
+ // NOTE(review): transpose presumably regroups the data so that Bi holds word i of every block - confirm in simd_32.h
+ SIMD_32::transpose(B0, B1, B2, B3);
+ // Mirror image of serpent_encrypt_4: the first line undoes round 31 (keys 32 and 31, no linear transform)
+ key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); // every later line: inverse linear transform, S-box D((r mod 8)+1), round key r
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+ // The result is stored with the same 16-byte offsets that were used for loading
+ B0.store_le(out);
+ B1.store_le(out + 16);
+ B2.store_le(out + 32);
+ B3.store_le(out + 48);
+ }
+
+}
+
+/*
+* Serpent Encryption: whole groups of four blocks go through the SIMD routine, any remainder through the base class
+*/
+void Serpent_SIMD::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+ {
+ const u32bit stride = 4 * BLOCK_SIZE; // bytes handled by one serpent_encrypt_4 call
+ for(u32bit groups = blocks / 4; groups != 0; --groups)
+ {
+ serpent_encrypt_4(in, out, this->round_key);
+ in += stride;
+ out += stride;
+ }
+ // Zero to three blocks are left over at this point
+ Serpent::encrypt_n(in, out, blocks % 4);
+ }
+
+/*
+* Serpent Decryption: whole groups of four blocks go through the SIMD routine, any remainder through the base class
+*/
+void Serpent_SIMD::decrypt_n(const byte in[], byte out[], u32bit blocks) const
+ {
+ const u32bit stride = 4 * BLOCK_SIZE; // bytes handled by one serpent_decrypt_4 call
+ for(u32bit groups = blocks / 4; groups != 0; --groups)
+ {
+ serpent_decrypt_4(in, out, this->round_key);
+ in += stride;
+ out += stride;
+ }
+ // Zero to three blocks are left over at this point
+ Serpent::decrypt_n(in, out, blocks % 4);
+ }
+
+}
diff --git a/src/block/serpent_simd/serp_simd.h b/src/block/serpent_simd/serp_simd.h
new file mode 100644
index 000000000..1ecb70159
--- /dev/null
+++ b/src/block/serpent_simd/serp_simd.h
@@ -0,0 +1,29 @@
+/*
+* Serpent (SIMD)
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_SERPENT_SIMD_H__
+#define BOTAN_SERPENT_SIMD_H__
+
+#include <botan/serpent.h>
+
+namespace Botan {
+
+/*
+* Serpent (SIMD): derives from Serpent and declares its own encrypt_n/decrypt_n, which are defined in serp_simd.cpp
+*/
+class BOTAN_DLL Serpent_SIMD : public Serpent
+ {
+ public:
+ void encrypt_n(const byte in[], byte out[], u32bit blocks) const; // blocks = number of cipher blocks to encrypt from in to out
+ void decrypt_n(const byte in[], byte out[], u32bit blocks) const; // blocks = number of cipher blocks to decrypt from in to out
+
+ BlockCipher* clone() const { return new Serpent_SIMD; } // returns a default-constructed Serpent_SIMD allocated with new
+ };
+
+}
+
+#endif
diff --git a/src/block/serpent_simd/serp_simd_sbox.h b/src/block/serpent_simd/serp_simd_sbox.h
new file mode 100644
index 000000000..6e3da7359
--- /dev/null
+++ b/src/block/serpent_simd/serp_simd_sbox.h
@@ -0,0 +1,426 @@
+/*
+* Serpent Sboxes in SIMD form
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef SERPENT_SIMD_SBOXES_H__
+#define SERPENT_SIMD_SBOXES_H__
+
+#define SBoxE1(B0, B1, B2, B3) /* Serpent encryption S-box 1 (rounds 0, 8, 16, 24): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B3 ^= B0; \
+ SIMD_32 B4 = B1; \
+ B1 &= B3; \
+ B4 ^= B2; \
+ B1 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B4; \
+ B4 ^= B3; \
+ B3 ^= B2; \
+ B2 |= B1; \
+ B2 ^= B4; \
+ B4 = ~B4; \
+ B4 |= B1; \
+ B1 ^= B3; \
+ B1 ^= B4; \
+ B3 |= B0; \
+ B1 ^= B3; \
+ B4 ^= B3; \
+ B3 = B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0)
+
+#define SBoxE2(B0, B1, B2, B3) /* Serpent encryption S-box 2 (rounds 1, 9, 17, 25): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B0 = ~B0; \
+ B2 = ~B2; \
+ SIMD_32 B4 = B0; \
+ B0 &= B1; \
+ B2 ^= B0; \
+ B0 |= B3; \
+ B3 ^= B2; \
+ B1 ^= B0; \
+ B0 ^= B4; \
+ B4 |= B1; \
+ B1 ^= B3; \
+ B2 |= B0; \
+ B2 &= B4; \
+ B0 ^= B1; \
+ B1 &= B2; \
+ B1 ^= B0; \
+ B0 &= B2; \
+ B4 ^= B0; \
+ B0 = B2; \
+ B2 = B3; \
+ B3 = B1; \
+ B1 = B4; \
+ } while(0)
+
+#define SBoxE3(B0, B1, B2, B3) /* Serpent encryption S-box 3 (rounds 2, 10, 18, 26): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B0; \
+ B0 &= B2; \
+ B0 ^= B3; \
+ B2 ^= B1; \
+ B2 ^= B0; \
+ B3 |= B4; \
+ B3 ^= B1; \
+ B4 ^= B2; \
+ B1 = B3; \
+ B3 |= B4; \
+ B3 ^= B0; \
+ B0 &= B1; \
+ B4 ^= B0; \
+ B1 ^= B3; \
+ B1 ^= B4; \
+ B4 = ~B4; \
+ B0 = B2; \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxE4(B0, B1, B2, B3) /* Serpent encryption S-box 4 (rounds 3, 11, 19, 27): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B0; \
+ B0 |= B3; \
+ B3 ^= B1; \
+ B1 &= B4; \
+ B4 ^= B2; \
+ B2 ^= B3; \
+ B3 &= B0; \
+ B4 |= B1; \
+ B3 ^= B4; \
+ B0 ^= B1; \
+ B4 &= B0; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B1 |= B0; \
+ B1 ^= B2; \
+ B0 ^= B3; \
+ B2 = B1; \
+ B1 |= B3; \
+ B0 ^= B1; \
+ B1 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxE5(B0, B1, B2, B3) /* Serpent encryption S-box 5 (rounds 4, 12, 20, 28): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B1 ^= B3; \
+ B3 = ~B3; \
+ B2 ^= B3; \
+ B3 ^= B0; \
+ SIMD_32 B4 = B1; \
+ B1 &= B3; \
+ B1 ^= B2; \
+ B4 ^= B3; \
+ B0 ^= B4; \
+ B2 &= B4; \
+ B2 ^= B0; \
+ B0 &= B1; \
+ B3 ^= B0; \
+ B4 |= B1; \
+ B4 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B2; \
+ B2 &= B3; \
+ B0 = ~B0; \
+ B4 ^= B2; \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0)
+
+#define SBoxE6(B0, B1, B2, B3) /* Serpent encryption S-box 6 (rounds 5, 13, 21, 29): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B0 ^= B1; \
+ B1 ^= B3; \
+ B3 = ~B3; \
+ SIMD_32 B4 = B1; \
+ B1 &= B0; \
+ B2 ^= B3; \
+ B1 ^= B2; \
+ B2 |= B4; \
+ B4 ^= B3; \
+ B3 &= B1; \
+ B3 ^= B0; \
+ B4 ^= B1; \
+ B4 ^= B2; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B2 = ~B2; \
+ B0 ^= B4; \
+ B4 |= B3; \
+ B4 ^= B2; \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxE7(B0, B1, B2, B3) /* Serpent encryption S-box 7 (rounds 6, 14, 22, 30): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B2 = ~B2; \
+ SIMD_32 B4 = B3; \
+ B3 &= B0; \
+ B0 ^= B4; \
+ B3 ^= B2; \
+ B2 |= B4; \
+ B1 ^= B3; \
+ B2 ^= B0; \
+ B0 |= B1; \
+ B2 ^= B1; \
+ B4 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B2; \
+ B4 ^= B3; \
+ B4 ^= B0; \
+ B3 = ~B3; \
+ B2 &= B4; \
+ B3 ^= B2; \
+ B2 = B4; \
+ } while(0)
+
+#define SBoxE8(B0, B1, B2, B3) /* Serpent encryption S-box 8 (rounds 7, 15, 23, 31): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B1; \
+ B1 |= B2; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B2 ^= B1; \
+ B3 |= B4; \
+ B3 &= B0; \
+ B4 ^= B2; \
+ B3 ^= B1; \
+ B1 |= B4; \
+ B1 ^= B0; \
+ B0 |= B4; \
+ B0 ^= B2; \
+ B1 ^= B4; \
+ B2 ^= B1; \
+ B1 &= B0; \
+ B1 ^= B4; \
+ B2 = ~B2; \
+ B2 |= B0; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = B0; \
+ B0 = B4; \
+ } while(0)
+
+#define SBoxD1(B0, B1, B2, B3) /* Serpent decryption S-box 1 (used when undoing rounds 24, 16, 8, 0): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B2 = ~B2; \
+ SIMD_32 B4 = B1; \
+ B1 |= B0; \
+ B4 = ~B4; \
+ B1 ^= B2; \
+ B2 |= B4; \
+ B1 ^= B3; \
+ B0 ^= B4; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B4 ^= B0; \
+ B0 |= B1; \
+ B0 ^= B2; \
+ B3 ^= B4; \
+ B2 ^= B1; \
+ B3 ^= B0; \
+ B3 ^= B1; \
+ B2 &= B3; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B4; \
+ } while(0)
+
+#define SBoxD2(B0, B1, B2, B3) /* Serpent decryption S-box 2 (used when undoing rounds 25, 17, 9, 1): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B1; \
+ B1 ^= B3; \
+ B3 &= B1; \
+ B4 ^= B2; \
+ B3 ^= B0; \
+ B0 |= B1; \
+ B2 ^= B3; \
+ B0 ^= B4; \
+ B0 |= B2; \
+ B1 ^= B3; \
+ B0 ^= B1; \
+ B1 |= B3; \
+ B1 ^= B0; \
+ B4 = ~B4; \
+ B4 ^= B1; \
+ B1 |= B0; \
+ B1 ^= B0; \
+ B1 |= B4; \
+ B3 ^= B1; \
+ B1 = B0; \
+ B0 = B4; \
+ B4 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxD3(B0, B1, B2, B3) /* Serpent decryption S-box 3 (used when undoing rounds 26, 18, 10, 2): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B2 ^= B3; \
+ B3 ^= B0; \
+ SIMD_32 B4 = B3; \
+ B3 &= B2; \
+ B3 ^= B1; \
+ B1 |= B2; \
+ B1 ^= B4; \
+ B4 &= B3; \
+ B2 ^= B3; \
+ B4 &= B0; \
+ B4 ^= B2; \
+ B2 &= B1; \
+ B2 |= B0; \
+ B3 = ~B3; \
+ B2 ^= B3; \
+ B0 ^= B3; \
+ B0 &= B1; \
+ B3 ^= B4; \
+ B3 ^= B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0)
+
+#define SBoxD4(B0, B1, B2, B3) /* Serpent decryption S-box 4 (used when undoing rounds 27, 19, 11, 3): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B2; \
+ B2 ^= B1; \
+ B0 ^= B2; \
+ B4 &= B2; \
+ B4 ^= B0; \
+ B0 &= B1; \
+ B1 ^= B3; \
+ B3 |= B4; \
+ B2 ^= B3; \
+ B0 ^= B3; \
+ B1 ^= B4; \
+ B3 &= B2; \
+ B3 ^= B1; \
+ B1 ^= B0; \
+ B1 |= B2; \
+ B0 ^= B3; \
+ B1 ^= B4; \
+ B0 ^= B1; \
+ B4 = B0; \
+ B0 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxD5(B0, B1, B2, B3) /* Serpent decryption S-box 5 (used when undoing rounds 28, 20, 12, 4): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B2; \
+ B2 &= B3; \
+ B2 ^= B1; \
+ B1 |= B3; \
+ B1 &= B0; \
+ B4 ^= B2; \
+ B4 ^= B1; \
+ B1 &= B2; \
+ B0 = ~B0; \
+ B3 ^= B4; \
+ B1 ^= B3; \
+ B3 &= B0; \
+ B3 ^= B2; \
+ B0 ^= B1; \
+ B2 &= B0; \
+ B3 ^= B0; \
+ B2 ^= B4; \
+ B2 |= B3; \
+ B3 ^= B0; \
+ B2 ^= B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#define SBoxD6(B0, B1, B2, B3) /* Serpent decryption S-box 6 (used when undoing rounds 29, 21, 13, 5): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B1 = ~B1; \
+ SIMD_32 B4 = B3; \
+ B2 ^= B1; \
+ B3 |= B0; \
+ B3 ^= B2; \
+ B2 |= B1; \
+ B2 &= B0; \
+ B4 ^= B3; \
+ B2 ^= B4; \
+ B4 |= B0; \
+ B4 ^= B1; \
+ B1 &= B2; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B3 &= B4; \
+ B4 ^= B1; \
+ B3 ^= B4; \
+ B4 = ~B4; \
+ B3 ^= B0; \
+ B0 = B1; \
+ B1 = B4; \
+ B4 = B3; \
+ B3 = B2; \
+ B2 = B4; \
+ } while(0)
+
+#define SBoxD7(B0, B1, B2, B3) /* Serpent decryption S-box 7 (used when undoing rounds 30, 22, 14, 6): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ B0 ^= B2; \
+ SIMD_32 B4 = B2; \
+ B2 &= B0; \
+ B4 ^= B3; \
+ B2 = ~B2; \
+ B3 ^= B1; \
+ B2 ^= B3; \
+ B4 |= B0; \
+ B0 ^= B2; \
+ B3 ^= B4; \
+ B4 ^= B1; \
+ B1 &= B3; \
+ B1 ^= B0; \
+ B0 ^= B3; \
+ B0 |= B2; \
+ B3 ^= B1; \
+ B4 ^= B0; \
+ B0 = B1; \
+ B1 = B2; \
+ B2 = B4; \
+ } while(0)
+
+#define SBoxD8(B0, B1, B2, B3) /* Serpent decryption S-box 8 (used when undoing rounds 31, 23, 15, 7): rewrites B0..B3 in place with bitwise operations only; B4 is a scratch value local to the expansion; expands to one statement, the caller supplies the ';' */ \
+ do { \
+ SIMD_32 B4 = B2; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B4 |= B3; \
+ B2 = ~B2; \
+ B3 ^= B1; \
+ B1 |= B0; \
+ B0 ^= B2; \
+ B2 &= B4; \
+ B3 &= B4; \
+ B1 ^= B2; \
+ B2 ^= B0; \
+ B0 |= B2; \
+ B4 ^= B1; \
+ B0 ^= B3; \
+ B3 ^= B4; \
+ B4 |= B0; \
+ B3 ^= B2; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B0; \
+ B0 = B3; \
+ B3 = B4; \
+ } while(0)
+
+#endif