author     lloyd <[email protected]>  2009-08-12 13:49:19 +0000
committer  lloyd <[email protected]>  2009-08-12 13:49:19 +0000
commit     89eb757b344d3605f3f8012079749f01ef23bb6b (patch)
tree       8b50f13c278029ac2c8797050c6032c4a92760be /src
parent     285d350ed62d2714592a27bf577832a92cb6902f (diff)
Use SSE2 unpack instructions instead of unions for input/output conversion.
About 10% faster than the previous version: currently 112 MiB/s in ECB mode, versus about 40 MiB/s in scalar mode, on my 2.4 GHz Core2
Diffstat (limited to 'src')
-rw-r--r--  src/block/serpent_sse2/serp_sse2.cpp    | 162
-rw-r--r--  src/block/serpent_sse2/serp_sse2_sbox.h | 401
2 files changed, 263 insertions(+), 300 deletions(-)
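
A minimal standalone sketch (not part of the patch) of the unpack-based 4x4 transpose this commit introduces: four XMM registers each hold one 16-byte block, and the transpose rearranges them so register i holds word i of all four blocks. The function name and test values below are illustrative only; compile with -msse2.

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Illustrative restatement of the patch's transpose macro as a function
    static void transpose4x4(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
       {
       __m128i T0 = _mm_unpacklo_epi32(B0, B1); // B0[0] B1[0] B0[1] B1[1]
       __m128i T1 = _mm_unpacklo_epi32(B2, B3); // B2[0] B3[0] B2[1] B3[1]
       __m128i T2 = _mm_unpackhi_epi32(B0, B1); // B0[2] B1[2] B0[3] B1[3]
       __m128i T3 = _mm_unpackhi_epi32(B2, B3); // B2[2] B3[2] B2[3] B3[3]
       B0 = _mm_unpacklo_epi64(T0, T1);         // word 0 of each input row
       B1 = _mm_unpackhi_epi64(T0, T1);         // word 1 of each input row
       B2 = _mm_unpacklo_epi64(T2, T3);         // word 2 of each input row
       B3 = _mm_unpackhi_epi64(T2, T3);         // word 3 of each input row
       }

    int main()
       {
       uint32_t m[16], t[16];
       for(uint32_t i = 0; i != 16; ++i)
          m[i] = i;

       __m128i B0 = _mm_loadu_si128((const __m128i*)(m + 0));
       __m128i B1 = _mm_loadu_si128((const __m128i*)(m + 4));
       __m128i B2 = _mm_loadu_si128((const __m128i*)(m + 8));
       __m128i B3 = _mm_loadu_si128((const __m128i*)(m + 12));

       transpose4x4(B0, B1, B2, B3);

       _mm_storeu_si128((__m128i*)(t + 0),  B0);
       _mm_storeu_si128((__m128i*)(t + 4),  B1);
       _mm_storeu_si128((__m128i*)(t + 8),  B2);
       _mm_storeu_si128((__m128i*)(t + 12), B3);

       // Expect columns of the input: 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15
       for(uint32_t r = 0; r != 4; ++r)
          printf("%u %u %u %u\n", t[4*r], t[4*r+1], t[4*r+2], t[4*r+3]);
       return 0;
       }

The transpose is its own inverse, which is why the patch applies it once after the loads and once more before the stores.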
diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp
index 3e78f0bac..ea937c95a 100644
--- a/src/block/serpent_sse2/serp_sse2.cpp
+++ b/src/block/serpent_sse2/serp_sse2.cpp
@@ -14,88 +14,76 @@ namespace Botan {
namespace {
+#define key_xor(round, B0, B1, B2, B3) \
+ do { \
+ __m128i key = _mm_loadu_si128(keys + round); \
+ B0 = _mm_xor_si128(B0, _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0))); \
+ B1 = _mm_xor_si128(B1, _mm_shuffle_epi32(key, _MM_SHUFFLE(1,1,1,1))); \
+ B2 = _mm_xor_si128(B2, _mm_shuffle_epi32(key, _MM_SHUFFLE(2,2,2,2))); \
+ B3 = _mm_xor_si128(B3, _mm_shuffle_epi32(key, _MM_SHUFFLE(3,3,3,3))); \
+ } while(0);
+
+/*
+* Serpent's linear transformation
+*/
#define rotate_left_m128(vec, rot) \
_mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot))
-#define key_xor(round, b0, b1, b2, b3) \
- do { \
- __m128i key = _mm_loadu_si128(keys + round); \
- b0 = _mm_xor_si128(b0, _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0))); \
- b1 = _mm_xor_si128(b1, _mm_shuffle_epi32(key, _MM_SHUFFLE(1,1,1,1))); \
- b2 = _mm_xor_si128(b2, _mm_shuffle_epi32(key, _MM_SHUFFLE(2,2,2,2))); \
- b3 = _mm_xor_si128(b3, _mm_shuffle_epi32(key, _MM_SHUFFLE(3,3,3,3))); \
- } while(0);
-
-#define transform(b0, b1, b2, b3) \
- do \
- { \
- b0 = rotate_left_m128(b0, 13); \
- b2 = rotate_left_m128(b2, 3); \
- b1 = _mm_xor_si128(b1, _mm_xor_si128(b0, b2)); \
- b3 = _mm_xor_si128(b3, _mm_xor_si128(b2, _mm_slli_epi32(b0, 3))); \
- b1 = rotate_left_m128(b1, 1); \
- b3 = rotate_left_m128(b3, 7); \
- b0 = _mm_xor_si128(b0, _mm_xor_si128(b1, b3)); \
- b2 = _mm_xor_si128(b2, _mm_xor_si128(b3, _mm_slli_epi32(b1, 7))); \
- b0 = rotate_left_m128(b0, 5); \
- b2 = rotate_left_m128(b2, 22); \
- } while(0);
-
-void print_simd(const char* name, __m128i vec)
- {
- union { __m128i v; int32_t ints[4]; } u = { vec };
+#define transform(B0, B1, B2, B3) \
+ do { \
+ B0 = rotate_left_m128(B0, 13); \
+ B2 = rotate_left_m128(B2, 3); \
+ B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \
+ B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \
+ B1 = rotate_left_m128(B1, 1); \
+ B3 = rotate_left_m128(B3, 7); \
+ B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \
+ B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \
+ B0 = rotate_left_m128(B0, 5); \
+ B2 = rotate_left_m128(B2, 22); \
+ } while(0);
- printf("%s: ", name);
- for(u32bit i = 0; i != 4; ++i)
- printf("%08X ", u.ints[i]);
- printf("\n");
- }
+/*
+* 4x4 SSE2 integer matrix transpose
+*/
+#define transpose(B0, B1, B2, B3) \
+ do { \
+ __m128i T0 = _mm_unpacklo_epi32(B0, B1); \
+ __m128i T1 = _mm_unpacklo_epi32(B2, B3); \
+ __m128i T2 = _mm_unpackhi_epi32(B0, B1); \
+ __m128i T3 = _mm_unpackhi_epi32(B2, B3); \
+ B0 = _mm_unpacklo_epi64(T0, T1); \
+ B1 = _mm_unpackhi_epi64(T0, T1); \
+ B2 = _mm_unpacklo_epi64(T2, T3); \
+ B3 = _mm_unpackhi_epi64(T2, T3); \
+ } while(0);
+/*
+* SSE2 Serpent Encryption of 4 blocks in parallel
+*/
void serpent_encrypt_4(const byte in[64],
byte out[64],
const u32bit keys_32[132])
{
const __m128i* keys = (const __m128i*)(keys_32);
-
- /*
- FIXME: figure out a fast way to do this with 4 loads with
- _mm_loadu_si128 plus shuffle/interleave ops
- */
- union { __m128i v; u32bit u32[4]; } convert;
-
- convert.u32[0] = load_le<u32bit>(in, 0);
- convert.u32[1] = load_le<u32bit>(in, 4);
- convert.u32[2] = load_le<u32bit>(in, 8);
- convert.u32[3] = load_le<u32bit>(in, 12);
- __m128i B0 = convert.v;
-
- convert.u32[0] = load_le<u32bit>(in, 1);
- convert.u32[1] = load_le<u32bit>(in, 5);
- convert.u32[2] = load_le<u32bit>(in, 9);
- convert.u32[3] = load_le<u32bit>(in, 13);
- __m128i B1 = convert.v;
-
- convert.u32[0] = load_le<u32bit>(in, 2);
- convert.u32[1] = load_le<u32bit>(in, 6);
- convert.u32[2] = load_le<u32bit>(in, 10);
- convert.u32[3] = load_le<u32bit>(in, 14);
- __m128i B2 = convert.v;
-
- convert.u32[0] = load_le<u32bit>(in, 3);
- convert.u32[1] = load_le<u32bit>(in, 7);
- convert.u32[2] = load_le<u32bit>(in, 11);
- convert.u32[3] = load_le<u32bit>(in, 15);
- __m128i B3 = convert.v;
-
- key_xor(0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
- key_xor(7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
-
+ __m128i* out_mm = (__m128i*)(out);
+ __m128i* in_mm = (__m128i*)(in);
+
+ __m128i B0 = _mm_loadu_si128(in_mm);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ transpose(B0, B1, B2, B3);
+
+ key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -121,30 +109,12 @@ void serpent_encrypt_4(const byte in[64],
key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
- // FIXME: figure out how to do this fast
- union { __m128i v; u32bit u32[4]; } convert_B0;
- union { __m128i v; u32bit u32[4]; } convert_B1;
- union { __m128i v; u32bit u32[4]; } convert_B2;
- union { __m128i v; u32bit u32[4]; } convert_B3;
- convert_B0.v = B0;
- convert_B1.v = B1;
- convert_B2.v = B2;
- convert_B3.v = B3;
- store_le(out,
- convert_B0.u32[0], convert_B1.u32[0],
- convert_B2.u32[0], convert_B3.u32[0]);
-
- store_le(out + 16,
- convert_B0.u32[1], convert_B1.u32[1],
- convert_B2.u32[1], convert_B3.u32[1]);
-
- store_le(out + 32,
- convert_B0.u32[2], convert_B1.u32[2],
- convert_B2.u32[2], convert_B3.u32[2]);
-
- store_le(out + 48,
- convert_B0.u32[3], convert_B1.u32[3],
- convert_B2.u32[3], convert_B3.u32[3]);
+ transpose(B0, B1, B2, B3);
+
+ _mm_storeu_si128(out_mm , B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
}
}
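
An aside on the key_xor macro added above: each round consumes four 32-bit subkey words, and the macro broadcasts each word across a full XMM register with _mm_shuffle_epi32 so the same subkey is XORed into all four blocks at once. A minimal sketch of just the broadcast, with hypothetical key values not taken from the patch:

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main()
       {
       // Hypothetical round key words, for illustration only
       const uint32_t rk[4] = { 0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD };
       __m128i key = _mm_loadu_si128((const __m128i*)rk);

       // _MM_SHUFFLE(1,1,1,1) selects word 1 of the source into every lane
       __m128i k1 = _mm_shuffle_epi32(key, _MM_SHUFFLE(1,1,1,1));

       uint32_t out[4];
       _mm_storeu_si128((__m128i*)out, k1);
       printf("%08X %08X %08X %08X\n", out[0], out[1], out[2], out[3]);
       // prints BBBBBBBB BBBBBBBB BBBBBBBB BBBBBBBB
       return 0;
       }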
diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h
index 2c4d9d9cb..bc8678a89 100644
--- a/src/block/serpent_sse2/serp_sse2_sbox.h
+++ b/src/block/serpent_sse2/serp_sse2_sbox.h
@@ -8,218 +8,211 @@
#ifndef SERPENT_SSE2_SBOXES_H__
#define SERPENT_SSE2_SBOXES_H__
-#define SBoxE1(b0, b1, b2, b3) \
+#define SBoxE1(B0, B1, B2, B3) \
do { \
- b3 = _mm_xor_si128(b3, b0); \
- __m128i b4 = b1; \
- b1 = _mm_and_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b2); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = _mm_or_si128(b0, b3); \
- b0 = _mm_xor_si128(b0, b4); \
- b4 = _mm_xor_si128(b4, b3); \
- b3 = _mm_xor_si128(b3, b2); \
- b2 = _mm_or_si128(b2, b1); \
- b2 = _mm_xor_si128(b2, b4); \
- b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); \
- b4 = _mm_or_si128(b4, b1); \
- b1 = _mm_xor_si128(b1, b3); \
- b1 = _mm_xor_si128(b1, b4); \
- b3 = _mm_or_si128(b3, b0); \
- b1 = _mm_xor_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b3); \
- b3 = b0; \
- b0 = b1; \
- b1 = b4; \
+ B3 = _mm_xor_si128(B3, B0); \
+ __m128i B4 = B1; \
+ B1 = _mm_and_si128(B1, B3); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B1 = _mm_xor_si128(B1, B0); \
+ B0 = _mm_or_si128(B0, B3); \
+ B0 = _mm_xor_si128(B0, B4); \
+ B4 = _mm_xor_si128(B4, B3); \
+ B3 = _mm_xor_si128(B3, B2); \
+ B2 = _mm_or_si128(B2, B1); \
+ B2 = _mm_xor_si128(B2, B4); \
+ B4 = _mm_andnot_si128(B4, _mm_set1_epi8(0xFF)); \
+ B4 = _mm_or_si128(B4, B1); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B1 = _mm_xor_si128(B1, B4); \
+ B3 = _mm_or_si128(B3, B0); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B4 = _mm_xor_si128(B4, B3); \
+ B3 = B0; \
+ B0 = B1; \
+ B1 = B4; \
} while(0);
-#define SBoxE2(b0, b1, b2, b3) \
- do \
- { \
- b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); \
- b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \
- __m128i b4 = b0; \
- b0 = _mm_and_si128(b0, b1); \
- b2 = _mm_xor_si128(b2, b0); \
- b0 = _mm_or_si128(b0, b3); \
- b3 = _mm_xor_si128(b3, b2); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = _mm_xor_si128(b0, b4); \
- b4 = _mm_or_si128(b4, b1); \
- b1 = _mm_xor_si128(b1, b3); \
- b2 = _mm_or_si128(b2, b0); \
- b2 = _mm_and_si128(b2, b4); \
- b0 = _mm_xor_si128(b0, b1); \
- b1 = _mm_and_si128(b1, b2); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = _mm_and_si128(b0, b2); \
- b4 = _mm_xor_si128(b4, b0); \
- b0 = b2; \
- b2 = b3; \
- b3 = b1; \
- b1 = b4; \
- } while(0);
+#define SBoxE2(B0, B1, B2, B3) \
+ do { \
+ B0 = _mm_andnot_si128(B0, _mm_set1_epi8(0xFF)); \
+ B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
+ __m128i B4 = B0; \
+ B0 = _mm_and_si128(B0, B1); \
+ B2 = _mm_xor_si128(B2, B0); \
+ B0 = _mm_or_si128(B0, B3); \
+ B3 = _mm_xor_si128(B3, B2); \
+ B1 = _mm_xor_si128(B1, B0); \
+ B0 = _mm_xor_si128(B0, B4); \
+ B4 = _mm_or_si128(B4, B1); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B2 = _mm_or_si128(B2, B0); \
+ B2 = _mm_and_si128(B2, B4); \
+ B0 = _mm_xor_si128(B0, B1); \
+ B1 = _mm_and_si128(B1, B2); \
+ B1 = _mm_xor_si128(B1, B0); \
+ B0 = _mm_and_si128(B0, B2); \
+ B4 = _mm_xor_si128(B4, B0); \
+ B0 = B2; \
+ B2 = B3; \
+ B3 = B1; \
+ B1 = B4; \
+ } while(0);
-#define SBoxE3(b0, b1, b2, b3) \
- do \
- { \
- __m128i b4 = b0; \
- b0 = _mm_and_si128(b0, b2); \
- b0 = _mm_xor_si128(b0, b3); \
- b2 = _mm_xor_si128(b2, b1); \
- b2 = _mm_xor_si128(b2, b0); \
- b3 = _mm_or_si128(b3, b4); \
- b3 = _mm_xor_si128(b3, b1); \
- b4 = _mm_xor_si128(b4, b2); \
- b1 = b3; \
- b3 = _mm_or_si128(b3, b4); \
- b3 = _mm_xor_si128(b3, b0); \
- b0 = _mm_and_si128(b0, b1); \
- b4 = _mm_xor_si128(b4, b0); \
- b1 = _mm_xor_si128(b1, b3); \
- b1 = _mm_xor_si128(b1, b4); \
- b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); \
- b0 = b2; \
- b2 = b1; \
- b1 = b3; \
- b3 = b4; \
- } while(0);
+#define SBoxE3(B0, B1, B2, B3) \
+ do { \
+ __m128i B4 = B0; \
+ B0 = _mm_and_si128(B0, B2); \
+ B0 = _mm_xor_si128(B0, B3); \
+ B2 = _mm_xor_si128(B2, B1); \
+ B2 = _mm_xor_si128(B2, B0); \
+ B3 = _mm_or_si128(B3, B4); \
+ B3 = _mm_xor_si128(B3, B1); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B1 = B3; \
+ B3 = _mm_or_si128(B3, B4); \
+ B3 = _mm_xor_si128(B3, B0); \
+ B0 = _mm_and_si128(B0, B1); \
+ B4 = _mm_xor_si128(B4, B0); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B1 = _mm_xor_si128(B1, B4); \
+ B4 = _mm_andnot_si128(B4, _mm_set1_epi8(0xFF)); \
+ B0 = B2; \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0);
-#define SBoxE4(b0, b1, b2, b3) \
- do \
- { \
- __m128i b4 = b0; \
- b0 = _mm_or_si128(b0, b3); \
- b3 = _mm_xor_si128(b3, b1); \
- b1 = _mm_and_si128(b1, b4); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = _mm_xor_si128(b2, b3); \
- b3 = _mm_and_si128(b3, b0); \
- b4 = _mm_or_si128(b4, b1); \
- b3 = _mm_xor_si128(b3, b4); \
- b0 = _mm_xor_si128(b0, b1); \
- b4 = _mm_and_si128(b4, b0); \
- b1 = _mm_xor_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b2); \
- b1 = _mm_or_si128(b1, b0); \
- b1 = _mm_xor_si128(b1, b2); \
- b0 = _mm_xor_si128(b0, b3); \
- b2 = b1; \
- b1 = _mm_or_si128(b1, b3); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = b1; \
- b1 = b2; \
- b2 = b3; \
- b3 = b4; \
- } while(0);
+#define SBoxE4(B0, B1, B2, B3) \
+ do { \
+ __m128i B4 = B0; \
+ B0 = _mm_or_si128(B0, B3); \
+ B3 = _mm_xor_si128(B3, B1); \
+ B1 = _mm_and_si128(B1, B4); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = _mm_xor_si128(B2, B3); \
+ B3 = _mm_and_si128(B3, B0); \
+ B4 = _mm_or_si128(B4, B1); \
+ B3 = _mm_xor_si128(B3, B4); \
+ B0 = _mm_xor_si128(B0, B1); \
+ B4 = _mm_and_si128(B4, B0); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B1 = _mm_or_si128(B1, B0); \
+ B1 = _mm_xor_si128(B1, B2); \
+ B0 = _mm_xor_si128(B0, B3); \
+ B2 = B1; \
+ B1 = _mm_or_si128(B1, B3); \
+ B1 = _mm_xor_si128(B1, B0); \
+ B0 = B1; \
+ B1 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0);
-#define SBoxE5(b0, b1, b2, b3) \
- do \
- { \
- b1 = _mm_xor_si128(b1, b3); \
- b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \
- b2 = _mm_xor_si128(b2, b3); \
- b3 = _mm_xor_si128(b3, b0); \
- __m128i b4 = b1; \
- b1 = _mm_and_si128(b1, b3); \
- b1 = _mm_xor_si128(b1, b2); \
- b4 = _mm_xor_si128(b4, b3); \
- b0 = _mm_xor_si128(b0, b4); \
- b2 = _mm_and_si128(b2, b4); \
- b2 = _mm_xor_si128(b2, b0); \
- b0 = _mm_and_si128(b0, b1); \
- b3 = _mm_xor_si128(b3, b0); \
- b4 = _mm_or_si128(b4, b1); \
- b4 = _mm_xor_si128(b4, b0); \
- b0 = _mm_or_si128(b0, b3); \
- b0 = _mm_xor_si128(b0, b2); \
- b2 = _mm_and_si128(b2, b3); \
- b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = b0; \
- b0 = b1; \
- b1 = b4; \
- } while(0);
+#define SBoxE5(B0, B1, B2, B3) \
+ do { \
+ B1 = _mm_xor_si128(B1, B3); \
+ B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
+ B2 = _mm_xor_si128(B2, B3); \
+ B3 = _mm_xor_si128(B3, B0); \
+ __m128i B4 = B1; \
+ B1 = _mm_and_si128(B1, B3); \
+ B1 = _mm_xor_si128(B1, B2); \
+ B4 = _mm_xor_si128(B4, B3); \
+ B0 = _mm_xor_si128(B0, B4); \
+ B2 = _mm_and_si128(B2, B4); \
+ B2 = _mm_xor_si128(B2, B0); \
+ B0 = _mm_and_si128(B0, B1); \
+ B3 = _mm_xor_si128(B3, B0); \
+ B4 = _mm_or_si128(B4, B1); \
+ B4 = _mm_xor_si128(B4, B0); \
+ B0 = _mm_or_si128(B0, B3); \
+ B0 = _mm_xor_si128(B0, B2); \
+ B2 = _mm_and_si128(B2, B3); \
+ B0 = _mm_andnot_si128(B0, _mm_set1_epi8(0xFF)); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0);
-#define SBoxE6(b0, b1, b2, b3) \
- do \
- { \
- b0 = _mm_xor_si128(b0, b1); \
- b1 = _mm_xor_si128(b1, b3); \
- b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \
- __m128i b4 = b1; \
- b1 = _mm_and_si128(b1, b0); \
- b2 = _mm_xor_si128(b2, b3); \
- b1 = _mm_xor_si128(b1, b2); \
- b2 = _mm_or_si128(b2, b4); \
- b4 = _mm_xor_si128(b4, b3); \
- b3 = _mm_and_si128(b3, b1); \
- b3 = _mm_xor_si128(b3, b0); \
- b4 = _mm_xor_si128(b4, b1); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = _mm_xor_si128(b2, b0); \
- b0 = _mm_and_si128(b0, b3); \
- b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \
- b0 = _mm_xor_si128(b0, b4); \
- b4 = _mm_or_si128(b4, b3); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = b0; \
- b0 = b1; \
- b1 = b3; \
- b3 = b4; \
- } while(0);
+#define SBoxE6(B0, B1, B2, B3) \
+ do { \
+ B0 = _mm_xor_si128(B0, B1); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
+ __m128i B4 = B1; \
+ B1 = _mm_and_si128(B1, B0); \
+ B2 = _mm_xor_si128(B2, B3); \
+ B1 = _mm_xor_si128(B1, B2); \
+ B2 = _mm_or_si128(B2, B4); \
+ B4 = _mm_xor_si128(B4, B3); \
+ B3 = _mm_and_si128(B3, B1); \
+ B3 = _mm_xor_si128(B3, B0); \
+ B4 = _mm_xor_si128(B4, B1); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = _mm_xor_si128(B2, B0); \
+ B0 = _mm_and_si128(B0, B3); \
+ B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
+ B0 = _mm_xor_si128(B0, B4); \
+ B4 = _mm_or_si128(B4, B3); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0);
-#define SBoxE7(b0, b1, b2, b3) \
- do \
- { \
- b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \
- __m128i b4 = b3; \
- b3 = _mm_and_si128(b3, b0); \
- b0 = _mm_xor_si128(b0, b4); \
- b3 = _mm_xor_si128(b3, b2); \
- b2 = _mm_or_si128(b2, b4); \
- b1 = _mm_xor_si128(b1, b3); \
- b2 = _mm_xor_si128(b2, b0); \
- b0 = _mm_or_si128(b0, b1); \
- b2 = _mm_xor_si128(b2, b1); \
- b4 = _mm_xor_si128(b4, b0); \
- b0 = _mm_or_si128(b0, b3); \
- b0 = _mm_xor_si128(b0, b2); \
- b4 = _mm_xor_si128(b4, b3); \
- b4 = _mm_xor_si128(b4, b0); \
- b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \
- b2 = _mm_and_si128(b2, b4); \
- b3 = _mm_xor_si128(b3, b2); \
- b2 = b4; \
- } while(0);
+#define SBoxE7(B0, B1, B2, B3) \
+ do { \
+ B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
+ __m128i B4 = B3; \
+ B3 = _mm_and_si128(B3, B0); \
+ B0 = _mm_xor_si128(B0, B4); \
+ B3 = _mm_xor_si128(B3, B2); \
+ B2 = _mm_or_si128(B2, B4); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B2 = _mm_xor_si128(B2, B0); \
+ B0 = _mm_or_si128(B0, B1); \
+ B2 = _mm_xor_si128(B2, B1); \
+ B4 = _mm_xor_si128(B4, B0); \
+ B0 = _mm_or_si128(B0, B3); \
+ B0 = _mm_xor_si128(B0, B2); \
+ B4 = _mm_xor_si128(B4, B3); \
+ B4 = _mm_xor_si128(B4, B0); \
+ B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
+ B2 = _mm_and_si128(B2, B4); \
+ B3 = _mm_xor_si128(B3, B2); \
+ B2 = B4; \
+ } while(0);
-#define SBoxE8(b0, b1, b2, b3) \
- do \
- { \
- __m128i b4 = b1; \
- b1 = _mm_or_si128(b1, b2); \
- b1 = _mm_xor_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = _mm_xor_si128(b2, b1); \
- b3 = _mm_or_si128(b3, b4); \
- b3 = _mm_and_si128(b3, b0); \
- b4 = _mm_xor_si128(b4, b2); \
- b3 = _mm_xor_si128(b3, b1); \
- b1 = _mm_or_si128(b1, b4); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = _mm_or_si128(b0, b4); \
- b0 = _mm_xor_si128(b0, b2); \
- b1 = _mm_xor_si128(b1, b4); \
- b2 = _mm_xor_si128(b2, b1); \
- b1 = _mm_and_si128(b1, b0); \
- b1 = _mm_xor_si128(b1, b4); \
- b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \
- b2 = _mm_or_si128(b2, b0); \
- b4 = _mm_xor_si128(b4, b2); \
- b2 = b1; \
- b1 = b3; \
- b3 = b0; \
- b0 = b4; \
- } while(0);
+#define SBoxE8(B0, B1, B2, B3) \
+ do { \
+ __m128i B4 = B1; \
+ B1 = _mm_or_si128(B1, B2); \
+ B1 = _mm_xor_si128(B1, B3); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = _mm_xor_si128(B2, B1); \
+ B3 = _mm_or_si128(B3, B4); \
+ B3 = _mm_and_si128(B3, B0); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B3 = _mm_xor_si128(B3, B1); \
+ B1 = _mm_or_si128(B1, B4); \
+ B1 = _mm_xor_si128(B1, B0); \
+ B0 = _mm_or_si128(B0, B4); \
+ B0 = _mm_xor_si128(B0, B2); \
+ B1 = _mm_xor_si128(B1, B4); \
+ B2 = _mm_xor_si128(B2, B1); \
+ B1 = _mm_and_si128(B1, B0); \
+ B1 = _mm_xor_si128(B1, B4); \
+ B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
+ B2 = _mm_or_si128(B2, B0); \
+ B4 = _mm_xor_si128(B4, B2); \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = B0; \
+ B0 = B4; \
+ } while(0);
#endif
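
One idiom worth noting in the S-box macros above: SSE2 has no bitwise-NOT instruction, so the code negates a register with _mm_andnot_si128 against an all-ones mask, since _mm_andnot_si128(a, b) computes (~a) & b. A standalone sketch of just that idiom:

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main()
       {
       __m128i x = _mm_set1_epi32(0x0F0F0F0F);

       // (~x) & 0xFF...FF == ~x; the NOT idiom used throughout the S-boxes
       __m128i not_x = _mm_andnot_si128(x, _mm_set1_epi8(0xFF));

       uint32_t out[4];
       _mm_storeu_si128((__m128i*)out, not_x);
       printf("%08X\n", out[0]); // prints F0F0F0F0
       return 0;
       }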