diff options
author | lloyd <[email protected]> | 2009-10-28 19:50:22 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2009-10-28 19:50:22 +0000 |
commit | b076d903d39f4629ba2593903eb63a72e89f9e03 (patch) | |
tree | c3dac5b32283b8d550d34b108c3ace26e4144133 /src/block/serpent_sse2 | |
parent | 9462f875b13a321f42a127166d49670ca04afcde (diff) | |
parent | 185d85338562627aa4800436a3fe6efa11886351 (diff) |
propagate from branch 'net.randombit.botan' (head bf629b13dd132b263e76a72b7eca0f7e4ab19aac)
to branch 'net.randombit.botan.general-simd' (head f731cff08ff0d04c062742c0c6cfcc18856400ea)
Diffstat (limited to 'src/block/serpent_sse2')
-rw-r--r-- | src/block/serpent_sse2/serp_sse2.cpp | 142 | ||||
-rw-r--r-- | src/block/serpent_sse2/serp_sse2_sbox.h | 622 |
2 files changed, 361 insertions, 403 deletions
diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp index c51bb69ab..be79e870d 100644 --- a/src/block/serpent_sse2/serp_sse2.cpp +++ b/src/block/serpent_sse2/serp_sse2.cpp @@ -1,5 +1,5 @@ /* -* Serpent (SSE2) +* Serpent (SIMD) * (C) 2009 Jack Lloyd * * Distributed under the terms of the Botan license @@ -7,72 +7,50 @@ #include <botan/serp_sse2.h> #include <botan/serp_sse2_sbox.h> +#include <botan/simd_32.h> #include <botan/loadstor.h> -#include <emmintrin.h> namespace Botan { namespace { -#define key_xor(round, B0, B1, B2, B3) \ - do { \ - __m128i key = _mm_loadu_si128(keys + round); \ - B0 = _mm_xor_si128(B0, _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0))); \ - B1 = _mm_xor_si128(B1, _mm_shuffle_epi32(key, _MM_SHUFFLE(1,1,1,1))); \ - B2 = _mm_xor_si128(B2, _mm_shuffle_epi32(key, _MM_SHUFFLE(2,2,2,2))); \ - B3 = _mm_xor_si128(B3, _mm_shuffle_epi32(key, _MM_SHUFFLE(3,3,3,3))); \ +#define key_xor(round, B0, B1, B2, B3) \ + do { \ + B0 ^= SIMD_32(keys[4*round ]); \ + B1 ^= SIMD_32(keys[4*round+1]); \ + B2 ^= SIMD_32(keys[4*round+2]); \ + B3 ^= SIMD_32(keys[4*round+3]); \ } while(0); /* * Serpent's linear transformations */ -#define rotate_left_m128(vec, rot) \ - _mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot)) - -#define rotate_right_m128(vec, rot) \ - _mm_or_si128(_mm_srli_epi32(vec, rot), _mm_slli_epi32(vec, 32-rot)) - -#define transform(B0, B1, B2, B3) \ - do { \ - B0 = rotate_left_m128(B0, 13); \ - B2 = rotate_left_m128(B2, 3); \ - B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \ - B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \ - B1 = rotate_left_m128(B1, 1); \ - B3 = rotate_left_m128(B3, 7); \ - B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \ - B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \ - B0 = rotate_left_m128(B0, 5); \ - B2 = rotate_left_m128(B2, 22); \ +#define transform(B0, B1, B2, B3) \ + do { \ + B0.rotate_left(13); \ + B2.rotate_left(3); \ + B1 ^= B0 ^ B2; \ + B3 ^= B2 ^ (B0 << 3); \ + B1.rotate_left(1); \ + B3.rotate_left(7); \ + B0 ^= B1 ^ B3; \ + B2 ^= B3 ^ (B1 << 7); \ + B0.rotate_left(5); \ + B2.rotate_left(22); \ } while(0); -#define i_transform(B0, B1, B2, B3) \ - do { \ - B2 = rotate_right_m128(B2, 22); \ - B0 = rotate_right_m128(B0, 5); \ - B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \ - B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \ - B3 = rotate_right_m128(B3, 7); \ - B1 = rotate_right_m128(B1, 1); \ - B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \ - B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \ - B2 = rotate_right_m128(B2, 3); \ - B0 = rotate_right_m128(B0, 13); \ - } while(0); - -/* -* 4x4 SSE2 integer matrix transpose -*/ -#define transpose(B0, B1, B2, B3) \ - do { \ - __m128i T0 = _mm_unpacklo_epi32(B0, B1); \ - __m128i T1 = _mm_unpacklo_epi32(B2, B3); \ - __m128i T2 = _mm_unpackhi_epi32(B0, B1); \ - __m128i T3 = _mm_unpackhi_epi32(B2, B3); \ - B0 = _mm_unpacklo_epi64(T0, T1); \ - B1 = _mm_unpackhi_epi64(T0, T1); \ - B2 = _mm_unpacklo_epi64(T2, T3); \ - B3 = _mm_unpackhi_epi64(T2, T3); \ +#define i_transform(B0, B1, B2, B3) \ + do { \ + B2.rotate_right(22); \ + B0.rotate_right(5); \ + B2 ^= B3 ^ (B1 << 7); \ + B0 ^= B1 ^ B3; \ + B3.rotate_right(7); \ + B1.rotate_right(1); \ + B3 ^= B2 ^ (B0 << 3); \ + B1 ^= B0 ^ B2; \ + B2.rotate_right(3); \ + B0.rotate_right(13); \ } while(0); /* @@ -80,20 +58,14 @@ namespace { */ void serpent_encrypt_4(const byte in[64], byte out[64], - const u32bit keys_32[132]) + const u32bit keys[132]) { - const __m128i all_ones = _mm_set1_epi8(0xFF); + SIMD_32 B0 = SIMD_32::load_le(in); + SIMD_32 B1 = SIMD_32::load_le(in + 16); + SIMD_32 B2 = SIMD_32::load_le(in + 32); + SIMD_32 B3 = SIMD_32::load_le(in + 48); - const __m128i* keys = (const __m128i*)(keys_32); - __m128i* out_mm = (__m128i*)(out); - __m128i* in_mm = (__m128i*)(in); - - __m128i B0 = _mm_loadu_si128(in_mm); - __m128i B1 = _mm_loadu_si128(in_mm + 1); - __m128i B2 = _mm_loadu_si128(in_mm + 2); - __m128i B3 = _mm_loadu_si128(in_mm + 3); - - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); @@ -131,12 +103,12 @@ void serpent_encrypt_4(const byte in[64], key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); - _mm_storeu_si128(out_mm , B0); - _mm_storeu_si128(out_mm + 1, B1); - _mm_storeu_si128(out_mm + 2, B2); - _mm_storeu_si128(out_mm + 3, B3); + B0.store_le(out); + B1.store_le(out + 16); + B2.store_le(out + 32); + B3.store_le(out + 48); } /* @@ -144,20 +116,14 @@ void serpent_encrypt_4(const byte in[64], */ void serpent_decrypt_4(const byte in[64], byte out[64], - const u32bit keys_32[132]) + const u32bit keys[132]) { - const __m128i all_ones = _mm_set1_epi8(0xFF); - - const __m128i* keys = (const __m128i*)(keys_32); - __m128i* out_mm = (__m128i*)(out); - __m128i* in_mm = (__m128i*)(in); - - __m128i B0 = _mm_loadu_si128(in_mm); - __m128i B1 = _mm_loadu_si128(in_mm + 1); - __m128i B2 = _mm_loadu_si128(in_mm + 2); - __m128i B3 = _mm_loadu_si128(in_mm + 3); + SIMD_32 B0 = SIMD_32::load_le(in); + SIMD_32 B1 = SIMD_32::load_le(in + 16); + SIMD_32 B2 = SIMD_32::load_le(in + 32); + SIMD_32 B3 = SIMD_32::load_le(in + 48); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); @@ -195,12 +161,12 @@ void serpent_decrypt_4(const byte in[64], i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); - _mm_storeu_si128(out_mm , B0); - _mm_storeu_si128(out_mm + 1, B1); - _mm_storeu_si128(out_mm + 2, B2); - _mm_storeu_si128(out_mm + 3, B3); + B0.store_le(out); + B1.store_le(out + 16); + B2.store_le(out + 32); + B3.store_le(out + 48); } } diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h index 40c552e87..6e3da7359 100644 --- a/src/block/serpent_sse2/serp_sse2_sbox.h +++ b/src/block/serpent_sse2/serp_sse2_sbox.h @@ -1,33 +1,33 @@ /* -* Serpent Sboxes in SSE2 form +* Serpent Sboxes in SIMD form * (C) 2009 Jack Lloyd * * Distributed under the terms of the Botan license */ -#ifndef SERPENT_SSE2_SBOXES_H__ -#define SERPENT_SSE2_SBOXES_H__ +#ifndef SERPENT_SIMD_SBOXES_H__ +#define SERPENT_SIMD_SBOXES_H__ #define SBoxE1(B0, B1, B2, B3) \ do { \ - B3 = _mm_xor_si128(B3, B0); \ - __m128i B4 = B1; \ - B1 = _mm_and_si128(B1, B3); \ - B4 = _mm_xor_si128(B4, B2); \ - B1 = _mm_xor_si128(B1, B0); \ - B0 = _mm_or_si128(B0, B3); \ - B0 = _mm_xor_si128(B0, B4); \ - B4 = _mm_xor_si128(B4, B3); \ - B3 = _mm_xor_si128(B3, B2); \ - B2 = _mm_or_si128(B2, B1); \ - B2 = _mm_xor_si128(B2, B4); \ - B4 = _mm_xor_si128(B4, all_ones); \ - B4 = _mm_or_si128(B4, B1); \ - B1 = _mm_xor_si128(B1, B3); \ - B1 = _mm_xor_si128(B1, B4); \ - B3 = _mm_or_si128(B3, B0); \ - B1 = _mm_xor_si128(B1, B3); \ - B4 = _mm_xor_si128(B4, B3); \ + B3 ^= B0; \ + SIMD_32 B4 = B1; \ + B1 &= B3; \ + B4 ^= B2; \ + B1 ^= B0; \ + B0 |= B3; \ + B0 ^= B4; \ + B4 ^= B3; \ + B3 ^= B2; \ + B2 |= B1; \ + B2 ^= B4; \ + B4 = ~B4; \ + B4 |= B1; \ + B1 ^= B3; \ + B1 ^= B4; \ + B3 |= B0; \ + B1 ^= B3; \ + B4 ^= B3; \ B3 = B0; \ B0 = B1; \ B1 = B4; \ @@ -35,24 +35,24 @@ #define SBoxE2(B0, B1, B2, B3) \ do { \ - B0 = _mm_xor_si128(B0, all_ones); \ - B2 = _mm_xor_si128(B2, all_ones); \ - __m128i B4 = B0; \ - B0 = _mm_and_si128(B0, B1); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_or_si128(B0, B3); \ - B3 = _mm_xor_si128(B3, B2); \ - B1 = _mm_xor_si128(B1, B0); \ - B0 = _mm_xor_si128(B0, B4); \ - B4 = _mm_or_si128(B4, B1); \ - B1 = _mm_xor_si128(B1, B3); \ - B2 = _mm_or_si128(B2, B0); \ - B2 = _mm_and_si128(B2, B4); \ - B0 = _mm_xor_si128(B0, B1); \ - B1 = _mm_and_si128(B1, B2); \ - B1 = _mm_xor_si128(B1, B0); \ - B0 = _mm_and_si128(B0, B2); \ - B4 = _mm_xor_si128(B4, B0); \ + B0 = ~B0; \ + B2 = ~B2; \ + SIMD_32 B4 = B0; \ + B0 &= B1; \ + B2 ^= B0; \ + B0 |= B3; \ + B3 ^= B2; \ + B1 ^= B0; \ + B0 ^= B4; \ + B4 |= B1; \ + B1 ^= B3; \ + B2 |= B0; \ + B2 &= B4; \ + B0 ^= B1; \ + B1 &= B2; \ + B1 ^= B0; \ + B0 &= B2; \ + B4 ^= B0; \ B0 = B2; \ B2 = B3; \ B3 = B1; \ @@ -61,22 +61,22 @@ #define SBoxE3(B0, B1, B2, B3) \ do { \ - __m128i B4 = B0; \ - B0 = _mm_and_si128(B0, B2); \ - B0 = _mm_xor_si128(B0, B3); \ - B2 = _mm_xor_si128(B2, B1); \ - B2 = _mm_xor_si128(B2, B0); \ - B3 = _mm_or_si128(B3, B4); \ - B3 = _mm_xor_si128(B3, B1); \ - B4 = _mm_xor_si128(B4, B2); \ + SIMD_32 B4 = B0; \ + B0 &= B2; \ + B0 ^= B3; \ + B2 ^= B1; \ + B2 ^= B0; \ + B3 |= B4; \ + B3 ^= B1; \ + B4 ^= B2; \ B1 = B3; \ - B3 = _mm_or_si128(B3, B4); \ - B3 = _mm_xor_si128(B3, B0); \ - B0 = _mm_and_si128(B0, B1); \ - B4 = _mm_xor_si128(B4, B0); \ - B1 = _mm_xor_si128(B1, B3); \ - B1 = _mm_xor_si128(B1, B4); \ - B4 = _mm_xor_si128(B4, all_ones); \ + B3 |= B4; \ + B3 ^= B0; \ + B0 &= B1; \ + B4 ^= B0; \ + B1 ^= B3; \ + B1 ^= B4; \ + B4 = ~B4; \ B0 = B2; \ B2 = B1; \ B1 = B3; \ @@ -85,25 +85,25 @@ #define SBoxE4(B0, B1, B2, B3) \ do { \ - __m128i B4 = B0; \ - B0 = _mm_or_si128(B0, B3); \ - B3 = _mm_xor_si128(B3, B1); \ - B1 = _mm_and_si128(B1, B4); \ - B4 = _mm_xor_si128(B4, B2); \ - B2 = _mm_xor_si128(B2, B3); \ - B3 = _mm_and_si128(B3, B0); \ - B4 = _mm_or_si128(B4, B1); \ - B3 = _mm_xor_si128(B3, B4); \ - B0 = _mm_xor_si128(B0, B1); \ - B4 = _mm_and_si128(B4, B0); \ - B1 = _mm_xor_si128(B1, B3); \ - B4 = _mm_xor_si128(B4, B2); \ - B1 = _mm_or_si128(B1, B0); \ - B1 = _mm_xor_si128(B1, B2); \ - B0 = _mm_xor_si128(B0, B3); \ + SIMD_32 B4 = B0; \ + B0 |= B3; \ + B3 ^= B1; \ + B1 &= B4; \ + B4 ^= B2; \ + B2 ^= B3; \ + B3 &= B0; \ + B4 |= B1; \ + B3 ^= B4; \ + B0 ^= B1; \ + B4 &= B0; \ + B1 ^= B3; \ + B4 ^= B2; \ + B1 |= B0; \ + B1 ^= B2; \ + B0 ^= B3; \ B2 = B1; \ - B1 = _mm_or_si128(B1, B3); \ - B0 = _mm_xor_si128(B0, B1); \ + B1 |= B3; \ + B0 ^= B1; \ B1 = B2; \ B2 = B3; \ B3 = B4; \ @@ -111,26 +111,26 @@ #define SBoxE5(B0, B1, B2, B3) \ do { \ - B1 = _mm_xor_si128(B1, B3); \ - B3 = _mm_xor_si128(B3, all_ones); \ - B2 = _mm_xor_si128(B2, B3); \ - B3 = _mm_xor_si128(B3, B0); \ - __m128i B4 = B1; \ - B1 = _mm_and_si128(B1, B3); \ - B1 = _mm_xor_si128(B1, B2); \ - B4 = _mm_xor_si128(B4, B3); \ - B0 = _mm_xor_si128(B0, B4); \ - B2 = _mm_and_si128(B2, B4); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_and_si128(B0, B1); \ - B3 = _mm_xor_si128(B3, B0); \ - B4 = _mm_or_si128(B4, B1); \ - B4 = _mm_xor_si128(B4, B0); \ - B0 = _mm_or_si128(B0, B3); \ - B0 = _mm_xor_si128(B0, B2); \ - B2 = _mm_and_si128(B2, B3); \ - B0 = _mm_xor_si128(B0, all_ones); \ - B4 = _mm_xor_si128(B4, B2); \ + B1 ^= B3; \ + B3 = ~B3; \ + B2 ^= B3; \ + B3 ^= B0; \ + SIMD_32 B4 = B1; \ + B1 &= B3; \ + B1 ^= B2; \ + B4 ^= B3; \ + B0 ^= B4; \ + B2 &= B4; \ + B2 ^= B0; \ + B0 &= B1; \ + B3 ^= B0; \ + B4 |= B1; \ + B4 ^= B0; \ + B0 |= B3; \ + B0 ^= B2; \ + B2 &= B3; \ + B0 = ~B0; \ + B4 ^= B2; \ B2 = B0; \ B0 = B1; \ B1 = B4; \ @@ -138,25 +138,25 @@ #define SBoxE6(B0, B1, B2, B3) \ do { \ - B0 = _mm_xor_si128(B0, B1); \ - B1 = _mm_xor_si128(B1, B3); \ - B3 = _mm_xor_si128(B3, all_ones); \ - __m128i B4 = B1; \ - B1 = _mm_and_si128(B1, B0); \ - B2 = _mm_xor_si128(B2, B3); \ - B1 = _mm_xor_si128(B1, B2); \ - B2 = _mm_or_si128(B2, B4); \ - B4 = _mm_xor_si128(B4, B3); \ - B3 = _mm_and_si128(B3, B1); \ - B3 = _mm_xor_si128(B3, B0); \ - B4 = _mm_xor_si128(B4, B1); \ - B4 = _mm_xor_si128(B4, B2); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_and_si128(B0, B3); \ - B2 = _mm_xor_si128(B2, all_ones); \ - B0 = _mm_xor_si128(B0, B4); \ - B4 = _mm_or_si128(B4, B3); \ - B4 = _mm_xor_si128(B4, B2); \ + B0 ^= B1; \ + B1 ^= B3; \ + B3 = ~B3; \ + SIMD_32 B4 = B1; \ + B1 &= B0; \ + B2 ^= B3; \ + B1 ^= B2; \ + B2 |= B4; \ + B4 ^= B3; \ + B3 &= B1; \ + B3 ^= B0; \ + B4 ^= B1; \ + B4 ^= B2; \ + B2 ^= B0; \ + B0 &= B3; \ + B2 = ~B2; \ + B0 ^= B4; \ + B4 |= B3; \ + B4 ^= B2; \ B2 = B0; \ B0 = B1; \ B1 = B3; \ @@ -165,49 +165,49 @@ #define SBoxE7(B0, B1, B2, B3) \ do { \ - B2 = _mm_xor_si128(B2, all_ones); \ - __m128i B4 = B3; \ - B3 = _mm_and_si128(B3, B0); \ - B0 = _mm_xor_si128(B0, B4); \ - B3 = _mm_xor_si128(B3, B2); \ - B2 = _mm_or_si128(B2, B4); \ - B1 = _mm_xor_si128(B1, B3); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_or_si128(B0, B1); \ - B2 = _mm_xor_si128(B2, B1); \ - B4 = _mm_xor_si128(B4, B0); \ - B0 = _mm_or_si128(B0, B3); \ - B0 = _mm_xor_si128(B0, B2); \ - B4 = _mm_xor_si128(B4, B3); \ - B4 = _mm_xor_si128(B4, B0); \ - B3 = _mm_xor_si128(B3, all_ones); \ - B2 = _mm_and_si128(B2, B4); \ - B3 = _mm_xor_si128(B3, B2); \ + B2 = ~B2; \ + SIMD_32 B4 = B3; \ + B3 &= B0; \ + B0 ^= B4; \ + B3 ^= B2; \ + B2 |= B4; \ + B1 ^= B3; \ + B2 ^= B0; \ + B0 |= B1; \ + B2 ^= B1; \ + B4 ^= B0; \ + B0 |= B3; \ + B0 ^= B2; \ + B4 ^= B3; \ + B4 ^= B0; \ + B3 = ~B3; \ + B2 &= B4; \ + B3 ^= B2; \ B2 = B4; \ } while(0); #define SBoxE8(B0, B1, B2, B3) \ do { \ - __m128i B4 = B1; \ - B1 = _mm_or_si128(B1, B2); \ - B1 = _mm_xor_si128(B1, B3); \ - B4 = _mm_xor_si128(B4, B2); \ - B2 = _mm_xor_si128(B2, B1); \ - B3 = _mm_or_si128(B3, B4); \ - B3 = _mm_and_si128(B3, B0); \ - B4 = _mm_xor_si128(B4, B2); \ - B3 = _mm_xor_si128(B3, B1); \ - B1 = _mm_or_si128(B1, B4); \ - B1 = _mm_xor_si128(B1, B0); \ - B0 = _mm_or_si128(B0, B4); \ - B0 = _mm_xor_si128(B0, B2); \ - B1 = _mm_xor_si128(B1, B4); \ - B2 = _mm_xor_si128(B2, B1); \ - B1 = _mm_and_si128(B1, B0); \ - B1 = _mm_xor_si128(B1, B4); \ - B2 = _mm_xor_si128(B2, all_ones); \ - B2 = _mm_or_si128(B2, B0); \ - B4 = _mm_xor_si128(B4, B2); \ + SIMD_32 B4 = B1; \ + B1 |= B2; \ + B1 ^= B3; \ + B4 ^= B2; \ + B2 ^= B1; \ + B3 |= B4; \ + B3 &= B0; \ + B4 ^= B2; \ + B3 ^= B1; \ + B1 |= B4; \ + B1 ^= B0; \ + B0 |= B4; \ + B0 ^= B2; \ + B1 ^= B4; \ + B2 ^= B1; \ + B1 &= B0; \ + B1 ^= B4; \ + B2 = ~B2; \ + B2 |= B0; \ + B4 ^= B2; \ B2 = B1; \ B1 = B3; \ B3 = B0; \ @@ -215,53 +215,51 @@ } while(0); #define SBoxD1(B0, B1, B2, B3) \ - do \ - { \ - B2 = _mm_xor_si128(B2, all_ones); \ - __m128i B4 = B1; \ - B1 = _mm_or_si128(B1, B0); \ - B4 = _mm_xor_si128(B4, all_ones); \ - B1 = _mm_xor_si128(B1, B2); \ - B2 = _mm_or_si128(B2, B4); \ - B1 = _mm_xor_si128(B1, B3); \ - B0 = _mm_xor_si128(B0, B4); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_and_si128(B0, B3); \ - B4 = _mm_xor_si128(B4, B0); \ - B0 = _mm_or_si128(B0, B1); \ - B0 = _mm_xor_si128(B0, B2); \ - B3 = _mm_xor_si128(B3, B4); \ - B2 = _mm_xor_si128(B2, B1); \ - B3 = _mm_xor_si128(B3, B0); \ - B3 = _mm_xor_si128(B3, B1); \ - B2 = _mm_and_si128(B2, B3); \ - B4 = _mm_xor_si128(B4, B2); \ + do { \ + B2 = ~B2; \ + SIMD_32 B4 = B1; \ + B1 |= B0; \ + B4 = ~B4; \ + B1 ^= B2; \ + B2 |= B4; \ + B1 ^= B3; \ + B0 ^= B4; \ + B2 ^= B0; \ + B0 &= B3; \ + B4 ^= B0; \ + B0 |= B1; \ + B0 ^= B2; \ + B3 ^= B4; \ + B2 ^= B1; \ + B3 ^= B0; \ + B3 ^= B1; \ + B2 &= B3; \ + B4 ^= B2; \ B2 = B1; \ B1 = B4; \ } while(0); #define SBoxD2(B0, B1, B2, B3) \ - do \ - { \ - __m128i B4 = B1; \ - B1 = _mm_xor_si128(B1, B3); \ - B3 = _mm_and_si128(B3, B1); \ - B4 = _mm_xor_si128(B4, B2); \ - B3 = _mm_xor_si128(B3, B0); \ - B0 = _mm_or_si128(B0, B1); \ - B2 = _mm_xor_si128(B2, B3); \ - B0 = _mm_xor_si128(B0, B4); \ - B0 = _mm_or_si128(B0, B2); \ - B1 = _mm_xor_si128(B1, B3); \ - B0 = _mm_xor_si128(B0, B1); \ - B1 = _mm_or_si128(B1, B3); \ - B1 = _mm_xor_si128(B1, B0); \ - B4 = _mm_xor_si128(B4, all_ones); \ - B4 = _mm_xor_si128(B4, B1); \ - B1 = _mm_or_si128(B1, B0); \ - B1 = _mm_xor_si128(B1, B0); \ - B1 = _mm_or_si128(B1, B4); \ - B3 = _mm_xor_si128(B3, B1); \ + do { \ + SIMD_32 B4 = B1; \ + B1 ^= B3; \ + B3 &= B1; \ + B4 ^= B2; \ + B3 ^= B0; \ + B0 |= B1; \ + B2 ^= B3; \ + B0 ^= B4; \ + B0 |= B2; \ + B1 ^= B3; \ + B0 ^= B1; \ + B1 |= B3; \ + B1 ^= B0; \ + B4 = ~B4; \ + B4 ^= B1; \ + B1 |= B0; \ + B1 ^= B0; \ + B1 |= B4; \ + B3 ^= B1; \ B1 = B0; \ B0 = B4; \ B4 = B2; \ @@ -270,52 +268,50 @@ } while(0); #define SBoxD3(B0, B1, B2, B3) \ - do \ - { \ - B2 = _mm_xor_si128(B2, B3); \ - B3 = _mm_xor_si128(B3, B0); \ - __m128i B4 = B3; \ - B3 = _mm_and_si128(B3, B2); \ - B3 = _mm_xor_si128(B3, B1); \ - B1 = _mm_or_si128(B1, B2); \ - B1 = _mm_xor_si128(B1, B4); \ - B4 = _mm_and_si128(B4, B3); \ - B2 = _mm_xor_si128(B2, B3); \ - B4 = _mm_and_si128(B4, B0); \ - B4 = _mm_xor_si128(B4, B2); \ - B2 = _mm_and_si128(B2, B1); \ - B2 = _mm_or_si128(B2, B0); \ - B3 = _mm_xor_si128(B3, all_ones); \ - B2 = _mm_xor_si128(B2, B3); \ - B0 = _mm_xor_si128(B0, B3); \ - B0 = _mm_and_si128(B0, B1); \ - B3 = _mm_xor_si128(B3, B4); \ - B3 = _mm_xor_si128(B3, B0); \ + do { \ + B2 ^= B3; \ + B3 ^= B0; \ + SIMD_32 B4 = B3; \ + B3 &= B2; \ + B3 ^= B1; \ + B1 |= B2; \ + B1 ^= B4; \ + B4 &= B3; \ + B2 ^= B3; \ + B4 &= B0; \ + B4 ^= B2; \ + B2 &= B1; \ + B2 |= B0; \ + B3 = ~B3; \ + B2 ^= B3; \ + B0 ^= B3; \ + B0 &= B1; \ + B3 ^= B4; \ + B3 ^= B0; \ B0 = B1; \ B1 = B4; \ } while(0); #define SBoxD4(B0, B1, B2, B3) \ - do \ - { \ - __m128i B4 = B2; \ - B2 = _mm_xor_si128(B2, B1); \ - B0 = _mm_xor_si128(B0, B2); \ - B4 = _mm_and_si128(B4, B2); \ - B4 = _mm_xor_si128(B4, B0); \ - B0 = _mm_and_si128(B0, B1); \ - B1 = _mm_xor_si128(B1, B3); \ - B3 = _mm_or_si128(B3, B4); \ - B2 = _mm_xor_si128(B2, B3); \ - B0 = _mm_xor_si128(B0, B3); \ - B1 = _mm_xor_si128(B1, B4); \ - B3 = _mm_and_si128(B3, B2); \ - B3 = _mm_xor_si128(B3, B1); \ - B1 = _mm_xor_si128(B1, B0); \ - B1 = _mm_or_si128(B1, B2); \ - B0 = _mm_xor_si128(B0, B3); \ - B1 = _mm_xor_si128(B1, B4); \ - B0 = _mm_xor_si128(B0, B1); \ + do { \ + SIMD_32 B4 = B2; \ + B2 ^= B1; \ + B0 ^= B2; \ + B4 &= B2; \ + B4 ^= B0; \ + B0 &= B1; \ + B1 ^= B3; \ + B3 |= B4; \ + B2 ^= B3; \ + B0 ^= B3; \ + B1 ^= B4; \ + B3 &= B2; \ + B3 ^= B1; \ + B1 ^= B0; \ + B1 |= B2; \ + B0 ^= B3; \ + B1 ^= B4; \ + B0 ^= B1; \ B4 = B0; \ B0 = B2; \ B2 = B3; \ @@ -323,54 +319,52 @@ } while(0); #define SBoxD5(B0, B1, B2, B3) \ - do \ - { \ - __m128i B4 = B2; \ - B2 = _mm_and_si128(B2, B3); \ - B2 = _mm_xor_si128(B2, B1); \ - B1 = _mm_or_si128(B1, B3); \ - B1 = _mm_and_si128(B1, B0); \ - B4 = _mm_xor_si128(B4, B2); \ - B4 = _mm_xor_si128(B4, B1); \ - B1 = _mm_and_si128(B1, B2); \ - B0 = _mm_xor_si128(B0, all_ones); \ - B3 = _mm_xor_si128(B3, B4); \ - B1 = _mm_xor_si128(B1, B3); \ - B3 = _mm_and_si128(B3, B0); \ - B3 = _mm_xor_si128(B3, B2); \ - B0 = _mm_xor_si128(B0, B1); \ - B2 = _mm_and_si128(B2, B0); \ - B3 = _mm_xor_si128(B3, B0); \ - B2 = _mm_xor_si128(B2, B4); \ - B2 = _mm_or_si128(B2, B3); \ - B3 = _mm_xor_si128(B3, B0); \ - B2 = _mm_xor_si128(B2, B1); \ + do { \ + SIMD_32 B4 = B2; \ + B2 &= B3; \ + B2 ^= B1; \ + B1 |= B3; \ + B1 &= B0; \ + B4 ^= B2; \ + B4 ^= B1; \ + B1 &= B2; \ + B0 = ~B0; \ + B3 ^= B4; \ + B1 ^= B3; \ + B3 &= B0; \ + B3 ^= B2; \ + B0 ^= B1; \ + B2 &= B0; \ + B3 ^= B0; \ + B2 ^= B4; \ + B2 |= B3; \ + B3 ^= B0; \ + B2 ^= B1; \ B1 = B3; \ B3 = B4; \ } while(0); #define SBoxD6(B0, B1, B2, B3) \ - do \ - { \ - B1 = _mm_xor_si128(B1, all_ones); \ - __m128i B4 = B3; \ - B2 = _mm_xor_si128(B2, B1); \ - B3 = _mm_or_si128(B3, B0); \ - B3 = _mm_xor_si128(B3, B2); \ - B2 = _mm_or_si128(B2, B1); \ - B2 = _mm_and_si128(B2, B0); \ - B4 = _mm_xor_si128(B4, B3); \ - B2 = _mm_xor_si128(B2, B4); \ - B4 = _mm_or_si128(B4, B0); \ - B4 = _mm_xor_si128(B4, B1); \ - B1 = _mm_and_si128(B1, B2); \ - B1 = _mm_xor_si128(B1, B3); \ - B4 = _mm_xor_si128(B4, B2); \ - B3 = _mm_and_si128(B3, B4); \ - B4 = _mm_xor_si128(B4, B1); \ - B3 = _mm_xor_si128(B3, B4); \ - B4 = _mm_xor_si128(B4, all_ones); \ - B3 = _mm_xor_si128(B3, B0); \ + do { \ + B1 = ~B1; \ + SIMD_32 B4 = B3; \ + B2 ^= B1; \ + B3 |= B0; \ + B3 ^= B2; \ + B2 |= B1; \ + B2 &= B0; \ + B4 ^= B3; \ + B2 ^= B4; \ + B4 |= B0; \ + B4 ^= B1; \ + B1 &= B2; \ + B1 ^= B3; \ + B4 ^= B2; \ + B3 &= B4; \ + B4 ^= B1; \ + B3 ^= B4; \ + B4 = ~B4; \ + B3 ^= B0; \ B0 = B1; \ B1 = B4; \ B4 = B3; \ @@ -379,52 +373,50 @@ } while(0); #define SBoxD7(B0, B1, B2, B3) \ - do \ - { \ - B0 = _mm_xor_si128(B0, B2); \ - __m128i B4 = B2; \ - B2 = _mm_and_si128(B2, B0); \ - B4 = _mm_xor_si128(B4, B3); \ - B2 = _mm_xor_si128(B2, all_ones); \ - B3 = _mm_xor_si128(B3, B1); \ - B2 = _mm_xor_si128(B2, B3); \ - B4 = _mm_or_si128(B4, B0); \ - B0 = _mm_xor_si128(B0, B2); \ - B3 = _mm_xor_si128(B3, B4); \ - B4 = _mm_xor_si128(B4, B1); \ - B1 = _mm_and_si128(B1, B3); \ - B1 = _mm_xor_si128(B1, B0); \ - B0 = _mm_xor_si128(B0, B3); \ - B0 = _mm_or_si128(B0, B2); \ - B3 = _mm_xor_si128(B3, B1); \ - B4 = _mm_xor_si128(B4, B0); \ + do { \ + B0 ^= B2; \ + SIMD_32 B4 = B2; \ + B2 &= B0; \ + B4 ^= B3; \ + B2 = ~B2; \ + B3 ^= B1; \ + B2 ^= B3; \ + B4 |= B0; \ + B0 ^= B2; \ + B3 ^= B4; \ + B4 ^= B1; \ + B1 &= B3; \ + B1 ^= B0; \ + B0 ^= B3; \ + B0 |= B2; \ + B3 ^= B1; \ + B4 ^= B0; \ B0 = B1; \ B1 = B2; \ B2 = B4; \ } while(0); #define SBoxD8(B0, B1, B2, B3) \ - do \ - { \ - __m128i B4 = B2; \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_and_si128(B0, B3); \ - B4 = _mm_or_si128(B4, B3); \ - B2 = _mm_xor_si128(B2, all_ones); \ - B3 = _mm_xor_si128(B3, B1); \ - B1 = _mm_or_si128(B1, B0); \ - B0 = _mm_xor_si128(B0, B2); \ - B2 = _mm_and_si128(B2, B4); \ - B3 = _mm_and_si128(B3, B4); \ - B1 = _mm_xor_si128(B1, B2); \ - B2 = _mm_xor_si128(B2, B0); \ - B0 = _mm_or_si128(B0, B2); \ - B4 = _mm_xor_si128(B4, B1); \ - B0 = _mm_xor_si128(B0, B3); \ - B3 = _mm_xor_si128(B3, B4); \ - B4 = _mm_or_si128(B4, B0); \ - B3 = _mm_xor_si128(B3, B2); \ - B4 = _mm_xor_si128(B4, B2); \ + do { \ + SIMD_32 B4 = B2; \ + B2 ^= B0; \ + B0 &= B3; \ + B4 |= B3; \ + B2 = ~B2; \ + B3 ^= B1; \ + B1 |= B0; \ + B0 ^= B2; \ + B2 &= B4; \ + B3 &= B4; \ + B1 ^= B2; \ + B2 ^= B0; \ + B0 |= B2; \ + B4 ^= B1; \ + B0 ^= B3; \ + B3 ^= B4; \ + B4 |= B0; \ + B3 ^= B2; \ + B4 ^= B2; \ B2 = B1; \ B1 = B0; \ B0 = B3; \ |