From 8d2ceae6d43ab1e21604e112f437b1494def5ef8 Mon Sep 17 00:00:00 2001
From: lloyd
Date: Wed, 12 Aug 2009 14:42:34 +0000
Subject: Small code cleanups in SSE2 Serpent

---
 src/block/serpent_sse2/serp_sse2.cpp    |   6 +-
 src/block/serpent_sse2/serp_sse2_sbox.h | 381 ++++++++++++++++----------------
 2 files changed, 195 insertions(+), 192 deletions(-)

diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp
index ea937c95a..5ce7d8f47 100644
--- a/src/block/serpent_sse2/serp_sse2.cpp
+++ b/src/block/serpent_sse2/serp_sse2.cpp
@@ -65,6 +65,8 @@ void serpent_encrypt_4(const byte in[64],
                        byte out[64],
                        const u32bit keys_32[132])
    {
+   const __m128i all_ones = _mm_set1_epi8(0xFF);
+
    const __m128i* keys = (const __m128i*)(keys_32);
    __m128i* out_mm = (__m128i*)(out);
    __m128i* in_mm = (__m128i*)(in);
@@ -84,6 +86,7 @@ void serpent_encrypt_4(const byte in[64],
    key_xor( 5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor( 6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor( 7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
    key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -92,6 +95,7 @@ void serpent_encrypt_4(const byte in[64],
    key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
    key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -100,6 +104,7 @@ void serpent_encrypt_4(const byte in[64],
    key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
    key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
    key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
@@ -127,7 +132,6 @@ void Serpent_SSE2::encrypt_n(const byte in[], byte out[], u32bit blocks) const
    while(blocks >= 4)
       {
       serpent_encrypt_4(in, out, this->round_key);
-      //Serpent::encrypt_n(in, out, 4);
       in += 4 * BLOCK_SIZE;
       out += 4 * BLOCK_SIZE;
       blocks -= 4;
diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h
index bc8678a89..1660643ad 100644
--- a/src/block/serpent_sse2/serp_sse2_sbox.h
+++ b/src/block/serpent_sse2/serp_sse2_sbox.h
@@ -8,211 +8,210 @@
 #ifndef SERPENT_SSE2_SBOXES_H__
 #define SERPENT_SSE2_SBOXES_H__

-#define SBoxE1(B0, B1, B2, B3) \
-   do { \
-      B3 = _mm_xor_si128(B3, B0); \
-      __m128i B4 = B1; \
-      B1 = _mm_and_si128(B1, B3); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B1 = _mm_xor_si128(B1, B0); \
-      B0 = _mm_or_si128(B0, B3); \
-      B0 = _mm_xor_si128(B0, B4); \
-      B4 = _mm_xor_si128(B4, B3); \
-      B3 = _mm_xor_si128(B3, B2); \
-      B2 = _mm_or_si128(B2, B1); \
-      B2 = _mm_xor_si128(B2, B4); \
-      B4 = _mm_andnot_si128(B4, _mm_set1_epi8(0xFF)); \
-      B4 = _mm_or_si128(B4, B1); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B1 = _mm_xor_si128(B1, B4); \
-      B3 = _mm_or_si128(B3, B0); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B4 = _mm_xor_si128(B4, B3); \
-      B3 = B0; \
-      B0 = B1; \
-      B1 = B4; \
+#define SBoxE1(B0, B1, B2, B3) \
+   do { \
+      B3 = _mm_xor_si128(B3, B0); \
+      __m128i B4 = B1; \
+      B1 = _mm_and_si128(B1, B3); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B1 = _mm_xor_si128(B1, B0); \
+      B0 = _mm_or_si128(B0, B3); \
+      B0 = _mm_xor_si128(B0, B4); \
+      B4 = _mm_xor_si128(B4, B3); \
+      B3 = _mm_xor_si128(B3, B2); \
+      B2 = _mm_or_si128(B2, B1); \
+      B2 = _mm_xor_si128(B2, B4); \
+      B4 = _mm_xor_si128(B4, all_ones); \
+      B4 = _mm_or_si128(B4, B1); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B1 = _mm_xor_si128(B1, B4); \
+      B3 = _mm_or_si128(B3, B0); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B4 = _mm_xor_si128(B4, B3); \
+      B3 = B0; \
+      B0 = B1; \
+      B1 = B4; \
    } while(0);

-#define SBoxE2(B0, B1, B2, B3) \
-   do { \
-      B0 = _mm_andnot_si128(B0, _mm_set1_epi8(0xFF)); \
-      B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
-      __m128i B4 = B0; \
-      B0 = _mm_and_si128(B0, B1); \
-      B2 = _mm_xor_si128(B2, B0); \
-      B0 = _mm_or_si128(B0, B3); \
-      B3 = _mm_xor_si128(B3, B2); \
-      B1 = _mm_xor_si128(B1, B0); \
-      B0 = _mm_xor_si128(B0, B4); \
-      B4 = _mm_or_si128(B4, B1); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B2 = _mm_or_si128(B2, B0); \
-      B2 = _mm_and_si128(B2, B4); \
-      B0 = _mm_xor_si128(B0, B1); \
-      B1 = _mm_and_si128(B1, B2); \
-      B1 = _mm_xor_si128(B1, B0); \
-      B0 = _mm_and_si128(B0, B2); \
-      B4 = _mm_xor_si128(B4, B0); \
-      B0 = B2; \
-      B2 = B3; \
-      B3 = B1; \
-      B1 = B4; \
+#define SBoxE2(B0, B1, B2, B3) \
+   do { \
+      B0 = _mm_xor_si128(B0, all_ones); \
+      B2 = _mm_xor_si128(B2, all_ones); \
+      __m128i B4 = B0; \
+      B0 = _mm_and_si128(B0, B1); \
+      B2 = _mm_xor_si128(B2, B0); \
+      B0 = _mm_or_si128(B0, B3); \
+      B3 = _mm_xor_si128(B3, B2); \
+      B1 = _mm_xor_si128(B1, B0); \
+      B0 = _mm_xor_si128(B0, B4); \
+      B4 = _mm_or_si128(B4, B1); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B2 = _mm_or_si128(B2, B0); \
+      B2 = _mm_and_si128(B2, B4); \
+      B0 = _mm_xor_si128(B0, B1); \
+      B1 = _mm_and_si128(B1, B2); \
+      B1 = _mm_xor_si128(B1, B0); \
+      B0 = _mm_and_si128(B0, B2); \
+      B4 = _mm_xor_si128(B4, B0); \
+      B0 = B2; \
+      B2 = B3; \
+      B3 = B1; \
+      B1 = B4; \
    } while(0);

-#define SBoxE3(B0, B1, B2, B3) \
-   do { \
-      __m128i B4 = B0; \
-      B0 = _mm_and_si128(B0, B2); \
-      B0 = _mm_xor_si128(B0, B3); \
-      B2 = _mm_xor_si128(B2, B1); \
-      B2 = _mm_xor_si128(B2, B0); \
-      B3 = _mm_or_si128(B3, B4); \
-      B3 = _mm_xor_si128(B3, B1); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B1 = B3; \
-      B3 = _mm_or_si128(B3, B4); \
-      B3 = _mm_xor_si128(B3, B0); \
-      B0 = _mm_and_si128(B0, B1); \
-      B4 = _mm_xor_si128(B4, B0); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B1 = _mm_xor_si128(B1, B4); \
-      B4 = _mm_andnot_si128(B4, _mm_set1_epi8(0xFF)); \
-      B0 = B2; \
-      B2 = B1; \
-      B1 = B3; \
-      B3 = B4; \
+#define SBoxE3(B0, B1, B2, B3) \
+   do { \
+      __m128i B4 = B0; \
+      B0 = _mm_and_si128(B0, B2); \
+      B0 = _mm_xor_si128(B0, B3); \
+      B2 = _mm_xor_si128(B2, B1); \
+      B2 = _mm_xor_si128(B2, B0); \
+      B3 = _mm_or_si128(B3, B4); \
+      B3 = _mm_xor_si128(B3, B1); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B1 = B3; \
+      B3 = _mm_or_si128(B3, B4); \
+      B3 = _mm_xor_si128(B3, B0); \
+      B0 = _mm_and_si128(B0, B1); \
+      B4 = _mm_xor_si128(B4, B0); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B1 = _mm_xor_si128(B1, B4); \
+      B4 = _mm_xor_si128(B4, all_ones); \
+      B0 = B2; \
+      B2 = B1; \
+      B1 = B3; \
+      B3 = B4; \
    } while(0);

-#define SBoxE4(B0, B1, B2, B3) \
-   do { \
-      __m128i B4 = B0; \
-      B0 = _mm_or_si128(B0, B3); \
-      B3 = _mm_xor_si128(B3, B1); \
-      B1 = _mm_and_si128(B1, B4); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = _mm_xor_si128(B2, B3); \
-      B3 = _mm_and_si128(B3, B0); \
-      B4 = _mm_or_si128(B4, B1); \
-      B3 = _mm_xor_si128(B3, B4); \
-      B0 = _mm_xor_si128(B0, B1); \
-      B4 = _mm_and_si128(B4, B0); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B1 = _mm_or_si128(B1, B0); \
-      B1 = _mm_xor_si128(B1, B2); \
-      B0 = _mm_xor_si128(B0, B3); \
-      B2 = B1; \
-      B1 = _mm_or_si128(B1, B3); \
-      B1 = _mm_xor_si128(B1, B0); \
-      B0 = B1; \
-      B1 = B2; \
-      B2 = B3; \
-      B3 = B4; \
+#define SBoxE4(B0, B1, B2, B3) \
+   do { \
+      __m128i B4 = B0; \
+      B0 = _mm_or_si128(B0, B3); \
+      B3 = _mm_xor_si128(B3, B1); \
+      B1 = _mm_and_si128(B1, B4); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = _mm_xor_si128(B2, B3); \
+      B3 = _mm_and_si128(B3, B0); \
+      B4 = _mm_or_si128(B4, B1); \
+      B3 = _mm_xor_si128(B3, B4); \
+      B0 = _mm_xor_si128(B0, B1); \
+      B4 = _mm_and_si128(B4, B0); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B1 = _mm_or_si128(B1, B0); \
+      B1 = _mm_xor_si128(B1, B2); \
+      B0 = _mm_xor_si128(B0, B3); \
+      B2 = B1; \
+      B1 = _mm_or_si128(B1, B3); \
+      B0 = _mm_xor_si128(B0, B1); \
+      B1 = B2; \
+      B2 = B3; \
+      B3 = B4; \
    } while(0);

-#define SBoxE5(B0, B1, B2, B3) \
-   do { \
-      B1 = _mm_xor_si128(B1, B3); \
-      B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
-      B2 = _mm_xor_si128(B2, B3); \
-      B3 = _mm_xor_si128(B3, B0); \
-      __m128i B4 = B1; \
-      B1 = _mm_and_si128(B1, B3); \
-      B1 = _mm_xor_si128(B1, B2); \
-      B4 = _mm_xor_si128(B4, B3); \
-      B0 = _mm_xor_si128(B0, B4); \
-      B2 = _mm_and_si128(B2, B4); \
-      B2 = _mm_xor_si128(B2, B0); \
-      B0 = _mm_and_si128(B0, B1); \
-      B3 = _mm_xor_si128(B3, B0); \
-      B4 = _mm_or_si128(B4, B1); \
-      B4 = _mm_xor_si128(B4, B0); \
-      B0 = _mm_or_si128(B0, B3); \
-      B0 = _mm_xor_si128(B0, B2); \
-      B2 = _mm_and_si128(B2, B3); \
-      B0 = _mm_andnot_si128(B0, _mm_set1_epi8(0xFF)); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = B0; \
-      B0 = B1; \
-      B1 = B4; \
+#define SBoxE5(B0, B1, B2, B3) \
+   do { \
+      B1 = _mm_xor_si128(B1, B3); \
+      B3 = _mm_xor_si128(B3, all_ones); \
+      B2 = _mm_xor_si128(B2, B3); \
+      B3 = _mm_xor_si128(B3, B0); \
+      __m128i B4 = B1; \
+      B1 = _mm_and_si128(B1, B3); \
+      B1 = _mm_xor_si128(B1, B2); \
+      B4 = _mm_xor_si128(B4, B3); \
+      B0 = _mm_xor_si128(B0, B4); \
+      B2 = _mm_and_si128(B2, B4); \
+      B2 = _mm_xor_si128(B2, B0); \
+      B0 = _mm_and_si128(B0, B1); \
+      B3 = _mm_xor_si128(B3, B0); \
+      B4 = _mm_or_si128(B4, B1); \
+      B4 = _mm_xor_si128(B4, B0); \
+      B0 = _mm_or_si128(B0, B3); \
+      B0 = _mm_xor_si128(B0, B2); \
+      B2 = _mm_and_si128(B2, B3); \
+      B0 = _mm_xor_si128(B0, all_ones); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = B0; \
+      B0 = B1; \
+      B1 = B4; \
    } while(0);

-#define SBoxE6(B0, B1, B2, B3) \
-   do { \
-      B0 = _mm_xor_si128(B0, B1); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
-      __m128i B4 = B1; \
-      B1 = _mm_and_si128(B1, B0); \
-      B2 = _mm_xor_si128(B2, B3); \
-      B1 = _mm_xor_si128(B1, B2); \
-      B2 = _mm_or_si128(B2, B4); \
-      B4 = _mm_xor_si128(B4, B3); \
-      B3 = _mm_and_si128(B3, B1); \
-      B3 = _mm_xor_si128(B3, B0); \
-      B4 = _mm_xor_si128(B4, B1); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = _mm_xor_si128(B2, B0); \
-      B0 = _mm_and_si128(B0, B3); \
-      B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
-      B0 = _mm_xor_si128(B0, B4); \
-      B4 = _mm_or_si128(B4, B3); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = B0; \
-      B0 = B1; \
-      B1 = B3; \
-      B3 = B4; \
+#define SBoxE6(B0, B1, B2, B3) \
+   do { \
+      B0 = _mm_xor_si128(B0, B1); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B3 = _mm_xor_si128(B3, all_ones); \
+      __m128i B4 = B1; \
+      B1 = _mm_and_si128(B1, B0); \
+      B2 = _mm_xor_si128(B2, B3); \
+      B1 = _mm_xor_si128(B1, B2); \
+      B2 = _mm_or_si128(B2, B4); \
+      B4 = _mm_xor_si128(B4, B3); \
+      B3 = _mm_and_si128(B3, B1); \
+      B3 = _mm_xor_si128(B3, B0); \
+      B4 = _mm_xor_si128(B4, B1); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = _mm_xor_si128(B2, B0); \
+      B0 = _mm_and_si128(B0, B3); \
+      B2 = _mm_xor_si128(B2, all_ones); \
+      B0 = _mm_xor_si128(B0, B4); \
+      B4 = _mm_or_si128(B4, B3); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = B0; \
+      B0 = B1; \
+      B1 = B3; \
+      B3 = B4; \
    } while(0);

-#define SBoxE7(B0, B1, B2, B3) \
-   do { \
-      B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
-      __m128i B4 = B3; \
-      B3 = _mm_and_si128(B3, B0); \
-      B0 = _mm_xor_si128(B0, B4); \
-      B3 = _mm_xor_si128(B3, B2); \
-      B2 = _mm_or_si128(B2, B4); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B2 = _mm_xor_si128(B2, B0); \
-      B0 = _mm_or_si128(B0, B1); \
-      B2 = _mm_xor_si128(B2, B1); \
-      B4 = _mm_xor_si128(B4, B0); \
-      B0 = _mm_or_si128(B0, B3); \
-      B0 = _mm_xor_si128(B0, B2); \
-      B4 = _mm_xor_si128(B4, B3); \
-      B4 = _mm_xor_si128(B4, B0); \
-      B3 = _mm_andnot_si128(B3, _mm_set1_epi8(0xFF)); \
-      B2 = _mm_and_si128(B2, B4); \
-      B3 = _mm_xor_si128(B3, B2); \
-      B2 = B4; \
+#define SBoxE7(B0, B1, B2, B3) \
+   do { \
+      B2 = _mm_xor_si128(B2, all_ones); \
+      __m128i B4 = B3; \
+      B3 = _mm_and_si128(B3, B0); \
+      B0 = _mm_xor_si128(B0, B4); \
+      B3 = _mm_xor_si128(B3, B2); \
+      B2 = _mm_or_si128(B2, B4); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B2 = _mm_xor_si128(B2, B0); \
+      B0 = _mm_or_si128(B0, B1); \
+      B2 = _mm_xor_si128(B2, B1); \
+      B4 = _mm_xor_si128(B4, B0); \
+      B0 = _mm_or_si128(B0, B3); \
+      B0 = _mm_xor_si128(B0, B2); \
+      B4 = _mm_xor_si128(B4, B3); \
+      B4 = _mm_xor_si128(B4, B0); \
+      B3 = _mm_xor_si128(B3, all_ones); \
+      B2 = _mm_and_si128(B2, B4); \
+      B3 = _mm_xor_si128(B3, B2); \
+      B2 = B4; \
    } while(0);

-#define SBoxE8(B0, B1, B2, B3) \
-   do { \
-      __m128i B4 = B1; \
-      B1 = _mm_or_si128(B1, B2); \
-      B1 = _mm_xor_si128(B1, B3); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = _mm_xor_si128(B2, B1); \
-      B3 = _mm_or_si128(B3, B4); \
-      B3 = _mm_and_si128(B3, B0); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B3 = _mm_xor_si128(B3, B1); \
-      B1 = _mm_or_si128(B1, B4); \
-      B1 = _mm_xor_si128(B1, B0); \
-      B0 = _mm_or_si128(B0, B4); \
-      B0 = _mm_xor_si128(B0, B2); \
-      B1 = _mm_xor_si128(B1, B4); \
-      B2 = _mm_xor_si128(B2, B1); \
-      B1 = _mm_and_si128(B1, B0); \
-      B1 = _mm_xor_si128(B1, B4); \
-      B2 = _mm_andnot_si128(B2, _mm_set1_epi8(0xFF)); \
-      B2 = _mm_or_si128(B2, B0); \
-      B4 = _mm_xor_si128(B4, B2); \
-      B2 = B1; \
-      B1 = B3; \
-      B3 = B0; \
-      B0 = B4; \
+#define SBoxE8(B0, B1, B2, B3) \
+   do { \
+      __m128i B4 = B1; \
+      B1 = _mm_or_si128(B1, B2); \
+      B1 = _mm_xor_si128(B1, B3); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = _mm_xor_si128(B2, B1); \
+      B3 = _mm_or_si128(B3, B4); \
+      B3 = _mm_and_si128(B3, B0); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B3 = _mm_xor_si128(B3, B1); \
+      B1 = _mm_or_si128(B1, B4); \
+      B1 = _mm_xor_si128(B1, B0); \
+      B0 = _mm_or_si128(B0, B4); \
+      B0 = _mm_xor_si128(B0, B2); \
+      B1 = _mm_xor_si128(B1, B4); \
+      B2 = _mm_xor_si128(B2, B1); \
+      B1 = _mm_and_si128(B1, B0); \
+      B1 = _mm_xor_si128(B1, B4); \
+      B2 = _mm_xor_si128(B2, all_ones); \
+      B2 = _mm_or_si128(B2, B0); \
+      B4 = _mm_xor_si128(B4, B2); \
+      B2 = B1; \
+      B1 = B3; \
+      B3 = B0; \
+      B0 = B4; \
    } while(0);

 #endif
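
Note on the substitution made throughout the S-box macros: _mm_andnot_si128(a, b) computes (~a) & b, so with b = _mm_set1_epi8(0xFF) it is simply a bitwise NOT of a; XORing against a single hoisted all_ones constant computes the same NOT while letting the constant be materialized once per block instead of inside every macro expansion. The following is a minimal standalone sketch (not part of the patch; the test pattern and the main() harness are illustrative only) that checks the two forms agree:

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdio>
#include <cstring>

int main()
   {
   const __m128i all_ones = _mm_set1_epi8(0xFF);

   // Arbitrary test pattern; the values are made up for this check
   const __m128i x = _mm_set_epi32(0x0123F00D, 0x76543210, 0x00FF00FF, 0x13579BDF);

   // Old form: _mm_andnot_si128(x, all_ones) computes (~x) & all_ones, i.e. ~x
   const __m128i via_andnot = _mm_andnot_si128(x, all_ones);

   // New form: XOR against all-ones also computes ~x
   const __m128i via_xor = _mm_xor_si128(x, all_ones);

   unsigned char a[16], b[16];
   _mm_storeu_si128(reinterpret_cast<__m128i*>(a), via_andnot);
   _mm_storeu_si128(reinterpret_cast<__m128i*>(b), via_xor);

   std::printf("equivalent: %s\n", std::memcmp(a, b, 16) == 0 ? "yes" : "no");
   return 0;
   }

Built with SSE2 enabled (e.g. -msse2, or any x86-64 target), this should print "equivalent: yes", since both expressions reduce to the same bitwise complement of the 128-bit vector.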