From fdba61bb05f26f8ed979543b8ee9cfab96295800 Mon Sep 17 00:00:00 2001 From: lloyd Date: Wed, 12 Aug 2009 15:20:34 +0000 Subject: Add SSE2 Serpent decryption --- src/block/serpent_sse2/serp_sse2.cpp | 91 +++++++++++++- src/block/serpent_sse2/serp_sse2_sbox.h | 217 ++++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 1 deletion(-) (limited to 'src/block') diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp index 5ce7d8f47..a3319afec 100644 --- a/src/block/serpent_sse2/serp_sse2.cpp +++ b/src/block/serpent_sse2/serp_sse2.cpp @@ -24,11 +24,14 @@ namespace { } while(0); /* -* Serpent's linear transformation +* Serpent's linear transformations */ #define rotate_left_m128(vec, rot) \ _mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot)) +#define rotate_right_m128(vec, rot) \ + _mm_or_si128(_mm_srli_epi32(vec, rot), _mm_slli_epi32(vec, 32-rot)) + #define transform(B0, B1, B2, B3) \ do { \ B0 = rotate_left_m128(B0, 13); \ @@ -43,6 +46,20 @@ namespace { B2 = rotate_left_m128(B2, 22); \ } while(0); +#define i_transform(B0, B1, B2, B3) \ + do { \ + B2 = rotate_right_m128(B2, 22); \ + B0 = rotate_right_m128(B0, 5); \ + B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \ + B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \ + B3 = rotate_right_m128(B3, 7); \ + B1 = rotate_right_m128(B1, 1); \ + B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \ + B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \ + B2 = rotate_right_m128(B2, 3); \ + B0 = rotate_right_m128(B0, 13); \ + } while(0); + /* * 4x4 SSE2 integer matrix transpose */ @@ -122,6 +139,70 @@ void serpent_encrypt_4(const byte in[64], _mm_storeu_si128(out_mm + 3, B3); } +/* +* SSE2 Serpent Decryption of 4 blocks in parallel +*/ +void serpent_decrypt_4(const byte in[64], + byte out[64], + const u32bit keys_32[132]) + { + const __m128i all_ones = _mm_set1_epi8(0xFF); + + const __m128i* keys = (const __m128i*)(keys_32); + __m128i* out_mm = (__m128i*)(out); + __m128i* in_mm = (__m128i*)(in); + + __m128i B0 = _mm_loadu_si128(in_mm); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + transpose(B0, B1, B2, B3); + + key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); + + transpose(B0, B1, B2, B3); + + _mm_storeu_si128(out_mm , B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + } + } /* @@ -150,6 +231,14 @@ void Serpent_SSE2::encrypt_n(const byte in[], byte out[], u32bit blocks) const */ void Serpent_SSE2::decrypt_n(const byte in[], byte out[], u32bit blocks) const { + while(blocks >= 4) + { + serpent_decrypt_4(in, out, this->round_key); + in += 4 * BLOCK_SIZE; + out += 4 * BLOCK_SIZE; + blocks -= 4; + } + for(u32bit i = 0; i != blocks; ++i) { Serpent::decrypt_n(in, out, 1); diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h index 1660643ad..40c552e87 100644 --- a/src/block/serpent_sse2/serp_sse2_sbox.h +++ b/src/block/serpent_sse2/serp_sse2_sbox.h @@ -214,4 +214,221 @@ B0 = B4; \ } while(0); +#define SBoxD1(B0, B1, B2, B3) \ + do \ + { \ + B2 = _mm_xor_si128(B2, all_ones); \ + __m128i B4 = B1; \ + B1 = _mm_or_si128(B1, B0); \ + B4 = _mm_xor_si128(B4, all_ones); \ + B1 = _mm_xor_si128(B1, B2); \ + B2 = _mm_or_si128(B2, B4); \ + B1 = _mm_xor_si128(B1, B3); \ + B0 = _mm_xor_si128(B0, B4); \ + B2 = _mm_xor_si128(B2, B0); \ + B0 = _mm_and_si128(B0, B3); \ + B4 = _mm_xor_si128(B4, B0); \ + B0 = _mm_or_si128(B0, B1); \ + B0 = _mm_xor_si128(B0, B2); \ + B3 = _mm_xor_si128(B3, B4); \ + B2 = _mm_xor_si128(B2, B1); \ + B3 = _mm_xor_si128(B3, B0); \ + B3 = _mm_xor_si128(B3, B1); \ + B2 = _mm_and_si128(B2, B3); \ + B4 = _mm_xor_si128(B4, B2); \ + B2 = B1; \ + B1 = B4; \ + } while(0); + +#define SBoxD2(B0, B1, B2, B3) \ + do \ + { \ + __m128i B4 = B1; \ + B1 = _mm_xor_si128(B1, B3); \ + B3 = _mm_and_si128(B3, B1); \ + B4 = _mm_xor_si128(B4, B2); \ + B3 = _mm_xor_si128(B3, B0); \ + B0 = _mm_or_si128(B0, B1); \ + B2 = _mm_xor_si128(B2, B3); \ + B0 = _mm_xor_si128(B0, B4); \ + B0 = _mm_or_si128(B0, B2); \ + B1 = _mm_xor_si128(B1, B3); \ + B0 = _mm_xor_si128(B0, B1); \ + B1 = _mm_or_si128(B1, B3); \ + B1 = _mm_xor_si128(B1, B0); \ + B4 = _mm_xor_si128(B4, all_ones); \ + B4 = _mm_xor_si128(B4, B1); \ + B1 = _mm_or_si128(B1, B0); \ + B1 = _mm_xor_si128(B1, B0); \ + B1 = _mm_or_si128(B1, B4); \ + B3 = _mm_xor_si128(B3, B1); \ + B1 = B0; \ + B0 = B4; \ + B4 = B2; \ + B2 = B3; \ + B3 = B4; \ + } while(0); + +#define SBoxD3(B0, B1, B2, B3) \ + do \ + { \ + B2 = _mm_xor_si128(B2, B3); \ + B3 = _mm_xor_si128(B3, B0); \ + __m128i B4 = B3; \ + B3 = _mm_and_si128(B3, B2); \ + B3 = _mm_xor_si128(B3, B1); \ + B1 = _mm_or_si128(B1, B2); \ + B1 = _mm_xor_si128(B1, B4); \ + B4 = _mm_and_si128(B4, B3); \ + B2 = _mm_xor_si128(B2, B3); \ + B4 = _mm_and_si128(B4, B0); \ + B4 = _mm_xor_si128(B4, B2); \ + B2 = _mm_and_si128(B2, B1); \ + B2 = _mm_or_si128(B2, B0); \ + B3 = _mm_xor_si128(B3, all_ones); \ + B2 = _mm_xor_si128(B2, B3); \ + B0 = _mm_xor_si128(B0, B3); \ + B0 = _mm_and_si128(B0, B1); \ + B3 = _mm_xor_si128(B3, B4); \ + B3 = _mm_xor_si128(B3, B0); \ + B0 = B1; \ + B1 = B4; \ + } while(0); + +#define SBoxD4(B0, B1, B2, B3) \ + do \ + { \ + __m128i B4 = B2; \ + B2 = _mm_xor_si128(B2, B1); \ + B0 = _mm_xor_si128(B0, B2); \ + B4 = _mm_and_si128(B4, B2); \ + B4 = _mm_xor_si128(B4, B0); \ + B0 = _mm_and_si128(B0, B1); \ + B1 = _mm_xor_si128(B1, B3); \ + B3 = _mm_or_si128(B3, B4); \ + B2 = _mm_xor_si128(B2, B3); \ + B0 = _mm_xor_si128(B0, B3); \ + B1 = _mm_xor_si128(B1, B4); \ + B3 = _mm_and_si128(B3, B2); \ + B3 = _mm_xor_si128(B3, B1); \ + B1 = _mm_xor_si128(B1, B0); \ + B1 = _mm_or_si128(B1, B2); \ + B0 = _mm_xor_si128(B0, B3); \ + B1 = _mm_xor_si128(B1, B4); \ + B0 = _mm_xor_si128(B0, B1); \ + B4 = B0; \ + B0 = B2; \ + B2 = B3; \ + B3 = B4; \ + } while(0); + +#define SBoxD5(B0, B1, B2, B3) \ + do \ + { \ + __m128i B4 = B2; \ + B2 = _mm_and_si128(B2, B3); \ + B2 = _mm_xor_si128(B2, B1); \ + B1 = _mm_or_si128(B1, B3); \ + B1 = _mm_and_si128(B1, B0); \ + B4 = _mm_xor_si128(B4, B2); \ + B4 = _mm_xor_si128(B4, B1); \ + B1 = _mm_and_si128(B1, B2); \ + B0 = _mm_xor_si128(B0, all_ones); \ + B3 = _mm_xor_si128(B3, B4); \ + B1 = _mm_xor_si128(B1, B3); \ + B3 = _mm_and_si128(B3, B0); \ + B3 = _mm_xor_si128(B3, B2); \ + B0 = _mm_xor_si128(B0, B1); \ + B2 = _mm_and_si128(B2, B0); \ + B3 = _mm_xor_si128(B3, B0); \ + B2 = _mm_xor_si128(B2, B4); \ + B2 = _mm_or_si128(B2, B3); \ + B3 = _mm_xor_si128(B3, B0); \ + B2 = _mm_xor_si128(B2, B1); \ + B1 = B3; \ + B3 = B4; \ + } while(0); + +#define SBoxD6(B0, B1, B2, B3) \ + do \ + { \ + B1 = _mm_xor_si128(B1, all_ones); \ + __m128i B4 = B3; \ + B2 = _mm_xor_si128(B2, B1); \ + B3 = _mm_or_si128(B3, B0); \ + B3 = _mm_xor_si128(B3, B2); \ + B2 = _mm_or_si128(B2, B1); \ + B2 = _mm_and_si128(B2, B0); \ + B4 = _mm_xor_si128(B4, B3); \ + B2 = _mm_xor_si128(B2, B4); \ + B4 = _mm_or_si128(B4, B0); \ + B4 = _mm_xor_si128(B4, B1); \ + B1 = _mm_and_si128(B1, B2); \ + B1 = _mm_xor_si128(B1, B3); \ + B4 = _mm_xor_si128(B4, B2); \ + B3 = _mm_and_si128(B3, B4); \ + B4 = _mm_xor_si128(B4, B1); \ + B3 = _mm_xor_si128(B3, B4); \ + B4 = _mm_xor_si128(B4, all_ones); \ + B3 = _mm_xor_si128(B3, B0); \ + B0 = B1; \ + B1 = B4; \ + B4 = B3; \ + B3 = B2; \ + B2 = B4; \ + } while(0); + +#define SBoxD7(B0, B1, B2, B3) \ + do \ + { \ + B0 = _mm_xor_si128(B0, B2); \ + __m128i B4 = B2; \ + B2 = _mm_and_si128(B2, B0); \ + B4 = _mm_xor_si128(B4, B3); \ + B2 = _mm_xor_si128(B2, all_ones); \ + B3 = _mm_xor_si128(B3, B1); \ + B2 = _mm_xor_si128(B2, B3); \ + B4 = _mm_or_si128(B4, B0); \ + B0 = _mm_xor_si128(B0, B2); \ + B3 = _mm_xor_si128(B3, B4); \ + B4 = _mm_xor_si128(B4, B1); \ + B1 = _mm_and_si128(B1, B3); \ + B1 = _mm_xor_si128(B1, B0); \ + B0 = _mm_xor_si128(B0, B3); \ + B0 = _mm_or_si128(B0, B2); \ + B3 = _mm_xor_si128(B3, B1); \ + B4 = _mm_xor_si128(B4, B0); \ + B0 = B1; \ + B1 = B2; \ + B2 = B4; \ + } while(0); + +#define SBoxD8(B0, B1, B2, B3) \ + do \ + { \ + __m128i B4 = B2; \ + B2 = _mm_xor_si128(B2, B0); \ + B0 = _mm_and_si128(B0, B3); \ + B4 = _mm_or_si128(B4, B3); \ + B2 = _mm_xor_si128(B2, all_ones); \ + B3 = _mm_xor_si128(B3, B1); \ + B1 = _mm_or_si128(B1, B0); \ + B0 = _mm_xor_si128(B0, B2); \ + B2 = _mm_and_si128(B2, B4); \ + B3 = _mm_and_si128(B3, B4); \ + B1 = _mm_xor_si128(B1, B2); \ + B2 = _mm_xor_si128(B2, B0); \ + B0 = _mm_or_si128(B0, B2); \ + B4 = _mm_xor_si128(B4, B1); \ + B0 = _mm_xor_si128(B0, B3); \ + B3 = _mm_xor_si128(B3, B4); \ + B4 = _mm_or_si128(B4, B0); \ + B3 = _mm_xor_si128(B3, B2); \ + B4 = _mm_xor_si128(B4, B2); \ + B2 = B1; \ + B1 = B0; \ + B0 = B3; \ + B3 = B4; \ + } while(0); + #endif -- cgit v1.2.3