diff options
Diffstat (limited to 'src/block/serpent_sse2/serp_sse2.cpp')
-rw-r--r-- | src/block/serpent_sse2/serp_sse2.cpp | 142 |
1 files changed, 54 insertions, 88 deletions
diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp index c51bb69ab..be79e870d 100644 --- a/src/block/serpent_sse2/serp_sse2.cpp +++ b/src/block/serpent_sse2/serp_sse2.cpp @@ -1,5 +1,5 @@ /* -* Serpent (SSE2) +* Serpent (SIMD) * (C) 2009 Jack Lloyd * * Distributed under the terms of the Botan license @@ -7,72 +7,50 @@ #include <botan/serp_sse2.h> #include <botan/serp_sse2_sbox.h> +#include <botan/simd_32.h> #include <botan/loadstor.h> -#include <emmintrin.h> namespace Botan { namespace { -#define key_xor(round, B0, B1, B2, B3) \ - do { \ - __m128i key = _mm_loadu_si128(keys + round); \ - B0 = _mm_xor_si128(B0, _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0))); \ - B1 = _mm_xor_si128(B1, _mm_shuffle_epi32(key, _MM_SHUFFLE(1,1,1,1))); \ - B2 = _mm_xor_si128(B2, _mm_shuffle_epi32(key, _MM_SHUFFLE(2,2,2,2))); \ - B3 = _mm_xor_si128(B3, _mm_shuffle_epi32(key, _MM_SHUFFLE(3,3,3,3))); \ +#define key_xor(round, B0, B1, B2, B3) \ + do { \ + B0 ^= SIMD_32(keys[4*round ]); \ + B1 ^= SIMD_32(keys[4*round+1]); \ + B2 ^= SIMD_32(keys[4*round+2]); \ + B3 ^= SIMD_32(keys[4*round+3]); \ } while(0); /* * Serpent's linear transformations */ -#define rotate_left_m128(vec, rot) \ - _mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot)) - -#define rotate_right_m128(vec, rot) \ - _mm_or_si128(_mm_srli_epi32(vec, rot), _mm_slli_epi32(vec, 32-rot)) - -#define transform(B0, B1, B2, B3) \ - do { \ - B0 = rotate_left_m128(B0, 13); \ - B2 = rotate_left_m128(B2, 3); \ - B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \ - B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \ - B1 = rotate_left_m128(B1, 1); \ - B3 = rotate_left_m128(B3, 7); \ - B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \ - B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \ - B0 = rotate_left_m128(B0, 5); \ - B2 = rotate_left_m128(B2, 22); \ +#define transform(B0, B1, B2, B3) \ + do { \ + B0.rotate_left(13); \ + B2.rotate_left(3); \ + B1 ^= B0 ^ B2; \ + B3 ^= B2 ^ (B0 << 3); \ + B1.rotate_left(1); \ + B3.rotate_left(7); \ + B0 ^= B1 ^ B3; \ + B2 ^= B3 ^ (B1 << 7); \ + B0.rotate_left(5); \ + B2.rotate_left(22); \ } while(0); -#define i_transform(B0, B1, B2, B3) \ - do { \ - B2 = rotate_right_m128(B2, 22); \ - B0 = rotate_right_m128(B0, 5); \ - B2 = _mm_xor_si128(B2, _mm_xor_si128(B3, _mm_slli_epi32(B1, 7))); \ - B0 = _mm_xor_si128(B0, _mm_xor_si128(B1, B3)); \ - B3 = rotate_right_m128(B3, 7); \ - B1 = rotate_right_m128(B1, 1); \ - B3 = _mm_xor_si128(B3, _mm_xor_si128(B2, _mm_slli_epi32(B0, 3))); \ - B1 = _mm_xor_si128(B1, _mm_xor_si128(B0, B2)); \ - B2 = rotate_right_m128(B2, 3); \ - B0 = rotate_right_m128(B0, 13); \ - } while(0); - -/* -* 4x4 SSE2 integer matrix transpose -*/ -#define transpose(B0, B1, B2, B3) \ - do { \ - __m128i T0 = _mm_unpacklo_epi32(B0, B1); \ - __m128i T1 = _mm_unpacklo_epi32(B2, B3); \ - __m128i T2 = _mm_unpackhi_epi32(B0, B1); \ - __m128i T3 = _mm_unpackhi_epi32(B2, B3); \ - B0 = _mm_unpacklo_epi64(T0, T1); \ - B1 = _mm_unpackhi_epi64(T0, T1); \ - B2 = _mm_unpacklo_epi64(T2, T3); \ - B3 = _mm_unpackhi_epi64(T2, T3); \ +#define i_transform(B0, B1, B2, B3) \ + do { \ + B2.rotate_right(22); \ + B0.rotate_right(5); \ + B2 ^= B3 ^ (B1 << 7); \ + B0 ^= B1 ^ B3; \ + B3.rotate_right(7); \ + B1.rotate_right(1); \ + B3 ^= B2 ^ (B0 << 3); \ + B1 ^= B0 ^ B2; \ + B2.rotate_right(3); \ + B0.rotate_right(13); \ } while(0); /* @@ -80,20 +58,14 @@ namespace { */ void serpent_encrypt_4(const byte in[64], byte out[64], - const u32bit keys_32[132]) + const u32bit keys[132]) { - const __m128i all_ones = _mm_set1_epi8(0xFF); + SIMD_32 B0 = SIMD_32::load_le(in); + SIMD_32 B1 = SIMD_32::load_le(in + 16); + SIMD_32 B2 = SIMD_32::load_le(in + 32); + SIMD_32 B3 = SIMD_32::load_le(in + 48); - const __m128i* keys = (const __m128i*)(keys_32); - __m128i* out_mm = (__m128i*)(out); - __m128i* in_mm = (__m128i*)(in); - - __m128i B0 = _mm_loadu_si128(in_mm); - __m128i B1 = _mm_loadu_si128(in_mm + 1); - __m128i B2 = _mm_loadu_si128(in_mm + 2); - __m128i B3 = _mm_loadu_si128(in_mm + 3); - - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); key_xor( 0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); @@ -131,12 +103,12 @@ void serpent_encrypt_4(const byte in[64], key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); - _mm_storeu_si128(out_mm , B0); - _mm_storeu_si128(out_mm + 1, B1); - _mm_storeu_si128(out_mm + 2, B2); - _mm_storeu_si128(out_mm + 3, B3); + B0.store_le(out); + B1.store_le(out + 16); + B2.store_le(out + 32); + B3.store_le(out + 48); } /* @@ -144,20 +116,14 @@ void serpent_encrypt_4(const byte in[64], */ void serpent_decrypt_4(const byte in[64], byte out[64], - const u32bit keys_32[132]) + const u32bit keys[132]) { - const __m128i all_ones = _mm_set1_epi8(0xFF); - - const __m128i* keys = (const __m128i*)(keys_32); - __m128i* out_mm = (__m128i*)(out); - __m128i* in_mm = (__m128i*)(in); - - __m128i B0 = _mm_loadu_si128(in_mm); - __m128i B1 = _mm_loadu_si128(in_mm + 1); - __m128i B2 = _mm_loadu_si128(in_mm + 2); - __m128i B3 = _mm_loadu_si128(in_mm + 3); + SIMD_32 B0 = SIMD_32::load_le(in); + SIMD_32 B1 = SIMD_32::load_le(in + 16); + SIMD_32 B2 = SIMD_32::load_le(in + 32); + SIMD_32 B3 = SIMD_32::load_le(in + 48); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); key_xor(32,B0,B1,B2,B3); SBoxD8(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); @@ -195,12 +161,12 @@ void serpent_decrypt_4(const byte in[64], i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); - transpose(B0, B1, B2, B3); + SIMD_32::transpose(B0, B1, B2, B3); - _mm_storeu_si128(out_mm , B0); - _mm_storeu_si128(out_mm + 1, B1); - _mm_storeu_si128(out_mm + 2, B2); - _mm_storeu_si128(out_mm + 3, B3); + B0.store_le(out); + B1.store_le(out + 16); + B2.store_le(out + 32); + B3.store_le(out + 48); } } |