diff options
author | lloyd <[email protected]> | 2009-08-12 12:23:16 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2009-08-12 12:23:16 +0000 |
commit | 285d350ed62d2714592a27bf577832a92cb6902f (patch) | |
tree | 09a477bb63f8f98ce68e108a84fb2bab240897dc | |
parent | c8c3d7f6eecd753aa87a882b1458346682e606db (diff) |
Add full 4-way SSE2 Serpent encryption. Load/store operations currently go
through unions; I think they can be made much faster using interleave operations.
Currently ~2.5x faster in ECB or CTR mode on a Core2, which isn't too bad.
-rw-r--r-- | src/block/serpent_sse2/info.txt | 1 | ||||
-rw-r--r-- | src/block/serpent_sse2/serp_sse2.cpp | 111 | ||||
-rw-r--r-- | src/block/serpent_sse2/serp_sse2_sbox.h | 225 |
3 files changed, 290 insertions, 47 deletions
diff --git a/src/block/serpent_sse2/info.txt b/src/block/serpent_sse2/info.txt index ad8323f53..09733e98f 100644 --- a/src/block/serpent_sse2/info.txt +++ b/src/block/serpent_sse2/info.txt @@ -7,6 +7,7 @@ load_on auto <add> serp_sse2.cpp serp_sse2.h +serp_sse2_sbox.h </add> <arch> diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp index 759f3e1d6..3e78f0bac 100644 --- a/src/block/serpent_sse2/serp_sse2.cpp +++ b/src/block/serpent_sse2/serp_sse2.cpp @@ -6,6 +6,7 @@ */ #include <botan/serp_sse2.h> +#include <botan/serp_sse2_sbox.h> #include <botan/loadstor.h> #include <emmintrin.h> @@ -13,29 +14,6 @@ namespace Botan { namespace { -#define SBoxE1(b0, b1, b2, b3, b4) \ - do { \ - b3 = _mm_xor_si128(b3, b0); \ - b4 = b1; \ - b1 = _mm_and_si128(b1, b3); \ - b4 = _mm_xor_si128(b4, b2); \ - b1 = _mm_xor_si128(b1, b0); \ - b0 = _mm_or_si128(b0, b3); \ - b0 = _mm_xor_si128(b0, b4); \ - b4 = _mm_xor_si128(b4, b3); \ - b3 = _mm_xor_si128(b3, b2); \ - b2 = _mm_or_si128(b2, b1); \ - b2 = _mm_xor_si128(b2, b4); \ - b4 = _mm_andnot_si128(b4, all_ones); \ - b4 = _mm_or_si128(b4, b1); \ - b1 = _mm_xor_si128(b1, b3); \ - b1 = _mm_xor_si128(b1, b4); \ - b3 = _mm_or_si128(b3, b0); \ - b1 = _mm_xor_si128(b1, b3); \ - b4 = _mm_xor_si128(b4, b3); \ - b3 = b0; b0 = b1; b1 = b4; \ - } while(0); - #define rotate_left_m128(vec, rot) \ _mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot)) @@ -89,45 +67,84 @@ void serpent_encrypt_4(const byte in[64], convert.u32[1] = load_le<u32bit>(in, 4); convert.u32[2] = load_le<u32bit>(in, 8); convert.u32[3] = load_le<u32bit>(in, 12); - - __m128i b0 = convert.v; + __m128i B0 = convert.v; convert.u32[0] = load_le<u32bit>(in, 1); convert.u32[1] = load_le<u32bit>(in, 5); convert.u32[2] = load_le<u32bit>(in, 9); convert.u32[3] = load_le<u32bit>(in, 13); - - __m128i b1 = convert.v; + __m128i B1 = convert.v; convert.u32[0] = load_le<u32bit>(in, 2); convert.u32[1] = load_le<u32bit>(in, 6); convert.u32[2] = 
load_le<u32bit>(in, 10); convert.u32[3] = load_le<u32bit>(in, 14); - - __m128i b2 = convert.v; + __m128i B2 = convert.v; convert.u32[0] = load_le<u32bit>(in, 3); convert.u32[1] = load_le<u32bit>(in, 7); convert.u32[2] = load_le<u32bit>(in, 11); convert.u32[3] = load_le<u32bit>(in, 15); - - - __m128i b3 = convert.v; - - __m128i b4; // temp - - const __m128i all_ones = _mm_set1_epi8(0xFF); - - key_xor(0, b0, b1, b2, b3); - SBoxE1(b0, b1, b2, b3, b4); - transform(b0, b1, b2, b3); - - key_xor(b0, b1, b2, b3, 1); - - print_simd("b0", b0); - print_simd("b1", b1); - print_simd("b2", b2); - print_simd("b3", b3); + __m128i B3 = convert.v; + + key_xor(0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + + key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(11,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(12,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); 
transform(B0,B1,B2,B3); + key_xor(19,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(20,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(27,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(28,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); + + // FIXME: figure out how to do this fast + union { __m128i v; u32bit u32[4]; } convert_B0; + union { __m128i v; u32bit u32[4]; } convert_B1; + union { __m128i v; u32bit u32[4]; } convert_B2; + union { __m128i v; u32bit u32[4]; } convert_B3; + convert_B0.v = B0; + convert_B1.v = B1; + convert_B2.v = B2; + convert_B3.v = B3; + store_le(out, + convert_B0.u32[0], convert_B1.u32[0], + convert_B2.u32[0], convert_B3.u32[0]); + + store_le(out + 16, + convert_B0.u32[1], convert_B1.u32[1], + convert_B2.u32[1], convert_B3.u32[1]); + + store_le(out + 32, + convert_B0.u32[2], convert_B1.u32[2], + convert_B2.u32[2], convert_B3.u32[2]); + + store_le(out + 48, + convert_B0.u32[3], convert_B1.u32[3], + convert_B2.u32[3], convert_B3.u32[3]); } } diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h new file mode 100644 index 000000000..2c4d9d9cb --- /dev/null +++ b/src/block/serpent_sse2/serp_sse2_sbox.h @@ -0,0 +1,225 @@ +/* +* Serpent Sboxes in SSE2 form +* (C) 2009 Jack Lloyd +* +* Distributed 
under the terms of the Botan license +*/ + +#ifndef SERPENT_SSE2_SBOXES_H__ +#define SERPENT_SSE2_SBOXES_H__ + +#define SBoxE1(b0, b1, b2, b3) \ + do { \ + b3 = _mm_xor_si128(b3, b0); \ + __m128i b4 = b1; \ + b1 = _mm_and_si128(b1, b3); \ + b4 = _mm_xor_si128(b4, b2); \ + b1 = _mm_xor_si128(b1, b0); \ + b0 = _mm_or_si128(b0, b3); \ + b0 = _mm_xor_si128(b0, b4); \ + b4 = _mm_xor_si128(b4, b3); \ + b3 = _mm_xor_si128(b3, b2); \ + b2 = _mm_or_si128(b2, b1); \ + b2 = _mm_xor_si128(b2, b4); \ + b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); \ + b4 = _mm_or_si128(b4, b1); \ + b1 = _mm_xor_si128(b1, b3); \ + b1 = _mm_xor_si128(b1, b4); \ + b3 = _mm_or_si128(b3, b0); \ + b1 = _mm_xor_si128(b1, b3); \ + b4 = _mm_xor_si128(b4, b3); \ + b3 = b0; \ + b0 = b1; \ + b1 = b4; \ + } while(0); + +#define SBoxE2(b0, b1, b2, b3) \ + do \ + { \ + b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); \ + b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \ + __m128i b4 = b0; \ + b0 = _mm_and_si128(b0, b1); \ + b2 = _mm_xor_si128(b2, b0); \ + b0 = _mm_or_si128(b0, b3); \ + b3 = _mm_xor_si128(b3, b2); \ + b1 = _mm_xor_si128(b1, b0); \ + b0 = _mm_xor_si128(b0, b4); \ + b4 = _mm_or_si128(b4, b1); \ + b1 = _mm_xor_si128(b1, b3); \ + b2 = _mm_or_si128(b2, b0); \ + b2 = _mm_and_si128(b2, b4); \ + b0 = _mm_xor_si128(b0, b1); \ + b1 = _mm_and_si128(b1, b2); \ + b1 = _mm_xor_si128(b1, b0); \ + b0 = _mm_and_si128(b0, b2); \ + b4 = _mm_xor_si128(b4, b0); \ + b0 = b2; \ + b2 = b3; \ + b3 = b1; \ + b1 = b4; \ + } while(0); + +#define SBoxE3(b0, b1, b2, b3) \ + do \ + { \ + __m128i b4 = b0; \ + b0 = _mm_and_si128(b0, b2); \ + b0 = _mm_xor_si128(b0, b3); \ + b2 = _mm_xor_si128(b2, b1); \ + b2 = _mm_xor_si128(b2, b0); \ + b3 = _mm_or_si128(b3, b4); \ + b3 = _mm_xor_si128(b3, b1); \ + b4 = _mm_xor_si128(b4, b2); \ + b1 = b3; \ + b3 = _mm_or_si128(b3, b4); \ + b3 = _mm_xor_si128(b3, b0); \ + b0 = _mm_and_si128(b0, b1); \ + b4 = _mm_xor_si128(b4, b0); \ + b1 = _mm_xor_si128(b1, b3); \ + b1 = _mm_xor_si128(b1, b4); 
\ + b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); \ + b0 = b2; \ + b2 = b1; \ + b1 = b3; \ + b3 = b4; \ + } while(0); + +#define SBoxE4(b0, b1, b2, b3) \ + do \ + { \ + __m128i b4 = b0; \ + b0 = _mm_or_si128(b0, b3); \ + b3 = _mm_xor_si128(b3, b1); \ + b1 = _mm_and_si128(b1, b4); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = _mm_xor_si128(b2, b3); \ + b3 = _mm_and_si128(b3, b0); \ + b4 = _mm_or_si128(b4, b1); \ + b3 = _mm_xor_si128(b3, b4); \ + b0 = _mm_xor_si128(b0, b1); \ + b4 = _mm_and_si128(b4, b0); \ + b1 = _mm_xor_si128(b1, b3); \ + b4 = _mm_xor_si128(b4, b2); \ + b1 = _mm_or_si128(b1, b0); \ + b1 = _mm_xor_si128(b1, b2); \ + b0 = _mm_xor_si128(b0, b3); \ + b2 = b1; \ + b1 = _mm_or_si128(b1, b3); \ + b1 = _mm_xor_si128(b1, b0); \ + b0 = b1; \ + b1 = b2; \ + b2 = b3; \ + b3 = b4; \ + } while(0); + +#define SBoxE5(b0, b1, b2, b3) \ + do \ + { \ + b1 = _mm_xor_si128(b1, b3); \ + b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \ + b2 = _mm_xor_si128(b2, b3); \ + b3 = _mm_xor_si128(b3, b0); \ + __m128i b4 = b1; \ + b1 = _mm_and_si128(b1, b3); \ + b1 = _mm_xor_si128(b1, b2); \ + b4 = _mm_xor_si128(b4, b3); \ + b0 = _mm_xor_si128(b0, b4); \ + b2 = _mm_and_si128(b2, b4); \ + b2 = _mm_xor_si128(b2, b0); \ + b0 = _mm_and_si128(b0, b1); \ + b3 = _mm_xor_si128(b3, b0); \ + b4 = _mm_or_si128(b4, b1); \ + b4 = _mm_xor_si128(b4, b0); \ + b0 = _mm_or_si128(b0, b3); \ + b0 = _mm_xor_si128(b0, b2); \ + b2 = _mm_and_si128(b2, b3); \ + b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = b0; \ + b0 = b1; \ + b1 = b4; \ + } while(0); + +#define SBoxE6(b0, b1, b2, b3) \ + do \ + { \ + b0 = _mm_xor_si128(b0, b1); \ + b1 = _mm_xor_si128(b1, b3); \ + b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \ + __m128i b4 = b1; \ + b1 = _mm_and_si128(b1, b0); \ + b2 = _mm_xor_si128(b2, b3); \ + b1 = _mm_xor_si128(b1, b2); \ + b2 = _mm_or_si128(b2, b4); \ + b4 = _mm_xor_si128(b4, b3); \ + b3 = _mm_and_si128(b3, b1); \ + b3 = _mm_xor_si128(b3, b0); \ + b4 = 
_mm_xor_si128(b4, b1); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = _mm_xor_si128(b2, b0); \ + b0 = _mm_and_si128(b0, b3); \ + b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \ + b0 = _mm_xor_si128(b0, b4); \ + b4 = _mm_or_si128(b4, b3); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = b0; \ + b0 = b1; \ + b1 = b3; \ + b3 = b4; \ + } while(0); + +#define SBoxE7(b0, b1, b2, b3) \ + do \ + { \ + b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \ + __m128i b4 = b3; \ + b3 = _mm_and_si128(b3, b0); \ + b0 = _mm_xor_si128(b0, b4); \ + b3 = _mm_xor_si128(b3, b2); \ + b2 = _mm_or_si128(b2, b4); \ + b1 = _mm_xor_si128(b1, b3); \ + b2 = _mm_xor_si128(b2, b0); \ + b0 = _mm_or_si128(b0, b1); \ + b2 = _mm_xor_si128(b2, b1); \ + b4 = _mm_xor_si128(b4, b0); \ + b0 = _mm_or_si128(b0, b3); \ + b0 = _mm_xor_si128(b0, b2); \ + b4 = _mm_xor_si128(b4, b3); \ + b4 = _mm_xor_si128(b4, b0); \ + b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); \ + b2 = _mm_and_si128(b2, b4); \ + b3 = _mm_xor_si128(b3, b2); \ + b2 = b4; \ + } while(0); + +#define SBoxE8(b0, b1, b2, b3) \ + do \ + { \ + __m128i b4 = b1; \ + b1 = _mm_or_si128(b1, b2); \ + b1 = _mm_xor_si128(b1, b3); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = _mm_xor_si128(b2, b1); \ + b3 = _mm_or_si128(b3, b4); \ + b3 = _mm_and_si128(b3, b0); \ + b4 = _mm_xor_si128(b4, b2); \ + b3 = _mm_xor_si128(b3, b1); \ + b1 = _mm_or_si128(b1, b4); \ + b1 = _mm_xor_si128(b1, b0); \ + b0 = _mm_or_si128(b0, b4); \ + b0 = _mm_xor_si128(b0, b2); \ + b1 = _mm_xor_si128(b1, b4); \ + b2 = _mm_xor_si128(b2, b1); \ + b1 = _mm_and_si128(b1, b0); \ + b1 = _mm_xor_si128(b1, b4); \ + b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); \ + b2 = _mm_or_si128(b2, b0); \ + b4 = _mm_xor_si128(b4, b2); \ + b2 = b1; \ + b1 = b3; \ + b3 = b0; \ + b0 = b4; \ + } while(0); + +#endif |