aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author lloyd <[email protected]> 2009-08-12 12:23:16 +0000
committer lloyd <[email protected]> 2009-08-12 12:23:16 +0000
commit 285d350ed62d2714592a27bf577832a92cb6902f (patch)
tree 09a477bb63f8f98ce68e108a84fb2bab240897dc
parent c8c3d7f6eecd753aa87a882b1458346682e606db (diff)
Add full 4-way SSE2 Serpent encryption. Load/store operations are done via
unions and can probably be made much faster using interleave operations. This is currently ~2.5x faster in ECB or CTR mode on a Core2, which isn't too bad.
-rw-r--r-- src/block/serpent_sse2/info.txt 1
-rw-r--r-- src/block/serpent_sse2/serp_sse2.cpp 111
-rw-r--r-- src/block/serpent_sse2/serp_sse2_sbox.h 225
3 files changed, 290 insertions, 47 deletions
diff --git a/src/block/serpent_sse2/info.txt b/src/block/serpent_sse2/info.txt
index ad8323f53..09733e98f 100644
--- a/src/block/serpent_sse2/info.txt
+++ b/src/block/serpent_sse2/info.txt
@@ -7,6 +7,7 @@ load_on auto
<add>
serp_sse2.cpp
serp_sse2.h
+serp_sse2_sbox.h
</add>
<arch>
diff --git a/src/block/serpent_sse2/serp_sse2.cpp b/src/block/serpent_sse2/serp_sse2.cpp
index 759f3e1d6..3e78f0bac 100644
--- a/src/block/serpent_sse2/serp_sse2.cpp
+++ b/src/block/serpent_sse2/serp_sse2.cpp
@@ -6,6 +6,7 @@
*/
#include <botan/serp_sse2.h>
+#include <botan/serp_sse2_sbox.h>
#include <botan/loadstor.h>
#include <emmintrin.h>
@@ -13,29 +14,6 @@ namespace Botan {
namespace {
-#define SBoxE1(b0, b1, b2, b3, b4) \
- do { \
- b3 = _mm_xor_si128(b3, b0); \
- b4 = b1; \
- b1 = _mm_and_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b2); \
- b1 = _mm_xor_si128(b1, b0); \
- b0 = _mm_or_si128(b0, b3); \
- b0 = _mm_xor_si128(b0, b4); \
- b4 = _mm_xor_si128(b4, b3); \
- b3 = _mm_xor_si128(b3, b2); \
- b2 = _mm_or_si128(b2, b1); \
- b2 = _mm_xor_si128(b2, b4); \
- b4 = _mm_andnot_si128(b4, all_ones); \
- b4 = _mm_or_si128(b4, b1); \
- b1 = _mm_xor_si128(b1, b3); \
- b1 = _mm_xor_si128(b1, b4); \
- b3 = _mm_or_si128(b3, b0); \
- b1 = _mm_xor_si128(b1, b3); \
- b4 = _mm_xor_si128(b4, b3); \
- b3 = b0; b0 = b1; b1 = b4; \
- } while(0);
-
#define rotate_left_m128(vec, rot) \
_mm_or_si128(_mm_slli_epi32(vec, rot), _mm_srli_epi32(vec, 32-rot))
@@ -89,45 +67,84 @@ void serpent_encrypt_4(const byte in[64],
convert.u32[1] = load_le<u32bit>(in, 4);
convert.u32[2] = load_le<u32bit>(in, 8);
convert.u32[3] = load_le<u32bit>(in, 12);
-
- __m128i b0 = convert.v;
+ __m128i B0 = convert.v;
convert.u32[0] = load_le<u32bit>(in, 1);
convert.u32[1] = load_le<u32bit>(in, 5);
convert.u32[2] = load_le<u32bit>(in, 9);
convert.u32[3] = load_le<u32bit>(in, 13);
-
- __m128i b1 = convert.v;
+ __m128i B1 = convert.v;
convert.u32[0] = load_le<u32bit>(in, 2);
convert.u32[1] = load_le<u32bit>(in, 6);
convert.u32[2] = load_le<u32bit>(in, 10);
convert.u32[3] = load_le<u32bit>(in, 14);
-
- __m128i b2 = convert.v;
+ __m128i B2 = convert.v;
convert.u32[0] = load_le<u32bit>(in, 3);
convert.u32[1] = load_le<u32bit>(in, 7);
convert.u32[2] = load_le<u32bit>(in, 11);
convert.u32[3] = load_le<u32bit>(in, 15);
-
-
- __m128i b3 = convert.v;
-
- __m128i b4; // temp
-
- const __m128i all_ones = _mm_set1_epi8(0xFF);
-
- key_xor(0, b0, b1, b2, b3);
- SBoxE1(b0, b1, b2, b3, b4);
- transform(b0, b1, b2, b3);
-
- key_xor(b0, b1, b2, b3, 1);
-
- print_simd("b0", b0);
- print_simd("b1", b1);
- print_simd("b2", b2);
- print_simd("b3", b3);
+ __m128i B3 = convert.v;
+
+ key_xor(0,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(1,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(2,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(3,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(4,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(5,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(6,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(7,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor( 8,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 9,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(10,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(11,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(12,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(13,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(14,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(15,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(16,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(17,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(18,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(19,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(20,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(21,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(22,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(23,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(24,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(25,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(26,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(27,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(28,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(29,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(30,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(31,B0,B1,B2,B3); SBoxE8(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
+
+ // FIXME: figure out how to do this fast
+ union { __m128i v; u32bit u32[4]; } convert_B0;
+ union { __m128i v; u32bit u32[4]; } convert_B1;
+ union { __m128i v; u32bit u32[4]; } convert_B2;
+ union { __m128i v; u32bit u32[4]; } convert_B3;
+ convert_B0.v = B0;
+ convert_B1.v = B1;
+ convert_B2.v = B2;
+ convert_B3.v = B3;
+ store_le(out,
+ convert_B0.u32[0], convert_B1.u32[0],
+ convert_B2.u32[0], convert_B3.u32[0]);
+
+ store_le(out + 16,
+ convert_B0.u32[1], convert_B1.u32[1],
+ convert_B2.u32[1], convert_B3.u32[1]);
+
+ store_le(out + 32,
+ convert_B0.u32[2], convert_B1.u32[2],
+ convert_B2.u32[2], convert_B3.u32[2]);
+
+ store_le(out + 48,
+ convert_B0.u32[3], convert_B1.u32[3],
+ convert_B2.u32[3], convert_B3.u32[3]);
}
}
diff --git a/src/block/serpent_sse2/serp_sse2_sbox.h b/src/block/serpent_sse2/serp_sse2_sbox.h
new file mode 100644
index 000000000..2c4d9d9cb
--- /dev/null
+++ b/src/block/serpent_sse2/serp_sse2_sbox.h
@@ -0,0 +1,225 @@
+/*
+* Serpent Sboxes in SSE2 form
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef SERPENT_SSE2_SBOXES_H__
+#define SERPENT_SSE2_SBOXES_H__
+
+#define SBoxE1(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 0, 8, 16, 24; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do { \
+ b3 = _mm_xor_si128(b3, b0); \
+ __m128i b4 = b1; /* b4: scratch register local to this expansion */ \
+ b1 = _mm_and_si128(b1, b3); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b1 = _mm_xor_si128(b1, b0); \
+ b0 = _mm_or_si128(b0, b3); \
+ b0 = _mm_xor_si128(b0, b4); \
+ b4 = _mm_xor_si128(b4, b3); \
+ b3 = _mm_xor_si128(b3, b2); \
+ b2 = _mm_or_si128(b2, b1); \
+ b2 = _mm_xor_si128(b2, b4); \
+ b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); /* b4 = ~b4 (andnot against all-ones) */ \
+ b4 = _mm_or_si128(b4, b1); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b1 = _mm_xor_si128(b1, b4); \
+ b3 = _mm_or_si128(b3, b0); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b4 = _mm_xor_si128(b4, b3); \
+ b3 = b0; /* final shuffle moves the results into b0..b3 */ \
+ b0 = b1; \
+ b1 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE2(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 1, 9, 17, 25; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); /* b0 = ~b0 (andnot against all-ones) */ \
+ b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); /* b2 = ~b2 */ \
+ __m128i b4 = b0; /* b4: scratch register local to this expansion */ \
+ b0 = _mm_and_si128(b0, b1); \
+ b2 = _mm_xor_si128(b2, b0); \
+ b0 = _mm_or_si128(b0, b3); \
+ b3 = _mm_xor_si128(b3, b2); \
+ b1 = _mm_xor_si128(b1, b0); \
+ b0 = _mm_xor_si128(b0, b4); \
+ b4 = _mm_or_si128(b4, b1); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b2 = _mm_or_si128(b2, b0); \
+ b2 = _mm_and_si128(b2, b4); \
+ b0 = _mm_xor_si128(b0, b1); \
+ b1 = _mm_and_si128(b1, b2); \
+ b1 = _mm_xor_si128(b1, b0); \
+ b0 = _mm_and_si128(b0, b2); \
+ b4 = _mm_xor_si128(b4, b0); \
+ b0 = b2; /* final shuffle moves the results into b0..b3 */ \
+ b2 = b3; \
+ b3 = b1; \
+ b1 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE3(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 2, 10, 18, 26; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ __m128i b4 = b0; /* b4: scratch register local to this expansion */ \
+ b0 = _mm_and_si128(b0, b2); \
+ b0 = _mm_xor_si128(b0, b3); \
+ b2 = _mm_xor_si128(b2, b1); \
+ b2 = _mm_xor_si128(b2, b0); \
+ b3 = _mm_or_si128(b3, b4); \
+ b3 = _mm_xor_si128(b3, b1); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b1 = b3; \
+ b3 = _mm_or_si128(b3, b4); \
+ b3 = _mm_xor_si128(b3, b0); \
+ b0 = _mm_and_si128(b0, b1); \
+ b4 = _mm_xor_si128(b4, b0); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b1 = _mm_xor_si128(b1, b4); \
+ b4 = _mm_andnot_si128(b4, _mm_set1_epi8(0xFF)); /* b4 = ~b4 (andnot against all-ones) */ \
+ b0 = b2; /* final shuffle moves the results into b0..b3 */ \
+ b2 = b1; \
+ b1 = b3; \
+ b3 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE4(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 3, 11, 19, 27; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ __m128i b4 = b0; /* b4: scratch register local to this expansion */ \
+ b0 = _mm_or_si128(b0, b3); \
+ b3 = _mm_xor_si128(b3, b1); \
+ b1 = _mm_and_si128(b1, b4); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = _mm_xor_si128(b2, b3); \
+ b3 = _mm_and_si128(b3, b0); \
+ b4 = _mm_or_si128(b4, b1); \
+ b3 = _mm_xor_si128(b3, b4); \
+ b0 = _mm_xor_si128(b0, b1); \
+ b4 = _mm_and_si128(b4, b0); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b1 = _mm_or_si128(b1, b0); \
+ b1 = _mm_xor_si128(b1, b2); \
+ b0 = _mm_xor_si128(b0, b3); \
+ b2 = b1; \
+ b1 = _mm_or_si128(b1, b3); \
+ b1 = _mm_xor_si128(b1, b0); \
+ b0 = b1; /* final shuffle moves the results into b0..b3 */ \
+ b1 = b2; \
+ b2 = b3; \
+ b3 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE5(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 4, 12, 20, 28; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ b1 = _mm_xor_si128(b1, b3); \
+ b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); /* b3 = ~b3 (andnot against all-ones) */ \
+ b2 = _mm_xor_si128(b2, b3); \
+ b3 = _mm_xor_si128(b3, b0); \
+ __m128i b4 = b1; /* b4: scratch register local to this expansion */ \
+ b1 = _mm_and_si128(b1, b3); \
+ b1 = _mm_xor_si128(b1, b2); \
+ b4 = _mm_xor_si128(b4, b3); \
+ b0 = _mm_xor_si128(b0, b4); \
+ b2 = _mm_and_si128(b2, b4); \
+ b2 = _mm_xor_si128(b2, b0); \
+ b0 = _mm_and_si128(b0, b1); \
+ b3 = _mm_xor_si128(b3, b0); \
+ b4 = _mm_or_si128(b4, b1); \
+ b4 = _mm_xor_si128(b4, b0); \
+ b0 = _mm_or_si128(b0, b3); \
+ b0 = _mm_xor_si128(b0, b2); \
+ b2 = _mm_and_si128(b2, b3); \
+ b0 = _mm_andnot_si128(b0, _mm_set1_epi8(0xFF)); /* b0 = ~b0 */ \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = b0; /* final shuffle moves the results into b0..b3 */ \
+ b0 = b1; \
+ b1 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE6(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 5, 13, 21, 29; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ b0 = _mm_xor_si128(b0, b1); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); /* b3 = ~b3 (andnot against all-ones) */ \
+ __m128i b4 = b1; /* b4: scratch register local to this expansion */ \
+ b1 = _mm_and_si128(b1, b0); \
+ b2 = _mm_xor_si128(b2, b3); \
+ b1 = _mm_xor_si128(b1, b2); \
+ b2 = _mm_or_si128(b2, b4); \
+ b4 = _mm_xor_si128(b4, b3); \
+ b3 = _mm_and_si128(b3, b1); \
+ b3 = _mm_xor_si128(b3, b0); \
+ b4 = _mm_xor_si128(b4, b1); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = _mm_xor_si128(b2, b0); \
+ b0 = _mm_and_si128(b0, b3); \
+ b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); /* b2 = ~b2 */ \
+ b0 = _mm_xor_si128(b0, b4); \
+ b4 = _mm_or_si128(b4, b3); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = b0; /* final shuffle moves the results into b0..b3 */ \
+ b0 = b1; \
+ b1 = b3; \
+ b3 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE7(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 6, 14, 22, 30; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); /* b2 = ~b2 (andnot against all-ones) */ \
+ __m128i b4 = b3; /* b4: scratch register local to this expansion */ \
+ b3 = _mm_and_si128(b3, b0); \
+ b0 = _mm_xor_si128(b0, b4); \
+ b3 = _mm_xor_si128(b3, b2); \
+ b2 = _mm_or_si128(b2, b4); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b2 = _mm_xor_si128(b2, b0); \
+ b0 = _mm_or_si128(b0, b1); \
+ b2 = _mm_xor_si128(b2, b1); \
+ b4 = _mm_xor_si128(b4, b0); \
+ b0 = _mm_or_si128(b0, b3); \
+ b0 = _mm_xor_si128(b0, b2); \
+ b4 = _mm_xor_si128(b4, b3); \
+ b4 = _mm_xor_si128(b4, b0); \
+ b3 = _mm_andnot_si128(b3, _mm_set1_epi8(0xFF)); /* b3 = ~b3 */ \
+ b2 = _mm_and_si128(b2, b4); \
+ b3 = _mm_xor_si128(b3, b2); \
+ b2 = b4; /* only b2 needs moving: b0, b1, b3 are already in place */ \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#define SBoxE8(b0, b1, b2, b3) /* SSE2 Serpent S-box used by encryption rounds 7, 15, 23, 31; rewrites b0..b3 in place (each must be a plain __m128i lvalue: arguments are expanded many times) */ \
+ do \
+ { \
+ __m128i b4 = b1; /* b4: scratch register local to this expansion */ \
+ b1 = _mm_or_si128(b1, b2); \
+ b1 = _mm_xor_si128(b1, b3); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = _mm_xor_si128(b2, b1); \
+ b3 = _mm_or_si128(b3, b4); \
+ b3 = _mm_and_si128(b3, b0); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b3 = _mm_xor_si128(b3, b1); \
+ b1 = _mm_or_si128(b1, b4); \
+ b1 = _mm_xor_si128(b1, b0); \
+ b0 = _mm_or_si128(b0, b4); \
+ b0 = _mm_xor_si128(b0, b2); \
+ b1 = _mm_xor_si128(b1, b4); \
+ b2 = _mm_xor_si128(b2, b1); \
+ b1 = _mm_and_si128(b1, b0); \
+ b1 = _mm_xor_si128(b1, b4); \
+ b2 = _mm_andnot_si128(b2, _mm_set1_epi8(0xFF)); /* b2 = ~b2 (andnot against all-ones) */ \
+ b2 = _mm_or_si128(b2, b0); \
+ b4 = _mm_xor_si128(b4, b2); \
+ b2 = b1; /* final shuffle moves the results into b0..b3 */ \
+ b1 = b3; \
+ b3 = b0; \
+ b0 = b4; \
+ } while(0) /* no trailing ';': the caller's semicolon ends the statement, so if/else around the macro stays valid */
+
+#endif