aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlloyd <[email protected]>2009-12-23 16:51:24 +0000
committerlloyd <[email protected]>2009-12-23 16:51:24 +0000
commit92a851901ea42398c221a608348d1336b5529b09 (patch)
tree22dab44199d116f1de2ede8a03b5ffc3a5bc6247
parent3a652cd28ec554267be414d69ed14b46956f84d7 (diff)
Add last night's project, an SSE2 implementation of IDEA. Roughly 4x
faster than the scalar version on a Core2.
-rw-r--r--checks/validate.dat17
-rw-r--r--doc/log.txt5
-rw-r--r--src/block/idea/idea.cpp78
-rw-r--r--src/block/idea/idea.h2
-rw-r--r--src/block/idea_sse2/idea_sse2.cpp227
-rw-r--r--src/block/idea_sse2/idea_sse2.h29
-rw-r--r--src/block/idea_sse2/info.txt7
-rw-r--r--src/engine/simd_engine/simd_engine.cpp14
8 files changed, 320 insertions, 59 deletions
diff --git a/checks/validate.dat b/checks/validate.dat
index ff2ed2f4b..9d319eaf3 100644
--- a/checks/validate.dat
+++ b/checks/validate.dat
@@ -4799,6 +4799,12 @@ D5D5D5D5D5D5D5D5:75F7C7005EA47839:D5D5D5D5D5D5D5D5D5D5D5D5D5D5D5D5
0000000000000000:AA553A5DEC50E4A4:00000000000000040000000000000000
0000000000000001:0013FFF500120009:00000000000000000000000000000000
+000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\
+FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328:\
+11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\
+85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09:\
+00010002000300040005000600070008
+
# Test vectors taken from ARIB STD-T63-35.203 V6.0.0
[KASUMI]
EA024714AD5C4D84:DF1F9B251C0BF45F:2BD6459F82C5B300952C49104881FF48
@@ -23023,6 +23029,17 @@ AAE1BA501274C49A7A7EC67D7577114B7707DAB9D066AF086C09E7DD4116CEA6\
EE25DA9A65EF05A31ED0BDF56D525EC8968D1D01AF7165C5AEAC76BD367A575A:\
00000000000000000000000000000000
+[IDEA/ECB/NoPadding]
+000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\
+FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328\
+000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\
+FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328:\
+11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\
+85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09\
+11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\
+85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09:\
+00010002000300040005000600070008
+
[DES/ECB/NoPadding]
059B5E0851CF143A:86A560F10EC6D85B:0113B970FD34F2CE
4E6F772069732074:3FA40E8A984D4815:0123456789ABCDEF
diff --git a/doc/log.txt b/doc/log.txt
index c2eda5092..c150e6f66 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -1,11 +1,12 @@
* 1.9.4-dev, ????-??-??
+ - Add SSE2 implementation of IDEA
+ - Add support for Win32 high resolution system timers
+ - Remove Timer class entirely
- New option --gen-amalgamation for creating a SQLite-style amalgamation
- Many headers are now explicitly internal-use-only and are not installed
- Greatly improve the Win32 installer
- Several fixes for Visual C++ debug builds
- - Add support for Win32 high resolution timers
- - Remove Timer class entirely
* 1.9.3, 2009-11-19
- Add new AES implementation using Intel's AES instruction intrinsics
diff --git a/src/block/idea/idea.cpp b/src/block/idea/idea.cpp
index fb5fe83f1..15ff7c0ec 100644
--- a/src/block/idea/idea.cpp
+++ b/src/block/idea/idea.cpp
@@ -55,13 +55,13 @@ u16bit mul_inv(u16bit x)
return (1 - t0);
}
-}
-
-/*
-* IDEA Encryption
+/**
+* IDEA is involutional: encryption and decryption run the same operation and differ only in the key schedule K passed in
*/
-void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+void idea_op(const byte in[], byte out[], u32bit blocks, const u16bit K[52])
{
+ const u32bit BLOCK_SIZE = 8;
+
for(u32bit i = 0; i != blocks; ++i)
{
u16bit X1 = load_be<u16bit>(in, 0);
@@ -71,16 +71,16 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
for(u32bit j = 0; j != 8; ++j)
{
- X1 = mul(X1, EK[6*j+0]);
- X2 += EK[6*j+1];
- X3 += EK[6*j+2];
- X4 = mul(X4, EK[6*j+3]);
+ X1 = mul(X1, K[6*j+0]);
+ X2 += K[6*j+1];
+ X3 += K[6*j+2];
+ X4 = mul(X4, K[6*j+3]);
u16bit T0 = X3;
- X3 = mul(X3 ^ X1, EK[6*j+4]);
+ X3 = mul(X3 ^ X1, K[6*j+4]);
u16bit T1 = X2;
- X2 = mul((X2 ^ X4) + X3, EK[6*j+5]);
+ X2 = mul((X2 ^ X4) + X3, K[6*j+5]);
X3 += X2;
X1 ^= X2;
@@ -89,10 +89,10 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
X3 ^= T1;
}
- X1 = mul(X1, EK[48]);
- X2 += EK[50];
- X3 += EK[49];
- X4 = mul(X4, EK[51]);
+ X1 = mul(X1, K[48]);
+ X2 += K[50];
+ X3 += K[49];
+ X4 = mul(X4, K[51]);
store_be(out, X1, X3, X2, X4);
@@ -101,48 +101,22 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
}
}
+}
+
+/*
+* IDEA Encryption: runs the shared idea_op routine over `blocks` blocks using the EK subkeys
+*/
+void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+ {
+ idea_op(in, out, blocks, EK); // same routine as decrypt_n; only the subkey array differs
+ }
+
/*
* IDEA Decryption
*/
void IDEA::decrypt_n(const byte in[], byte out[], u32bit blocks) const
{
- for(u32bit i = 0; i != blocks; ++i)
- {
- u16bit X1 = load_be<u16bit>(in, 0);
- u16bit X2 = load_be<u16bit>(in, 1);
- u16bit X3 = load_be<u16bit>(in, 2);
- u16bit X4 = load_be<u16bit>(in, 3);
-
- for(u32bit j = 0; j != 8; ++j)
- {
- X1 = mul(X1, DK[6*j+0]);
- X2 += DK[6*j+1];
- X3 += DK[6*j+2];
- X4 = mul(X4, DK[6*j+3]);
-
- u16bit T0 = X3;
- X3 = mul(X3 ^ X1, DK[6*j+4]);
-
- u16bit T1 = X2;
- X2 = mul((X2 ^ X4) + X3, DK[6*j+5]);
- X3 += X2;
-
- X1 ^= X2;
- X4 ^= X3;
- X2 ^= T0;
- X3 ^= T1;
- }
-
- X1 = mul(X1, DK[48]);
- X2 += DK[50];
- X3 += DK[49];
- X4 = mul(X4, DK[51]);
-
- store_be(out, X1, X3, X2, X4);
-
- in += BLOCK_SIZE;
- out += BLOCK_SIZE;
- }
+ idea_op(in, out, blocks, DK); // replaces the duplicated round loop removed above; same routine as encrypt_n but with the DK subkeys
}
/*
diff --git a/src/block/idea/idea.h b/src/block/idea/idea.h
index c1a79f423..89ec117e3 100644
--- a/src/block/idea/idea.h
+++ b/src/block/idea/idea.h
@@ -26,7 +26,7 @@ class BOTAN_DLL IDEA : public BlockCipher
BlockCipher* clone() const { return new IDEA; }
IDEA() : BlockCipher(8, 16) {}
- private:
+ protected:
void key_schedule(const byte[], u32bit);
SecureBuffer<u16bit, 52> EK, DK;
};
diff --git a/src/block/idea_sse2/idea_sse2.cpp b/src/block/idea_sse2/idea_sse2.cpp
new file mode 100644
index 000000000..c00d13ee9
--- /dev/null
+++ b/src/block/idea_sse2/idea_sse2.cpp
@@ -0,0 +1,227 @@
+/*
+* IDEA in SSE2
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#include <botan/idea_sse2.h>
+#include <botan/loadstor.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+namespace {
+
+inline __m128i mul(__m128i X, u16bit K_16) // SIMD counterpart of the scalar mul() used by idea_op: combines each 16-bit lane of X with the single subkey K_16
+ {
+ const __m128i zeros = _mm_set1_epi16(0);
+ const __m128i ones = _mm_set1_epi16(1);
+ const __m128i high_bit = _mm_set1_epi16(0x8000); // bias added to both operands of the signed compare below
+
+ const __m128i K = _mm_set1_epi16(K_16); // subkey broadcast to all eight lanes
+
+ const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros); // per-lane mask: all ones where X[i] == 0
+ const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros); // all ones in every lane when K_16 == 0, else all zeros
+
+ const __m128i mul_lo = _mm_mullo_epi16(X, K); // low 16 bits of each lane product
+ const __m128i mul_hi = _mm_mulhi_epu16(X, K); // high 16 bits of each unsigned lane product
+
+ __m128i T = _mm_sub_epi16(mul_lo, mul_hi); // lo - hi with 16-bit wraparound
+
+ // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0 (_mm_cmpgt_epi16 is signed, so both sides are offset by 0x8000 first)
+ const __m128i cmp = _mm_srli_epi16(_mm_cmpgt_epi16(
+ _mm_add_epi16(mul_hi, high_bit),
+ _mm_add_epi16(mul_lo, high_bit)),
+ 15); // logical shift turns the all-ones compare mask into the value 1
+
+ T = _mm_add_epi16(T, cmp); // T = lo - hi + (lo < hi)
+
+ /* Selection: if X[i] is zero then assign 1-K
+ if K is zero then assign 1-X[i]
+
+ Could if() off value of K_16 for the second, but this gives a
+ constant time implementation which is a nice bonus.
+ */
+
+ T = _mm_or_si128( // lane-wise select: X_is_zero ? (1 - K) : T
+ _mm_andnot_si128(X_is_zero, T),
+ _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
+
+ T = _mm_or_si128( // lane-wise select: K_is_zero ? (1 - X) : T; applied second, so it wins when both are zero (result 1)
+ _mm_andnot_si128(K_is_zero, T),
+ _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
+
+ return T;
+ }
+
+/*
+* 4x8 matrix transpose: with B0..B3 holding blocks (a,b) (c,d) (e,f) (g,h) of four 16-bit words each, Bk leaves holding word k of blocks a..h
+* Notation in the comments below: xN = 16-bit word N of block x, xNM = the 32-bit lane holding words N and M of block x
+* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
+* transpose_out doesn't need it. Something with the shuffle? Removing
+* that extra unpack could easily save 3-4 cycles per block, and would
+* also help a lot with register pressure on 32-bit x86
+*/
+void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
+ {
+ __m128i T0 = _mm_unpackhi_epi32(B0, B1); // [b01 d01 b23 d23]
+ __m128i T1 = _mm_unpacklo_epi32(B0, B1); // [a01 c01 a23 c23]
+ __m128i T2 = _mm_unpackhi_epi32(B2, B3); // [f01 h01 f23 h23]
+ __m128i T3 = _mm_unpacklo_epi32(B2, B3); // [e01 g01 e23 g23]
+
+ __m128i T4 = _mm_unpacklo_epi32(T0, T1); // [b01 a01 d01 c01]
+ __m128i T5 = _mm_unpackhi_epi32(T0, T1); // [b23 a23 d23 c23]
+ __m128i T6 = _mm_unpacklo_epi32(T2, T3); // [f01 e01 h01 g01]
+ __m128i T7 = _mm_unpackhi_epi32(T2, T3); // [f23 e23 h23 g23]
+
+ T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2)); // upper four words: [d0 d1 c0 c1] -> [c0 d0 c1 d1]
+ T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2)); // same permutation on words 2,3
+ T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2)); // same, blocks g,h
+ T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
+
+ T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2)); // lower four words: [b0 b1 a0 a1] -> [a0 b0 a1 b1]
+ T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2)); // same permutation on words 2,3
+ T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2)); // same, blocks e,f
+ T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
+
+ T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); // swap the two middle 32-bit lanes: [a0 b0 c0 d0 a1 b1 c1 d1]
+ T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); // [a2 b2 c2 d2 a3 b3 c3 d3]
+ T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); // [e0 f0 g0 h0 e1 f1 g1 h1]
+ T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); // [e2 f2 g2 h2 e3 f3 g3 h3]
+
+ B0 = _mm_unpacklo_epi64(T0, T2); // word 0 of blocks a..h
+ B1 = _mm_unpackhi_epi64(T0, T2); // word 1 of blocks a..h
+ B2 = _mm_unpacklo_epi64(T1, T3); // word 2 of blocks a..h
+ B3 = _mm_unpackhi_epi64(T1, T3); // word 3 of blocks a..h
+ }
+
+/*
+* 4x8 matrix transpose (reverse): Bk enters holding 16-bit word k of blocks a..h; B0..B3 leave holding whole blocks (a,b) (c,d) (e,f) (g,h)
+*/
+void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
+ {
+ __m128i T0 = _mm_unpacklo_epi64(B0, B1); // [a0 b0 c0 d0 a1 b1 c1 d1] (xN = word N of block x)
+ __m128i T1 = _mm_unpacklo_epi64(B2, B3); // [a2 b2 c2 d2 a3 b3 c3 d3]
+ __m128i T2 = _mm_unpackhi_epi64(B0, B1); // [e0 f0 g0 h0 e1 f1 g1 h1]
+ __m128i T3 = _mm_unpackhi_epi64(B2, B3); // [e2 f2 g2 h2 e3 f3 g3 h3]
+
+ T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); // swap the two middle 32-bit lanes: [a0 b0 a1 b1 c0 d0 c1 d1]
+ T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); // [a2 b2 a3 b3 c2 d2 c3 d3]
+ T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); // [e0 f0 e1 f1 g0 h0 g1 h1]
+ T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); // [e2 f2 e3 f3 g2 h2 g3 h3]
+
+ T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); // upper four words: [c0 d0 c1 d1] -> [c0 c1 d0 d1]
+ T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); // same permutation on words 2,3
+ T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); // same, blocks g,h
+ T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
+
+ T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); // lower four words: [a0 b0 a1 b1] -> [a0 a1 b0 b1]
+ T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); // same permutation on words 2,3
+ T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); // same, blocks e,f
+ T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
+
+ B0 = _mm_unpacklo_epi32(T0, T1); // blocks a, b
+ B1 = _mm_unpackhi_epi32(T0, T1); // blocks c, d
+ B2 = _mm_unpacklo_epi32(T2, T3); // blocks e, f
+ B3 = _mm_unpackhi_epi32(T2, T3); // blocks g, h
+ }
+
+/*
+* IDEA encryption/decryption in SSE2: runs 8 blocks (64 bytes at in, 64 bytes to out) in parallel with the 52 subkeys in EK (callers pass either EK or DK)
+*/
+void idea_op_8(const byte in[64], byte out[64], const u16bit EK[52])
+ {
+ __m128i B0 = _mm_loadu_si128((const __m128i*)in); // unaligned loads: bytes 0-15 (blocks 0,1)
+ __m128i B1 = _mm_loadu_si128((const __m128i*)in + 1); // bytes 16-31 (blocks 2,3)
+ __m128i B2 = _mm_loadu_si128((const __m128i*)in + 2); // bytes 32-47 (blocks 4,5)
+ __m128i B3 = _mm_loadu_si128((const __m128i*)in + 3); // bytes 48-63 (blocks 6,7)
+
+ transpose_in(B0, B1, B2, B3); // B0..B3 now play the role of X1..X4 in the scalar idea_op, one lane per block
+
+ // byte swap each 16-bit lane (the scalar code reads its words with load_be)
+ B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
+ B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
+ B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
+ B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
+
+ for(u32bit i = 0; i != 8; ++i) // 8 rounds using 6 subkeys each, same sequence as the scalar idea_op loop
+ {
+ B0 = mul(B0, EK[6*i+0]);
+ B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1])); // 16-bit wrapping add of the broadcast subkey
+ B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
+ B3 = mul(B3, EK[6*i+3]);
+
+ __m128i T0 = B2; // saved pre-mixing B2, XORed into B1 at the end of the round
+
+ B2 = _mm_xor_si128(B2, B0);
+ B2 = mul(B2, EK[6*i+4]);
+
+ __m128i T1 = B1; // saved pre-mixing B1, XORed into B2 at the end of the round
+
+ B1 = _mm_xor_si128(B1, B3);
+ B1 = _mm_add_epi16(B1, B2);
+ B1 = mul(B1, EK[6*i+5]);
+
+ B2 = _mm_add_epi16(B2, B1);
+
+ B0 = _mm_xor_si128(B0, B1);
+ B1 = _mm_xor_si128(B1, T0);
+ B3 = _mm_xor_si128(B3, B2); // must read B2 before the next line folds T1 into it
+ B2 = _mm_xor_si128(B2, T1);
+ }
+
+ B0 = mul(B0, EK[48]); // final transform with subkeys 48..51
+ B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50])); // subkeys 49/50 applied crosswise, as in the scalar code (X2 += K[50], X3 += K[49])
+ B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
+ B3 = mul(B3, EK[51]);
+
+ // byte swap each 16-bit lane back to the byte order of the input
+ B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
+ B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
+ B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
+ B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
+
+ transpose_out(B0, B2, B1, B3); // B2 and B1 swapped: output word order is (B0, B2, B1, B3), matching store_be(out, X1, X3, X2, X4) in the scalar code
+
+ _mm_storeu_si128((__m128i*)out, B0); // unaligned stores; after the swapped call above B2 holds blocks 2,3 and B1 holds blocks 4,5
+ _mm_storeu_si128((__m128i*)out + 1, B2);
+ _mm_storeu_si128((__m128i*)out + 2, B1);
+ _mm_storeu_si128((__m128i*)out + 3, B3);
+ }
+
+}
+
+/*
+* IDEA Encryption: each full group of 8 blocks goes through idea_op_8 with EK; any remainder is passed to IDEA::encrypt_n
+*/
+void IDEA_SSE2::encrypt_n(const byte in[], byte out[], u32bit blocks) const
+ {
+ while(blocks >= 8)
+ {
+ idea_op_8(in, out, this->EK);
+ in += 8 * BLOCK_SIZE; // advance past the 8 blocks just processed
+ out += 8 * BLOCK_SIZE;
+ blocks -= 8;
+ }
+
+ IDEA::encrypt_n(in, out, blocks); // remaining 0-7 blocks via the base-class implementation
+ }
+
+/*
+* IDEA Decryption: each full group of 8 blocks goes through idea_op_8 with DK; any remainder is passed to IDEA::decrypt_n
+*/
+void IDEA_SSE2::decrypt_n(const byte in[], byte out[], u32bit blocks) const
+ {
+ while(blocks >= 8)
+ {
+ idea_op_8(in, out, this->DK);
+ in += 8 * BLOCK_SIZE; // advance past the 8 blocks just processed
+ out += 8 * BLOCK_SIZE;
+ blocks -= 8;
+ }
+
+ IDEA::decrypt_n(in, out, blocks); // remaining 0-7 blocks via the base-class implementation
+ }
+
+}
diff --git a/src/block/idea_sse2/idea_sse2.h b/src/block/idea_sse2/idea_sse2.h
new file mode 100644
index 000000000..167c981f8
--- /dev/null
+++ b/src/block/idea_sse2/idea_sse2.h
@@ -0,0 +1,29 @@
+/*
+* IDEA in SSE2
+* (C) 2009 Jack Lloyd
+*
+* Distributed under the terms of the Botan license
+*/
+
+#ifndef BOTAN_IDEA_SSE2_H__
+#define BOTAN_IDEA_SSE2_H__
+
+#include <botan/idea.h>
+
+namespace Botan {
+
+/*
+* IDEA in SSE2: derives from IDEA and supplies its own encrypt_n/decrypt_n, which process 8 blocks at a time
+*/
+class BOTAN_DLL IDEA_SSE2 : public IDEA
+ {
+ public:
+ void encrypt_n(const byte in[], byte out[], u32bit blocks) const;
+ void decrypt_n(const byte in[], byte out[], u32bit blocks) const;
+
+ BlockCipher* clone() const { return new IDEA_SSE2; } // new default-constructed instance; presumably owned by the caller - confirm against the BlockCipher contract
+ };
+
+}
+
+#endif
diff --git a/src/block/idea_sse2/info.txt b/src/block/idea_sse2/info.txt
new file mode 100644
index 000000000..fe09d3ee5
--- /dev/null
+++ b/src/block/idea_sse2/info.txt
@@ -0,0 +1,7 @@
+define IDEA_SSE2
+
+need_isa sse2
+
+<requires>
+idea
+</requires>
diff --git a/src/engine/simd_engine/simd_engine.cpp b/src/engine/simd_engine/simd_engine.cpp
index 892221f22..b8ebd6a80 100644
--- a/src/engine/simd_engine/simd_engine.cpp
+++ b/src/engine/simd_engine/simd_engine.cpp
@@ -17,6 +17,10 @@
#include <botan/xtea_simd.h>
#endif
+#if defined(BOTAN_HAS_IDEA_SSE2)
+ #include <botan/idea_sse2.h>
+#endif
+
#if defined(BOTAN_HAS_SHA1_SSE2)
#include <botan/sha1_sse2.h>
#endif
@@ -27,16 +31,18 @@ BlockCipher*
SIMD_Engine::find_block_cipher(const SCAN_Name& request,
Algorithm_Factory&) const
{
- if(!SIMD_32::enabled())
- return 0;
+#if defined(BOTAN_HAS_IDEA_SSE2)
+ if(request.algo_name() == "IDEA" && CPUID::has_sse2())
+ return new IDEA_SSE2;
+#endif
#if defined(BOTAN_HAS_SERPENT_SIMD)
- if(request.algo_name() == "Serpent")
+ if(request.algo_name() == "Serpent" && SIMD_32::enabled())
return new Serpent_SIMD;
#endif
#if defined(BOTAN_HAS_XTEA_SIMD)
- if(request.algo_name() == "XTEA")
+ if(request.algo_name() == "XTEA" && SIMD_32::enabled())
return new XTEA_SIMD;
#endif