From 92a851901ea42398c221a608348d1336b5529b09 Mon Sep 17 00:00:00 2001 From: lloyd Date: Wed, 23 Dec 2009 16:51:24 +0000 Subject: Add last nights project, an SSE2 implementation of IDEA. Right about 4x faster than the scalar version on a Core2. --- checks/validate.dat | 17 +++ doc/log.txt | 5 +- src/block/idea/idea.cpp | 78 ++++------- src/block/idea/idea.h | 2 +- src/block/idea_sse2/idea_sse2.cpp | 227 +++++++++++++++++++++++++++++++++ src/block/idea_sse2/idea_sse2.h | 29 +++++ src/block/idea_sse2/info.txt | 7 + src/engine/simd_engine/simd_engine.cpp | 14 +- 8 files changed, 320 insertions(+), 59 deletions(-) create mode 100644 src/block/idea_sse2/idea_sse2.cpp create mode 100644 src/block/idea_sse2/idea_sse2.h create mode 100644 src/block/idea_sse2/info.txt diff --git a/checks/validate.dat b/checks/validate.dat index ff2ed2f4b..9d319eaf3 100644 --- a/checks/validate.dat +++ b/checks/validate.dat @@ -4799,6 +4799,12 @@ D5D5D5D5D5D5D5D5:75F7C7005EA47839:D5D5D5D5D5D5D5D5D5D5D5D5D5D5D5D5 0000000000000000:AA553A5DEC50E4A4:00000000000000040000000000000000 0000000000000001:0013FFF500120009:00000000000000000000000000000000 +000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\ +FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328:\ +11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\ +85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09:\ +00010002000300040005000600070008 + # Test vectors taken from ARIB STD-T63-35.203 V6.0.0 [KASUMI] EA024714AD5C4D84:DF1F9B251C0BF45F:2BD6459F82C5B300952C49104881FF48 @@ -23023,6 +23029,17 @@ AAE1BA501274C49A7A7EC67D7577114B7707DAB9D066AF086C09E7DD4116CEA6\ EE25DA9A65EF05A31ED0BDF56D525EC8968D1D01AF7165C5AEAC76BD367A575A:\ 00000000000000000000000000000000 +[IDEA/ECB/NoPadding] +000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\ +FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328\ +000000010002000301020304050607080019324B647D96AFF5202D5B9C671B08\ 
+FAE6D2BEAA96826E0A141E28323C4650050A0F14191E2328050A0F14191E2328:\ +11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\ +85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09\ +11FBED2B01986DE5540E5FEA18C2F8B19F0A0AB6E10CED78CF18FD7355E2C5C5\ +85DF52005608193D2F7DE750212FB7347B7314925DE59C097B7314925DE59C09:\ +00010002000300040005000600070008 + [DES/ECB/NoPadding] 059B5E0851CF143A:86A560F10EC6D85B:0113B970FD34F2CE 4E6F772069732074:3FA40E8A984D4815:0123456789ABCDEF diff --git a/doc/log.txt b/doc/log.txt index c2eda5092..c150e6f66 100644 --- a/doc/log.txt +++ b/doc/log.txt @@ -1,11 +1,12 @@ * 1.9.4-dev, ????-??-?? + - Add SSE2 implementation of IDEA + - Add support for Win32 high resolution system timers + - Remove Timer class entirely - New option --gen-amalgamation for creating a SQLite-style amalgamation - Many headers are now explicitly internal-use-only and are not installed - Greatly improve the Win32 installer - Several fixes for Visual C++ debug builds - - Add support for Win32 high resolution timers - - Remove Timer class entirely * 1.9.3, 2009-11-19 - Add new AES implementation using Intel's AES instruction intrinsics diff --git a/src/block/idea/idea.cpp b/src/block/idea/idea.cpp index fb5fe83f1..15ff7c0ec 100644 --- a/src/block/idea/idea.cpp +++ b/src/block/idea/idea.cpp @@ -55,13 +55,13 @@ u16bit mul_inv(u16bit x) return (1 - t0); } -} - -/* -* IDEA Encryption +/** +* IDEA is involutional, depending only on the key schedule */ -void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const +void idea_op(const byte in[], byte out[], u32bit blocks, const u16bit K[52]) { + const u32bit BLOCK_SIZE = 8; + for(u32bit i = 0; i != blocks; ++i) { u16bit X1 = load_be(in, 0); @@ -71,16 +71,16 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const for(u32bit j = 0; j != 8; ++j) { - X1 = mul(X1, EK[6*j+0]); - X2 += EK[6*j+1]; - X3 += EK[6*j+2]; - X4 = mul(X4, EK[6*j+3]); + X1 = mul(X1, K[6*j+0]); + X2 += K[6*j+1]; + 
X3 += K[6*j+2]; + X4 = mul(X4, K[6*j+3]); u16bit T0 = X3; - X3 = mul(X3 ^ X1, EK[6*j+4]); + X3 = mul(X3 ^ X1, K[6*j+4]); u16bit T1 = X2; - X2 = mul((X2 ^ X4) + X3, EK[6*j+5]); + X2 = mul((X2 ^ X4) + X3, K[6*j+5]); X3 += X2; X1 ^= X2; @@ -89,10 +89,10 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const X3 ^= T1; } - X1 = mul(X1, EK[48]); - X2 += EK[50]; - X3 += EK[49]; - X4 = mul(X4, EK[51]); + X1 = mul(X1, K[48]); + X2 += K[50]; + X3 += K[49]; + X4 = mul(X4, K[51]); store_be(out, X1, X3, X2, X4); @@ -101,48 +101,22 @@ void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const } } +} + +/* +* IDEA Encryption +*/ +void IDEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const + { + idea_op(in, out, blocks, EK); + } + /* * IDEA Decryption */ void IDEA::decrypt_n(const byte in[], byte out[], u32bit blocks) const { - for(u32bit i = 0; i != blocks; ++i) - { - u16bit X1 = load_be<u16bit>(in, 0); - u16bit X2 = load_be<u16bit>(in, 1); - u16bit X3 = load_be<u16bit>(in, 2); - u16bit X4 = load_be<u16bit>(in, 3); - - for(u32bit j = 0; j != 8; ++j) - { - X1 = mul(X1, DK[6*j+0]); - X2 += DK[6*j+1]; - X3 += DK[6*j+2]; - X4 = mul(X4, DK[6*j+3]); - - u16bit T0 = X3; - X3 = mul(X3 ^ X1, DK[6*j+4]); - - u16bit T1 = X2; - X2 = mul((X2 ^ X4) + X3, DK[6*j+5]); - X3 += X2; - - X1 ^= X2; - X4 ^= X3; - X2 ^= T0; - X3 ^= T1; - } - - X1 = mul(X1, DK[48]); - X2 += DK[50]; - X3 += DK[49]; - X4 = mul(X4, DK[51]); - - store_be(out, X1, X3, X2, X4); - - in += BLOCK_SIZE; - out += BLOCK_SIZE; - } + idea_op(in, out, blocks, DK); } /* diff --git a/src/block/idea/idea.h b/src/block/idea/idea.h index c1a79f423..89ec117e3 100644 --- a/src/block/idea/idea.h +++ b/src/block/idea/idea.h @@ -26,7 +26,7 @@ class BOTAN_DLL IDEA : public BlockCipher BlockCipher* clone() const { return new IDEA; } IDEA() : BlockCipher(8, 16) {} - private: + protected: void key_schedule(const byte[], u32bit); SecureBuffer<u16bit, 52> EK, DK; }; diff --git a/src/block/idea_sse2/idea_sse2.cpp b/src/block/idea_sse2/idea_sse2.cpp new
file mode 100644 index 000000000..c00d13ee9 --- /dev/null +++ b/src/block/idea_sse2/idea_sse2.cpp @@ -0,0 +1,227 @@ +/* +* IDEA in SSE2 +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#include +#include +#include + +namespace Botan { + +namespace { + +inline __m128i mul(__m128i X, u16bit K_16) + { + const __m128i zeros = _mm_set1_epi16(0); + const __m128i ones = _mm_set1_epi16(1); + const __m128i high_bit = _mm_set1_epi16(0x8000); + + const __m128i K = _mm_set1_epi16(K_16); + + const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros); + const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros); + + const __m128i mul_lo = _mm_mullo_epi16(X, K); + const __m128i mul_hi = _mm_mulhi_epu16(X, K); + + __m128i T = _mm_sub_epi16(mul_lo, mul_hi); + + // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0 + const __m128i cmp = _mm_srli_epi16(_mm_cmpgt_epi16( + _mm_add_epi16(mul_hi, high_bit), + _mm_add_epi16(mul_lo, high_bit)), + 15); + + T = _mm_add_epi16(T, cmp); + + /* Selection: if X[i] is zero then assign 1-K + if K is zero then assign 1-X[i] + + Could if() off value of K_16 for the second, but this gives a + constant time implementation which is a nice bonus. + */ + + T = _mm_or_si128( + _mm_andnot_si128(X_is_zero, T), + _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero)); + + T = _mm_or_si128( + _mm_andnot_si128(K_is_zero, T), + _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero)); + + return T; + } + +/* +* 4x8 matrix transpose +* +* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in +* transpose_out doesn't need it. Something with the shuffle? 
Removing +* that extra unpack could easily save 3-4 cycles per block, and would +* also help a lot with register pressure on 32-bit x86 +*/ +void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) + { + __m128i T0 = _mm_unpackhi_epi32(B0, B1); + __m128i T1 = _mm_unpacklo_epi32(B0, B1); + __m128i T2 = _mm_unpackhi_epi32(B2, B3); + __m128i T3 = _mm_unpacklo_epi32(B2, B3); + + __m128i T4 = _mm_unpacklo_epi32(T0, T1); + __m128i T5 = _mm_unpackhi_epi32(T0, T1); + __m128i T6 = _mm_unpacklo_epi32(T2, T3); + __m128i T7 = _mm_unpackhi_epi32(T2, T3); + + T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2)); + T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2)); + T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2)); + T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2)); + + T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2)); + T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2)); + T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2)); + T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2)); + + T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); + T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); + T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); + T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); + + B0 = _mm_unpacklo_epi64(T0, T2); + B1 = _mm_unpackhi_epi64(T0, T2); + B2 = _mm_unpacklo_epi64(T1, T3); + B3 = _mm_unpackhi_epi64(T1, T3); + } + +/* +* 4x8 matrix transpose (reverse) +*/ +void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) + { + __m128i T0 = _mm_unpacklo_epi64(B0, B1); + __m128i T1 = _mm_unpacklo_epi64(B2, B3); + __m128i T2 = _mm_unpackhi_epi64(B0, B1); + __m128i T3 = _mm_unpackhi_epi64(B2, B3); + + T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); + T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); + T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); + T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); + + T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); + T1 = _mm_shufflehi_epi16(T1, 
_MM_SHUFFLE(3, 1, 2, 0)); + T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); + T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); + + T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); + T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); + T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); + T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); + + B0 = _mm_unpacklo_epi32(T0, T1); + B1 = _mm_unpackhi_epi32(T0, T1); + B2 = _mm_unpacklo_epi32(T2, T3); + B3 = _mm_unpackhi_epi32(T2, T3); + } + +/* +* IDEA encryption/decryption in SSE2 +*/ +void idea_op_8(const byte in[64], byte out[64], const u16bit EK[52]) + { + __m128i B0 = _mm_loadu_si128((const __m128i*)in); + __m128i B1 = _mm_loadu_si128((const __m128i*)in + 1); + __m128i B2 = _mm_loadu_si128((const __m128i*)in + 2); + __m128i B3 = _mm_loadu_si128((const __m128i*)in + 3); + + transpose_in(B0, B1, B2, B3); + + // byte swap + B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); + B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8)); + B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); + B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); + + for(u32bit i = 0; i != 8; ++i) + { + B0 = mul(B0, EK[6*i+0]); + B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1])); + B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2])); + B3 = mul(B3, EK[6*i+3]); + + __m128i T0 = B2; + + B2 = _mm_xor_si128(B2, B0); + B2 = mul(B2, EK[6*i+4]); + + __m128i T1 = B1; + + B1 = _mm_xor_si128(B1, B3); + B1 = _mm_add_epi16(B1, B2); + B1 = mul(B1, EK[6*i+5]); + + B2 = _mm_add_epi16(B2, B1); + + B0 = _mm_xor_si128(B0, B1); + B1 = _mm_xor_si128(B1, T0); + B3 = _mm_xor_si128(B3, B2); + B2 = _mm_xor_si128(B2, T1); + } + + B0 = mul(B0, EK[48]); + B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50])); + B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49])); + B3 = mul(B3, EK[51]); + + // byte swap + B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); + B1 = _mm_or_si128(_mm_slli_epi16(B1, 
8), _mm_srli_epi16(B1, 8)); + B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); + B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); + + transpose_out(B0, B2, B1, B3); + + _mm_storeu_si128((__m128i*)out, B0); + _mm_storeu_si128((__m128i*)out + 1, B2); + _mm_storeu_si128((__m128i*)out + 2, B1); + _mm_storeu_si128((__m128i*)out + 3, B3); + } + +} + +/* +* IDEA Encryption +*/ +void IDEA_SSE2::encrypt_n(const byte in[], byte out[], u32bit blocks) const + { + while(blocks >= 8) + { + idea_op_8(in, out, this->EK); + in += 8 * BLOCK_SIZE; + out += 8 * BLOCK_SIZE; + blocks -= 8; + } + + IDEA::encrypt_n(in, out, blocks); + } + +/* +* IDEA Decryption +*/ +void IDEA_SSE2::decrypt_n(const byte in[], byte out[], u32bit blocks) const + { + while(blocks >= 8) + { + idea_op_8(in, out, this->DK); + in += 8 * BLOCK_SIZE; + out += 8 * BLOCK_SIZE; + blocks -= 8; + } + + IDEA::decrypt_n(in, out, blocks); + } + +} diff --git a/src/block/idea_sse2/idea_sse2.h b/src/block/idea_sse2/idea_sse2.h new file mode 100644 index 000000000..167c981f8 --- /dev/null +++ b/src/block/idea_sse2/idea_sse2.h @@ -0,0 +1,29 @@ +/* +* IDEA in SSE2 +* (C) 2009 Jack Lloyd +* +* Distributed under the terms of the Botan license +*/ + +#ifndef BOTAN_IDEA_SSE2_H__ +#define BOTAN_IDEA_SSE2_H__ + +#include <botan/idea.h> + +namespace Botan { + +/* +* IDEA in SSE2 +*/ +class BOTAN_DLL IDEA_SSE2 : public IDEA + { + public: + void encrypt_n(const byte in[], byte out[], u32bit blocks) const; + void decrypt_n(const byte in[], byte out[], u32bit blocks) const; + + BlockCipher* clone() const { return new IDEA_SSE2; } + }; + +} + +#endif diff --git a/src/block/idea_sse2/info.txt b/src/block/idea_sse2/info.txt new file mode 100644 index 000000000..fe09d3ee5 --- /dev/null +++ b/src/block/idea_sse2/info.txt @@ -0,0 +1,7 @@ +define IDEA_SSE2 + +need_isa sse2 + +<requires> +idea +</requires> diff --git a/src/engine/simd_engine/simd_engine.cpp b/src/engine/simd_engine/simd_engine.cpp index 892221f22..b8ebd6a80 100644 ---
a/src/engine/simd_engine/simd_engine.cpp +++ b/src/engine/simd_engine/simd_engine.cpp @@ -17,6 +17,10 @@ #include <botan/xtea_simd.h> #endif +#if defined(BOTAN_HAS_IDEA_SSE2) + #include <botan/idea_sse2.h> +#endif + #if defined(BOTAN_HAS_SHA1_SSE2) #include <botan/sha1_sse2.h> #endif @@ -27,16 +31,18 @@ BlockCipher* SIMD_Engine::find_block_cipher(const SCAN_Name& request, Algorithm_Factory&) const { - if(!SIMD_32::enabled()) - return 0; +#if defined(BOTAN_HAS_IDEA_SSE2) + if(request.algo_name() == "IDEA" && CPUID::has_sse2()) + return new IDEA_SSE2; +#endif #if defined(BOTAN_HAS_SERPENT_SIMD) - if(request.algo_name() == "Serpent") + if(request.algo_name() == "Serpent" && SIMD_32::enabled()) return new Serpent_SIMD; #endif #if defined(BOTAN_HAS_XTEA_SIMD) - if(request.algo_name() == "XTEA") + if(request.algo_name() == "XTEA" && SIMD_32::enabled()) return new XTEA_SIMD; #endif -- cgit v1.2.3