diff options
author | lloyd <[email protected]> | 2011-05-12 17:56:08 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2011-05-12 17:56:08 +0000 |
commit | ae7868f4985ce52cefeb5864715b53ecee78f740 (patch) | |
tree | fb33fad8be4cbc98be6ad604249aada897f345f1 | |
parent | 44aa55d0aeb96d452450df5c024db7275a560925 (diff) |
Fix the problem that prevented the SSE2 IDEA implementation from
working correctly under Clang - the technique for emulating unsigned
compare relied on signed overflow. The new method does not, and works
under GCC, ICC, and Clang. Even better, the compare takes only 2
instructions instead of 4.
Prevent using any of the asm implementations under Clang on x86-32.
All of them crash under Clang 2.9; the cause is not yet known.
-rw-r--r-- | doc/log.txt | 6 | ||||
-rw-r--r-- | src/block/idea_sse2/idea_sse2.cpp | 6 | ||||
-rw-r--r-- | src/utils/asm_x86_32/info.txt | 1 |
3 files changed, 7 insertions, 6 deletions
diff --git a/doc/log.txt b/doc/log.txt
index 988e4d71d..012a37041 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -15,6 +15,12 @@ Version 1.9.18, Not Yet Released
 * Add support for compiling SSL using Visual C++ 2010's TR1
   implementation.
 
+* The SSE2 implementation of IDEA did not work correctly when compiled
+  by Clang, because the trick it used to emulate a 16 bit unsigned
+  compare in SSE (which doesn't contain one natively) relied on signed
+  overflow working in the 'usual' way. A different method that doesn't
+  rely on signed overflow is now used.
+
 * Fix a bug under Visual C++ 2010 which would cause ``hex_encode`` to
   crash if given a zero-sized input to encode.
 
diff --git a/src/block/idea_sse2/idea_sse2.cpp b/src/block/idea_sse2/idea_sse2.cpp
index b92f51ac3..81b0fd9c1 100644
--- a/src/block/idea_sse2/idea_sse2.cpp
+++ b/src/block/idea_sse2/idea_sse2.cpp
@@ -16,7 +16,6 @@ inline __m128i mul(__m128i X, u16bit K_16)
    {
    const __m128i zeros = _mm_set1_epi16(0);
    const __m128i ones = _mm_set1_epi16(1);
-   const __m128i high_bit = _mm_set1_epi16(-32767); // 0x8000
 
    const __m128i K = _mm_set1_epi16(K_16);
 
@@ -29,10 +28,7 @@ inline __m128i mul(__m128i X, u16bit K_16)
    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
 
    // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
-   const __m128i cmp = _mm_srli_epi16(_mm_cmpgt_epi16(
-      _mm_add_epi16(mul_hi, high_bit),
-      _mm_add_epi16(mul_lo, high_bit)),
-      15);
+   const __m128i cmp = _mm_min_epu8(ones, _mm_subs_epu16(mul_hi, mul_lo));
 
    T = _mm_add_epi16(T, cmp);
 
diff --git a/src/utils/asm_x86_32/info.txt b/src/utils/asm_x86_32/info.txt
index 8534d9aef..21244968f 100644
--- a/src/utils/asm_x86_32/info.txt
+++ b/src/utils/asm_x86_32/info.txt
@@ -19,7 +19,6 @@ solaris
 </os>
 
 <cc>
-clang
 gcc
 icc
 </cc>