From ae7868f4985ce52cefeb5864715b53ecee78f740 Mon Sep 17 00:00:00 2001
From: lloyd <lloyd@randombit.net>
Date: Thu, 12 May 2011 17:56:08 +0000
Subject: Fix the problem that prevented the SSE2 IDEA implementation from
 working correctly under Clang - the technique for emulating unsigned compare
 relied on signed overflow. The new method does not, and works under GCC, ICC,
 and Clang. Even better, the compare takes only 2 instructions instead of 4.

Prevent using any of the asm implementations under Clang on x86-32.
All of them crash under Clang 2.9, unclear why.
---
 doc/log.txt                       | 6 ++++++
 src/block/idea_sse2/idea_sse2.cpp | 6 +-----
 src/utils/asm_x86_32/info.txt     | 1 -
 3 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/doc/log.txt b/doc/log.txt
index 988e4d71d..012a37041 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -15,6 +15,12 @@ Version 1.9.18, Not Yet Released
 * Add support for compiling SSL using Visual C++ 2010's TR1
   implementation.
 
+* The SSE2 implementation of IDEA did not work correctly when compiled
+  by Clang, because the trick it used to emulate a 16 bit unsigned
+  compare in SSE (which doesn't contain one natively) relied on signed
+  overflow working in the 'usual' way. A different method that doesn't
+  rely on signed overflow is now used.
+
 * Fix a bug under Visual C++ 2010 which would cause ``hex_encode`` to
   crash if given a zero-sized input to encode.
 
diff --git a/src/block/idea_sse2/idea_sse2.cpp b/src/block/idea_sse2/idea_sse2.cpp
index b92f51ac3..81b0fd9c1 100644
--- a/src/block/idea_sse2/idea_sse2.cpp
+++ b/src/block/idea_sse2/idea_sse2.cpp
@@ -16,7 +16,6 @@ inline __m128i mul(__m128i X, u16bit K_16)
    {
    const __m128i zeros = _mm_set1_epi16(0);
    const __m128i ones = _mm_set1_epi16(1);
-   const __m128i high_bit = _mm_set1_epi16(-32767); // 0x8000
 
    const __m128i K = _mm_set1_epi16(K_16);
 
@@ -29,10 +28,7 @@ inline __m128i mul(__m128i X, u16bit K_16)
    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
 
    // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
-   const __m128i cmp = _mm_srli_epi16(_mm_cmpgt_epi16(
-                                         _mm_add_epi16(mul_hi, high_bit),
-                                         _mm_add_epi16(mul_lo, high_bit)),
-                                      15);
+   const __m128i cmp = _mm_andnot_si128(_mm_cmpeq_epi16(_mm_subs_epu16(mul_hi, mul_lo), zeros), ones);
 
    T = _mm_add_epi16(T, cmp);
 
diff --git a/src/utils/asm_x86_32/info.txt b/src/utils/asm_x86_32/info.txt
index 8534d9aef..21244968f 100644
--- a/src/utils/asm_x86_32/info.txt
+++ b/src/utils/asm_x86_32/info.txt
@@ -19,7 +19,6 @@ solaris
 </os>
 
 <cc>
-clang
 gcc
 icc
 </cc>
-- 
cgit v1.2.3