2 files changed, 76 insertions, 15 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7878544436d..d23ff0bf996 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -60,6 +60,7 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
+#include "float.h"
 
 #define EXP_POLY_DEGREE 5
 
@@ -1953,8 +1954,7 @@ lp_build_rcp(struct lp_build_context *bld,
  *
  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
  *
- * See also:
- * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ * See also Intel 64 and IA-32 Architectures Optimization Manual.
  */
 static INLINE LLVMValueRef
 lp_build_rsqrt_refine(struct lp_build_context *bld,
@@ -1977,7 +1977,8 @@ lp_build_rsqrt_refine(struct lp_build_context *bld,
 
 
 /**
- * Generate 1/sqrt(a)
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
  */
 LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
@@ -1990,8 +1991,11 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+   /*
+    * This should be faster but all denormals will end up as infinity.
+    */
+   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
       const unsigned num_iterations = 1;
       LLVMValueRef res;
       unsigned i;
@@ -2003,12 +2007,41 @@ lp_build_rsqrt(struct lp_build_context *bld,
       else {
          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
       }
+      if (num_iterations) {
+         /*
+          * Newton-Raphson will result in NaN instead of infinity for zero,
+          * and NaN instead of zero for infinity.
+          * Also, need to ensure rsqrt(1.0) == 1.0.
+          * All numbers smaller than FLT_MIN will result in +infinity
+          * (rsqrtps treats all denormals as zero).
+          */
+         /*
+          * Certain non-c99 compilers don't know INFINITY and might not support
+          * hacks to evaluate it at compile time neither.
+          */
+         const unsigned posinf_int = 0x7F800000;
+         LLVMValueRef cmp;
+         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
+         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
 
-      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
 
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+
+         for (i = 0; i < num_iterations; ++i) {
+            res = lp_build_rsqrt_refine(bld, a, res);
+         }
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
+         res = lp_build_select(bld, cmp, inf, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
+         res = lp_build_select(bld, cmp, bld->zero, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
+         res = lp_build_select(bld, cmp, bld->one, res);
+      }
+      else {
+         /* rsqrt(1.0) != 1.0 here */
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
 
-      for (i = 0; i < num_iterations; ++i) {
-         res = lp_build_rsqrt_refine(bld, a, res);
       }
 
       return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
index 6e09f7e67b0..99928b8ab6e 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -150,19 +150,42 @@ const float log2_values[] = {
 };
 
 
+static float rcpf(float x)
+{
+   return 1.0/x;
+}
+
+
+const float rcp_values[] = {
+   -0.0, 0.0,
+   -1.0, 1.0,
+   -1e-007, 1e-007,
+   -4.0, 4.0,
+   -1e+035, -100000,
+   100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY, -INFINITY,
+#endif
+};
+
+
 static float rsqrtf(float x)
 {
-   return 1.0/sqrt(x);
+   return 1.0/(float)sqrt(x);
 }
 
 
 const float rsqrt_values[] = {
-   -1, -1e-007,
-   1e-007, 1,
-   -4, -1,
-   1, 4,
-   -1e+035, -100000,
+   // http://msdn.microsoft.com/en-us/library/windows/desktop/bb147346.aspx
+   0.0, // must yield infinity
+   1.0, // must yield 1.0
+   1e-007, 4.0,
    100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY,
+#endif
 };
 
 
@@ -231,6 +254,7 @@ unary_tests[] = {
    {"log2", &lp_build_log2, &log2f, log2_values, Elements(log2_values), 20.0 },
    {"exp", &lp_build_exp, &expf, exp2_values, Elements(exp2_values), 18.0 },
    {"log", &lp_build_log, &logf, log2_values, Elements(log2_values), 20.0 },
+   {"rcp", &lp_build_rcp, &rcpf, rcp_values, Elements(rcp_values), 20.0 },
    {"rsqrt", &lp_build_rsqrt, &rsqrtf, rsqrt_values, Elements(rsqrt_values), 20.0 },
    {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
    {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
@@ -330,7 +354,11 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test)
          double error, precision;
          bool pass;
 
-         error = fabs(out[i] - ref);
+         if (util_inf_sign(ref) && util_inf_sign(out[i]) == util_inf_sign(ref)) {
+            error = 0;
+         } else {
+            error = fabs(out[i] - ref);
+         }
          precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
 
          pass = precision >= test->precision;