gallivm: fix rsqrt failures

lp_build_rsqrt initially did not do any newton-raphson step. This meant that precision was only ~11 bits, but this handled both input 0.0 and +infinity correctly. It did not however handle input 1.0 accurately, and denormals always generated infinity result. Doing a newton-raphson step increased precision significantly (but notably input 1.0 still doesn't give output 1.0), however this fails for inputs 0.0 and infinity (both result in NaNs). Try to fix this up by using cmp/select but since this is all quite fishy (and still doesn't handle denormals) disable for now. Note that even with workarounds it should still have been faster since the fallback uses sqrt/div (which both use the usually unpipelined and slow divider hw). Also add some more test values to lp_test_arit and test lp_build_rcp() too while there. v2: based on José's feedback, avoid hacky infinity definition which doesn't work with msvc (unfortunately using INFINITY won't cut it neither on non-c99 compilers) in lp_build_rsqrt, and while here fix up the input infinity case too (it's disabled anyway). Only test infinity input case if we have c99, and use float cast for calculating reference rsqrt value so we really get what we expect. Reviewed-by: José Fonseca <jfonseca@vmware.com>
author: Roland Scheidegger <sroland@vmware.com> 2012-09-21 17:03:48 +0200
committer: José Fonseca <jfonseca@vmware.com> 2012-10-12 18:51:18 +0100
commit: d366520e8553f4a16151ee946d6e8136cab3de5e (patch)
tree: c2f1df1a2710f2c1e4157a40a9a9623d85912bfa /src
parent: 2a4105cbc067dcd38057a877d606e9493e9ed53a (diff)
2 files changed, 76 insertions, 15 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7878544436d..d23ff0bf996 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -60,6 +60,7 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
+#include "float.h"
 
 #define EXP_POLY_DEGREE 5
 
@@ -1953,8 +1954,7 @@ lp_build_rcp(struct lp_build_context *bld,
  *
  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
  *
- * See also:
- * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ * See also Intel 64 and IA-32 Architectures Optimization Manual.
  */
 static INLINE LLVMValueRef
 lp_build_rsqrt_refine(struct lp_build_context *bld,
@@ -1977,7 +1977,8 @@ lp_build_rsqrt_refine(struct lp_build_context *bld,
 
 
 /**
- * Generate 1/sqrt(a)
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
  */
 LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
@@ -1990,8 +1991,11 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+   /*
+    * This should be faster but all denormals will end up as infinity.
+    */
+   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
       const unsigned num_iterations = 1;
       LLVMValueRef res;
       unsigned i;
@@ -2003,12 +2007,41 @@ lp_build_rsqrt(struct lp_build_context *bld,
       else {
          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
       }
+      if (num_iterations) {
+         /*
+          * Newton-Raphson will result in NaN instead of infinity for zero,
+          * and NaN instead of zero for infinity.
+          * Also, need to ensure rsqrt(1.0) == 1.0.
+          * All numbers smaller than FLT_MIN will result in +infinity
+          * (rsqrtps treats all denormals as zero).
+          */
+         /*
+          * Certain non-c99 compilers don't know INFINITY and might not support
+          * hacks to evaluate it at compile time neither.
+          */
+         const unsigned posinf_int = 0x7F800000;
+         LLVMValueRef cmp;
+         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
+         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
 
-      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
 
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+
+         for (i = 0; i < num_iterations; ++i) {
+            res = lp_build_rsqrt_refine(bld, a, res);
+         }
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
+         res = lp_build_select(bld, cmp, inf, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
+         res = lp_build_select(bld, cmp, bld->zero, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
+         res = lp_build_select(bld, cmp, bld->one, res);
+      }
+      else {
+         /* rsqrt(1.0) != 1.0 here */
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
 
-      for (i = 0; i < num_iterations; ++i) {
-         res = lp_build_rsqrt_refine(bld, a, res);
       }
 
       return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
index 6e09f7e67b0..99928b8ab6e 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -150,19 +150,42 @@ const float log2_values[] = {
 };
 
 
+static float rcpf(float x)
+{
+   return 1.0/x;
+}
+
+
+const float rcp_values[] = {
+   -0.0, 0.0,
+   -1.0, 1.0,
+   -1e-007, 1e-007,
+   -4.0, 4.0,
+   -1e+035, -100000,
+   100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY, -INFINITY,
+#endif
+};
+
+
 static float rsqrtf(float x)
 {
-   return 1.0/sqrt(x);
+   return 1.0/(float)sqrt(x);
 }
 
 
 const float rsqrt_values[] = {
-   -1, -1e-007,
-   1e-007, 1,
-   -4, -1,
-   1, 4,
-   -1e+035, -100000,
+   // http://msdn.microsoft.com/en-us/library/windows/desktop/bb147346.aspx
+   0.0, // must yield infinity
+   1.0, // must yield 1.0
+   1e-007, 4.0,
    100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY,
+#endif
 };
 
 
@@ -231,6 +254,7 @@ unary_tests[] = {
    {"log2", &lp_build_log2, &log2f, log2_values, Elements(log2_values), 20.0 },
    {"exp", &lp_build_exp, &expf, exp2_values, Elements(exp2_values), 18.0 },
    {"log", &lp_build_log, &logf, log2_values, Elements(log2_values), 20.0 },
+   {"rcp", &lp_build_rcp, &rcpf, rcp_values, Elements(rcp_values), 20.0 },
    {"rsqrt", &lp_build_rsqrt, &rsqrtf, rsqrt_values, Elements(rsqrt_values), 20.0 },
    {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
    {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
@@ -330,7 +354,11 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test)
          double error, precision;
          bool pass;
 
-         error = fabs(out[i] - ref);
+         if (util_inf_sign(ref) && util_inf_sign(out[i]) == util_inf_sign(ref)) {
+            error = 0;
+         } else {
+            error = fabs(out[i] - ref);
+         }
          precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
 
          pass = precision >= test->precision;
author	Roland Scheidegger <sroland@vmware.com>	2012-09-21 17:03:48 +0200
committer	José Fonseca <jfonseca@vmware.com>	2012-10-12 18:51:18 +0100
commit	d366520e8553f4a16151ee946d6e8136cab3de5e (patch)
tree	c2f1df1a2710f2c1e4157a40a9a9623d85912bfa /src
parent	2a4105cbc067dcd38057a877d606e9493e9ed53a (diff)