gallivm: Emit DIVPS instead of RCPPS.

See comments for detailed rationale. Thanks to Michal Krol and Zack Rusin for detecting and investigating this in detail.
author: José Fonseca <[email protected]> 2010-08-21 21:58:22 +0100
committer: José Fonseca <[email protected]> 2010-08-21 21:58:22 +0100
commit: 0d96cbe4a5e0a39c17db007f3815868c6a766382 (patch)
tree: c554638d8c0444d71beb2485e3d2a9f09f3ec3a8
parent: 42210f44645c154983705a2caea6af8fbdafbc12 (diff)
1 files changed, 24 insertions, 12 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7b35dd4bb49..bb30e6e9dfb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -59,14 +59,6 @@
 #include "lp_bld_arit.h"
 
 
-/*
- * XXX: Increasing eliminates some artifacts, but adds others, most
- * noticeably corruption in the Earth halo in Google Earth.
- */
-#define RCP_NEWTON_STEPS 0
-
-#define RSQRT_NEWTON_STEPS 0
-
 #define EXP_POLY_DEGREE 3
 
 #define LOG_POLY_DEGREE 5
@@ -1266,6 +1258,11 @@ lp_build_sqrt(struct lp_build_context *bld,
  *
  *   x_{i+1} = x_i * (2 - a * x_i)
  *
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
+ * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * halo. It would be necessary to clamp the argument to prevent this.
+ *
  * See also:
  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
@@ -1306,13 +1303,27 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   /*
+    * We don't use RCPPS because:
+    * - it only has 10bits of precision
+    * - it doesn't even get the reciprocate of 1.0 exactly
+    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
+    * - for recent processors the benefit over DIVPS is marginal, a case
+    *   depedent
+    *
+    * We could still use it on certain processors if benchmarks show that the
+    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
+    * particular uses that require less workarounds.
+    */
+
+   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
 
-      for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rcp_refine(bld, a, res);
       }
 
@@ -1363,13 +1374,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
 
-      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rsqrt_refine(bld, a, res);
       }
author	José Fonseca <[email protected]>	2010-08-21 21:58:22 +0100
committer	José Fonseca <[email protected]>	2010-08-21 21:58:22 +0100
commit	0d96cbe4a5e0a39c17db007f3815868c6a766382 (patch)
tree	c554638d8c0444d71beb2485e3d2a9f09f3ec3a8
parent	42210f44645c154983705a2caea6af8fbdafbc12 (diff)