gallivm: better support for fast rsqrt

We had to disable fast rsqrt before because it wasn't precise enough etc. However in situations when we know we're not going to need more precision we can still use a fast rsqrt (which can be several times faster than the quite expensive sqrt). Hence introduce a new helper which does exactly that - it is probably not useful calling it in some situations if there's no fast rsqrt available so make it queryable if it's available too. v2: use fast_rsqrt consistently instead of rsqrt_fast, fix indentation, let rsqrt use fast_rsqrt. Reviewed-by: Brian Paul <[email protected]> Reviewed-by: Jose Fonseca <[email protected]>
author: Roland Scheidegger <[email protected]> 2013-07-11 23:15:44 +0200
committer: Roland Scheidegger <[email protected]> 2013-07-13 18:42:17 +0200
commit: 9b8d97e5bf76219e84e4f4e9c90c16a543eb837d (patch)
tree: 19a50aa27f05b4fa8cd762656cb0eea6016ea401 /src
parent: 45574ab2e92f0bf74b18448baff49cb2eb5db620 (diff)
2 files changed, 63 insertions, 16 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index c006ac537c1..7d6fe04f50d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -2306,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
    /*
     * This should be faster but all denormals will end up as infinity.
     */
-   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
+   if (0 && lp_build_fast_rsqrt_available(type)) {
       const unsigned num_iterations = 1;
       LLVMValueRef res;
       unsigned i;
-      const char *intrinsic = NULL;
 
-      if (type.length == 4) {
-         intrinsic = "llvm.x86.sse.rsqrt.ps";
-      }
-      else {
-         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
-      }
+      /* rsqrt(1.0) != 1.0 here */
+      res = lp_build_fast_rsqrt(bld, a);
+
       if (num_iterations) {
          /*
           * Newton-Raphson will result in NaN instead of infinity for zero,
@@ -2338,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
 
-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
          for (i = 0; i < num_iterations; ++i) {
             res = lp_build_rsqrt_refine(bld, a, res);
          }
@@ -2350,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
          res = lp_build_select(bld, cmp, bld->one, res);
       }
-      else {
-         /* rsqrt(1.0) != 1.0 here */
-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
-      }
 
       return res;
    }
@@ -2362,6 +2350,58 @@ lp_build_rsqrt(struct lp_build_context *bld,
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
 
+/**
+ * If there's a fast (inaccurate) rsqrt instruction available
+ * (caller may want to avoid to call rsqrt_fast if it's not available,
+ * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
+ * unavailable it would result in sqrt/div/mul so obviously
+ * much better to just call sqrt, skipping both div and mul).
+ */
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type)
+{
+   assert(type.floating);
+
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+      return true;
+   }
+   return false;
+}
+
+
+/**
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
+ * Precision is limited, only ~10 bits guaranteed
+ * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
+ */
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   const struct lp_type type = bld->type;
+
+   assert(lp_check_value(type, a));
+
+   if (lp_build_fast_rsqrt_available(type)) {
+      const char *intrinsic = NULL;
+
+      if (type.length == 4) {
+         intrinsic = "llvm.x86.sse.rsqrt.ps";
+      }
+      else {
+         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+   }
+   else {
+      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
+   }
+   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
+}
+
 
 /**
  * Generate sin(a) using SSE2
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 966796c3c4d..920e339cda5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -231,6 +231,13 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
                LLVMValueRef a);
 
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type);
+
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
 LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
              LLVMValueRef a);
author	Roland Scheidegger <[email protected]>	2013-07-11 23:15:44 +0200
committer	Roland Scheidegger <[email protected]>	2013-07-13 18:42:17 +0200
commit	9b8d97e5bf76219e84e4f4e9c90c16a543eb837d (patch)
tree	19a50aa27f05b4fa8cd762656cb0eea6016ea401 /src
parent	45574ab2e92f0bf74b18448baff49cb2eb5db620 (diff)