diff options
author | Roland Scheidegger <[email protected]> | 2013-07-11 23:15:44 +0200 |
---|---|---|
committer | Roland Scheidegger <[email protected]> | 2013-07-13 18:42:17 +0200 |
commit | 9b8d97e5bf76219e84e4f4e9c90c16a543eb837d (patch) | |
tree | 19a50aa27f05b4fa8cd762656cb0eea6016ea401 /src/gallium/auxiliary/gallivm/lp_bld_arit.c | |
parent | 45574ab2e92f0bf74b18448baff49cb2eb5db620 (diff) |
gallivm: better support for fast rsqrt
We had to disable fast rsqrt before because, among other things, it wasn't
precise enough. However, in situations where we know we're not going to need
more precision we can still use a fast rsqrt (which can be several times
faster than the quite expensive sqrt). Hence introduce a new helper which does
exactly that. Calling it is probably not useful in some situations if there's
no fast rsqrt available, so also make its availability queryable.
v2: use fast_rsqrt consistently instead of rsqrt_fast, fix indentation,
let rsqrt use fast_rsqrt.
Reviewed-by: Brian Paul <[email protected]>
Reviewed-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_arit.c')
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_arit.c | 72 |
1 files changed, 56 insertions, 16 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index c006ac537c1..7d6fe04f50d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -2306,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld, /* * This should be faster but all denormals will end up as infinity. */ - if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || - (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) { + if (0 && lp_build_fast_rsqrt_available(type)) { const unsigned num_iterations = 1; LLVMValueRef res; unsigned i; - const char *intrinsic = NULL; - if (type.length == 4) { - intrinsic = "llvm.x86.sse.rsqrt.ps"; - } - else { - intrinsic = "llvm.x86.avx.rsqrt.ps.256"; - } + /* rsqrt(1.0) != 1.0 here */ + res = lp_build_fast_rsqrt(bld, a); + if (num_iterations) { /* * Newton-Raphson will result in NaN instead of infinity for zero, @@ -2338,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld, inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), ""); - res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); - for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); } @@ -2350,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld, cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one); res = lp_build_select(bld, cmp, bld->one, res); } - else { - /* rsqrt(1.0) != 1.0 here */ - res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); - - } return res; } @@ -2362,6 +2350,58 @@ lp_build_rsqrt(struct lp_build_context *bld, return lp_build_rcp(bld, lp_build_sqrt(bld, a)); } +/** + * Query whether there's a fast (inaccurate) rsqrt instruction available + * (caller may want to avoid calling fast_rsqrt if it's not available, + * i.e. 
for calculating x^0.5 it may do fast_rsqrt(x) * x but if + * unavailable it would result in sqrt/div/mul so obviously + * much better to just call sqrt, skipping both div and mul). + * Returns true only for 4 x 32-bit float vectors when SSE is present, + * or 8 x 32-bit float vectors when AVX is present. + */ +boolean +lp_build_fast_rsqrt_available(struct lp_type type) +{ + assert(type.floating); + + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { + return true; + } + return false; +} + + +/** + * Generate 1/sqrt(a). + * Result is undefined for values < 0, infinity for +0. + * Precision is limited, only ~10 bits guaranteed + * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). + * If lp_build_fast_rsqrt_available() is false for the type, this + * falls back to rcp(sqrt(a)) and emits a debug message. + */ +LLVMValueRef +lp_build_fast_rsqrt(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + const struct lp_type type = bld->type; + + assert(lp_check_value(type, a)); + + if (lp_build_fast_rsqrt_available(type)) { + const char *intrinsic = NULL; + + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rsqrt.ps"; + } + else { + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; + } + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } + else { + debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__); + } + return lp_build_rcp(bld, lp_build_sqrt(bld, a)); +} + /** * Generate sin(a) using SSE2 |