gallivm: add smallfloat to float conversion not relying on cpu denorm handling

The previous code relied on cpu denorm support for converting small float formats (such as r11g11b10_float and r16_float) to floats; without it, denorms are flushed to zero. We worked around that in the llvmpipe blend code by reenabling denorms, but this did nothing for texture sampling. It would be possible to reenable denorms there too, but I'm not really a fan of messing with fpu flags (and, looking at some bug reports, it seems we can't actually do it reliably with llvm in any case). (Not to mention that if you actually have a lot of denorms in there, you can expect an order-of-magnitude slowdown on x86 cpus.) So instead use code which adjusts exponents etc. directly, hence not relying on cpu denorm support for the rescaling mul. (We still need the fpu flag handling as we can't do float-to-smallfloat without using cpu denorms, at least for now - I actually wanted to keep both the old and new code and use one or the other depending on where it's called from, but that didn't work out as the parameter would have to be passed through more layers than I'd like.) Reviewed-by: Zack Rusin <[email protected]> Reviewed-by: Si Chen <[email protected]>
author: Roland Scheidegger <[email protected]> 2014-02-20 03:09:17 +0100
committer: Roland Scheidegger <[email protected]> 2014-02-20 18:41:42 +0100
commit: b2b2a2c06c20f3ca592af6e96222deab67ea239c (patch)
tree: c9aa3cb32cbb94078779830dda2f26fc8ea33aee
parent: 0206f0b3d4923411036711d9e7b31e33cd793a4e (diff)
1 files changed, 65 insertions, 20 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
index f68a617b810..b87174e4a20 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
@@ -309,33 +309,78 @@ lp_build_smallfloat_to_float(struct gallivm_state *gallivm,
                                     ((1 << (mantissa_bits + exponent_bits)) - 1)
                                     << (23 - mantissa_bits));
    srcabs = lp_build_and(&i32_bld, src, maskabs);
-   srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
 
    /* now do the actual scaling */
    smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
                                          ((1 << exponent_bits) - 1) << 23);
    i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
-   /*
-    * magic number has exponent new exp bias + (new exp bias - old exp bias),
-    * mantissa is 0.
-    */
-   magic = lp_build_const_int_vec(gallivm, i32_type,
-                                  (255 - (1 << (exponent_bits - 1))) << 23);
-   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
 
-   /* adjust exponent and fix denorms */
-   res = lp_build_mul(&f32_bld, srcabs, magic);
+   if (0) {
+     /*
+      * Note that this code path, while simpler, will convert small
+      * float denorms to floats according to current cpu denorm mode, if
+      * denorms are disabled it will flush them to zero!
+      * If cpu denorms are enabled, it should be faster though as long as
+      * there's no denorms in the inputs, but if there are actually denorms
+      * it's likely to be an order of magnitude slower (on x86 cpus).
+      */
 
-   /*
-    * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
-    * so a simple "or" will do (because exp adjust will leave mantissa intact)
-    */
-   /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
-   smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
-   wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
-   res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
-   tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
-   res = lp_build_or(&i32_bld, tmp, res);
+      srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
+
+      /*
+       * magic number has exponent new exp bias + (new exp bias - old exp bias),
+       * mantissa is 0.
+       */
+      magic = lp_build_const_int_vec(gallivm, i32_type,
+                                     (255 - (1 << (exponent_bits - 1))) << 23);
+      magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+
+      /* adjust exponent and fix denorms */
+      res = lp_build_mul(&f32_bld, srcabs, magic);
+
+      /*
+       * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
+       * so a simple "or" will do (because exp adjust will leave mantissa intact)
+       */
+      /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
+      smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
+      wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
+      res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
+      tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
+      res = lp_build_or(&i32_bld, tmp, res);
+   }
+
+   else {
+      LLVMValueRef exp_one, isdenorm, denorm, normal, exp_adj;
+
+      /* denorm (or zero) if exponent is zero */
+      exp_one = lp_build_const_int_vec(gallivm, i32_type, 1 << 23);
+      isdenorm = lp_build_cmp(&i32_bld, PIPE_FUNC_LESS, srcabs, exp_one);
+
+      /* inf or nan if exponent is max */
+      wasinfnan = lp_build_cmp(&i32_bld, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
+
+      /* for denormal (or zero), add (== or) magic exp to mantissa (== srcabs) (as int)
+       * then subtract it (as float).
+       * Another option would be to just do inttofp then do a rescale mul.
+       */
+      magic = lp_build_const_int_vec(gallivm, i32_type,
+                                     (127 - ((1 << (exponent_bits - 1)) - 2)) << 23);
+      denorm = lp_build_or(&i32_bld, srcabs, magic);
+      denorm = LLVMBuildBitCast(builder, denorm, f32_bld.vec_type, "");
+      denorm = lp_build_sub(&f32_bld, denorm,
+                            LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""));
+      denorm = LLVMBuildBitCast(builder, denorm, i32_bld.vec_type, "");
+
+      /* for normals, Infs, Nans fix up exponent */
+      exp_adj = lp_build_const_int_vec(gallivm, i32_type,
+                                      (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
+      normal = lp_build_add(&i32_bld, srcabs, exp_adj);
+      tmp = lp_build_and(&i32_bld, wasinfnan, i32_floatexpmask);
+      normal = lp_build_or(&i32_bld, tmp, normal);
+
+      res = lp_build_select(&i32_bld, isdenorm, denorm, normal);
+   }
 
    if (has_sign) {
       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
author	Roland Scheidegger <[email protected]>	2014-02-20 03:09:17 +0100
committer	Roland Scheidegger <[email protected]>	2014-02-20 18:41:42 +0100
commit	b2b2a2c06c20f3ca592af6e96222deab67ea239c (patch)
tree	c9aa3cb32cbb94078779830dda2f26fc8ea33aee
parent	0206f0b3d4923411036711d9e7b31e33cd793a4e (diff)