gallium/auxiliary: optimize rgb9e5 helper some more

I initially used this as a testing ground for investigating some compiler behavior (e.g. lrint calls), then figured I could do much better in the end, just for fun. The result is mathematically equivalent, but uses some tricks to avoid doubles and also replaces some float math with integer math. This is good for roughly another doubling of performance. As a side note, some quick tests show that LLVM's loop vectorizer would be able to properly vectorize this version (which it failed to do earlier because of the doubles, producing a mess), giving another 3x performance increase with SSE2 (more with SSE4.1), but this may not apply to Mesa. No piglit change. Acked-by: Marek Olšák <[email protected]>
author: Roland Scheidegger <[email protected]> 2015-08-09 02:50:10 +0200
committer: Roland Scheidegger <[email protected]> 2015-08-26 02:57:38 +0200
commit: 48e6404c04da6c9655d7a8b625830d0d40f393ae (patch)
tree: d8924b77e5b3e15fb6efda8fca508d2e5ad29cf1 /src
parent: 941346a80323c9419b70e3987b900a69ebb08fb4 (diff)
1 files changed, 42 insertions, 45 deletions
diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index d11bfa833f1..21feba7b710 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -74,62 +74,59 @@ typedef union {
    } field;
 } rgb9e5;
 
-static inline float rgb9e5_ClampRange(float x)
-{
-   if (x > 0.0f) {
-      if (x >= MAX_RGB9E5) {
-         return MAX_RGB9E5;
-      } else {
-         return x;
-      }
-   } else {
-      /* NaN gets here too since comparisons with NaN always fail! */
-      return 0.0f;
-   }
-}
 
-/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
-   are going to do a max of this value with the minimum rgb9e5 exponent
-   that will hide these problem cases. */
-static inline int rgb9e5_FloorLog2(float x)
+static inline int rgb9e5_ClampRange(float x)
 {
    float754 f;
-
+   float754 max;
    f.value = x;
-   return (f.field.biasedexponent - 127);
+   max.value = MAX_RGB9E5;
+
+   if (f.raw > 0x7f800000)
+  /* catches neg, NaNs */
+      return 0;
+   else if (f.raw >= max.raw)
+      return max.raw;
+   else
+      return f.raw;
 }
 
 static inline unsigned float3_to_rgb9e5(const float rgb[3])
 {
    rgb9e5 retval;
-   float maxrgb;
-   int rm, gm, bm;
-   float rc, gc, bc;
-   int exp_shared, maxm;
+   int rm, gm, bm, exp_shared;
    float754 revdenom = {0};
-
-   rc = rgb9e5_ClampRange(rgb[0]);
-   gc = rgb9e5_ClampRange(rgb[1]);
-   bc = rgb9e5_ClampRange(rgb[2]);
-
-   maxrgb = MAX3(rc, gc, bc);
-   exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
+   float754 rc, bc, gc, maxrgb;
+
+   rc.raw = rgb9e5_ClampRange(rgb[0]);
+   gc.raw = rgb9e5_ClampRange(rgb[1]);
+   bc.raw = rgb9e5_ClampRange(rgb[2]);
+   maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
+
+   /*
+    * Compared to what the spec suggests, instead of conditionally adjusting
+    * the exponent after the fact do it here by doing the equivalent of +0.5 -
+    * the int add will spill over into the exponent in this case.
+    */
+   maxrgb.raw += maxrgb.raw & (1 << (23-9));
+   exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
+                1 + RGB9E5_EXP_BIAS - 127;
+   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
+                                          RGB9E5_MANTISSA_BITS) + 1;
    assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   assert(exp_shared >= 0);
-   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
-
-   maxm = (int) (maxrgb * revdenom.value + 0.5);
-   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
-      revdenom.value *= 0.5f;
-      exp_shared += 1;
-      assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   } else {
-      assert(maxm <= MAX_RGB9E5_MANTISSA);
-   }
-
-   rm = (int) (rc * revdenom.value + 0.5);
-   gm = (int) (gc * revdenom.value + 0.5);
-   bm = (int) (bc * revdenom.value + 0.5);
+
+   /*
+    * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
+    * must match what is done above for figuring out exponent).
+    * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding
+    * ourselves (revdenom was adjusted by +1, above).
+    */
+   rm = (int) (rc.value * revdenom.value);
+   gm = (int) (gc.value * revdenom.value);
+   bm = (int) (bc.value * revdenom.value);
+   rm = (rm & 1) + (rm >> 1);
+   gm = (gm & 1) + (gm >> 1);
+   bm = (bm & 1) + (bm >> 1);
 
    assert(rm <= MAX_RGB9E5_MANTISSA);
    assert(gm <= MAX_RGB9E5_MANTISSA);
author	Roland Scheidegger <[email protected]>	2015-08-09 02:50:10 +0200
committer	Roland Scheidegger <[email protected]>	2015-08-26 02:57:38 +0200
commit	48e6404c04da6c9655d7a8b625830d0d40f393ae (patch)
tree	d8924b77e5b3e15fb6efda8fca508d2e5ad29cf1 /src
parent	941346a80323c9419b70e3987b900a69ebb08fb4 (diff)