From 48e6404c04da6c9655d7a8b625830d0d40f393ae Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Sun, 9 Aug 2015 02:50:10 +0200 Subject: gallium/auxiliary: optimize rgb9e5 helper some more MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I used this as some testing ground for investigating some compiler bits initially (e.g. lrint calls etc.), figured I could do much better in the end just for fun... This is mathematically equivalent, but uses some tricks to avoid doubles and also replaces some float math with ints. Good for another performance doubling or so. As a side note, some quick tests show that llvm's loop vectorizer would be able to properly vectorize this version (which it failed to do earlier due to doubles, producing a mess), giving another 3 times performance increase with sse2 (more with sse4.1), but this may not apply to mesa. No piglit change. Acked-by: Marek Olšák --- src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 45 deletions(-) (limited to 'src') diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h index d11bfa833f1..21feba7b710 100644 --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h @@ -74,62 +74,59 @@ typedef union { } field; } rgb9e5; -static inline float rgb9e5_ClampRange(float x) -{ - if (x > 0.0f) { - if (x >= MAX_RGB9E5) { - return MAX_RGB9E5; - } else { - return x; - } - } else { - /* NaN gets here too since comparisons with NaN always fail! */ - return 0.0f; - } -} -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we - are going to do a max of this value with the minimum rgb9e5 exponent - that will hide these problem cases. */ -static inline int rgb9e5_FloorLog2(float x) +static inline int rgb9e5_ClampRange(float x) { float754 f; - + float754 max; f.value = x; - return (f.field.biasedexponent - 127); + max.value = MAX_RGB9E5; + + if (f.raw > 0x7f800000) + /* catches neg, NaNs */ + return 0; + else if (f.raw >= max.raw) + return max.raw; + else + return f.raw; } static inline unsigned float3_to_rgb9e5(const float rgb[3]) { rgb9e5 retval; - float maxrgb; - int rm, gm, bm; - float rc, gc, bc; - int exp_shared, maxm; + int rm, gm, bm, exp_shared; float754 revdenom = {0}; - - rc = rgb9e5_ClampRange(rgb[0]); - gc = rgb9e5_ClampRange(rgb[1]); - bc = rgb9e5_ClampRange(rgb[2]); - - maxrgb = MAX3(rc, gc, bc); - exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS; + float754 rc, bc, gc, maxrgb; + + rc.raw = rgb9e5_ClampRange(rgb[0]); + gc.raw = rgb9e5_ClampRange(rgb[1]); + bc.raw = rgb9e5_ClampRange(rgb[2]); + maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw); + + /* + * Compared to what the spec suggests, instead of conditionally adjusting + * the exponent after the fact do it here by doing the equivalent of +0.5 - + * the int add will spill over into the exponent in this case. + */ + maxrgb.raw += maxrgb.raw & (1 << (23-9)); + exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) + + 1 + RGB9E5_EXP_BIAS - 127; + revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - + RGB9E5_MANTISSA_BITS) + 1; assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); - assert(exp_shared >= 0); - revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS); - - maxm = (int) (maxrgb * revdenom.value + 0.5); - if (maxm == MAX_RGB9E5_MANTISSA + 1) { - revdenom.value *= 0.5f; - exp_shared += 1; - assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); - } else { - assert(maxm <= MAX_RGB9E5_MANTISSA); - } - - rm = (int) (rc * revdenom.value + 0.5); - gm = (int) (gc * revdenom.value + 0.5); - bm = (int) (bc * revdenom.value + 0.5); + + /* + * The spec uses strict round-up behavior (d3d10 disagrees, but in any case + * must match what is done above for figuring out exponent). + * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding + * ourselves (revdenom was adjusted by +1, above). + */ + rm = (int) (rc.value * revdenom.value); + gm = (int) (gc.value * revdenom.value); + bm = (int) (bc.value * revdenom.value); + rm = (rm & 1) + (rm >> 1); + gm = (gm & 1) + (gm >> 1); + bm = (bm & 1) + (bm >> 1); assert(rm <= MAX_RGB9E5_MANTISSA); assert(gm <= MAX_RGB9E5_MANTISSA); -- cgit v1.2.3