soft-fp64: Optimize __fmin64 and __fmax64 by using different evaluation order [v2]

v2: Go to extra effort to avoid flow control inserted to implement short-circuit evaluation rules. Results on the 308 shaders extracted from the fp64 portion of the OpenGL CTS: Tiger Lake and Ice Lake had similar results. (Tiger Lake shown) total instructions in shared programs: 797779 -> 796849 (-0.12%) instructions in affected programs: 3499 -> 2569 (-26.58%) helped: 21 HURT: 0 helped stats (abs) min: 8 max: 112 x̄: 44.29 x̃: 44 helped stats (rel) min: 16.09% max: 33.15% x̄: 25.72% x̃: 24.62% 95% mean confidence interval for instructions value: -55.94 -32.63 95% mean confidence interval for instructions %-change: -28.14% -23.30% Instructions are helped. total cycles in shared programs: 6601355 -> 6588351 (-0.20%) cycles in affected programs: 25376 -> 12372 (-51.25%) helped: 21 HURT: 0 helped stats (abs) min: 156 max: 1410 x̄: 619.24 x̃: 526 helped stats (rel) min: 42.39% max: 53.98% x̄: 50.12% x̃: 50.75% 95% mean confidence interval for cycles value: -776.58 -461.89 95% mean confidence interval for cycles %-change: -51.57% -48.67% Cycles are helped. Reviewed-by: Jason Ekstrand <[email protected]> [v1] Reviewed-by: Matt Turner <[email protected]> [v1] Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4142>
author: Ian Romanick <[email protected]> 2020-03-03 10:21:18 -0800
committer: Marge Bot <[email protected]> 2020-03-18 20:36:29 +0000
commit: 5b07f542e5a4ab698becbc238ff2ccc4720418bf (patch)
tree: 589863ba9c02698061b4e98eb10845c20df1a321
parent: 617a69107ee58e23ace06093bc49fa2c86b7dd4b (diff)
1 files changed, 16 insertions, 8 deletions
diff --git a/src/compiler/glsl/float64.glsl b/src/compiler/glsl/float64.glsl
index 67e0e657c55..3cc8aa7fa4b 100644
--- a/src/compiler/glsl/float64.glsl
+++ b/src/compiler/glsl/float64.glsl
@@ -1784,21 +1784,29 @@ __fround64(uint64_t __a)
 uint64_t
 __fmin64(uint64_t a, uint64_t b)
 {
-   if (__is_nan(a)) return b;
-   if (__is_nan(b)) return a;
+   /* This weird layout matters.  Doing the "obvious" thing results in extra
+    * flow control being inserted to implement the short-circuit evaluation
+    * rules.  Flow control is bad!
+    */
+   bool b_nan = __is_nan(b);
+   bool a_lt_b = __flt64_nonnan(a, b);
+   bool a_nan = __is_nan(a);
 
-   if (__flt64_nonnan(a, b)) return a;
-   return b;
+   return (b_nan || a_lt_b) && !a_nan ? a : b;
 }
 
 uint64_t
 __fmax64(uint64_t a, uint64_t b)
 {
-   if (__is_nan(a)) return b;
-   if (__is_nan(b)) return a;
+   /* This weird layout matters.  Doing the "obvious" thing results in extra
+    * flow control being inserted to implement the short-circuit evaluation
+    * rules.  Flow control is bad!
+    */
+   bool b_nan = __is_nan(b);
+   bool a_lt_b = __flt64_nonnan(a, b);
+   bool a_nan = __is_nan(a);
 
-   if (__flt64_nonnan(a, b)) return b;
-   return a;
+   return (b_nan || a_lt_b) && !a_nan ? b : a;
 }
 
 uint64_t
author	Ian Romanick <[email protected]>	2020-03-03 10:21:18 -0800
committer	Marge Bot <[email protected]>	2020-03-18 20:36:29 +0000
commit	5b07f542e5a4ab698becbc238ff2ccc4720418bf (patch)
tree	589863ba9c02698061b4e98eb10845c20df1a321
parent	617a69107ee58e23ace06093bc49fa2c86b7dd4b (diff)