diff options
author | Ian Romanick <[email protected]> | 2018-08-18 16:49:48 -0700 |
---|---|---|
committer | Ian Romanick <[email protected]> | 2019-05-06 22:52:29 -0700 |
commit | 23c5501b77efdb1f071709c23ed21f64f8b9cb00 (patch) | |
tree | 844ed87d475104fadb766da9f5403b00668fa33c | |
parent | dd7135d55d5bfbbcd278a2ca33b6b8cc04776080 (diff) |
nir/flrp: Lower flrp(#a, #b, c) differently
If the magnitudes of #a and #b are such that (b-a) won't lose too much
precision, lower as a+c(b-a).
No changes on any other Intel platforms.
v2: Rebase on 424372e5dd5 ("nir: Use the flrp lowering pass instead of
nir_opt_algebraic")
Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8192503 -> 8192383 (<.01%)
instructions in affected programs: 18417 -> 18297 (-0.65%)
helped: 68
HURT: 0
helped stats (abs) min: 1 max: 18 x̄: 1.76 x̃: 1
helped stats (rel) min: 0.19% max: 7.89% x̄: 1.10% x̃: 0.43%
95% mean confidence interval for instructions value: -2.48 -1.05
95% mean confidence interval for instructions %-change: -1.56% -0.63%
Instructions are helped.
total cycles in shared programs: 188662536 -> 188661956 (<.01%)
cycles in affected programs: 744476 -> 743896 (-0.08%)
helped: 62
HURT: 0
helped stats (abs) min: 4 max: 60 x̄: 9.35 x̃: 6
helped stats (rel) min: 0.02% max: 4.84% x̄: 0.27% x̃: 0.06%
95% mean confidence interval for cycles value: -12.37 -6.34
95% mean confidence interval for cycles %-change: -0.48% -0.06%
Cycles are helped.
Reviewed-by: Matt Turner <[email protected]>
-rw-r--r-- | src/compiler/nir/nir_lower_flrp.c | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/src/compiler/nir/nir_lower_flrp.c b/src/compiler/nir/nir_lower_flrp.c index 2d57998b41d..952068ec9cc 100644 --- a/src/compiler/nir/nir_lower_flrp.c +++ b/src/compiler/nir/nir_lower_flrp.c @@ -20,6 +20,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ +#include <math.h> #include "nir.h" #include "nir_builder.h" #include "util/u_vector.h" @@ -136,6 +137,58 @@ replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp, append_flrp_to_dead_list(dead_flrp, alu); } +static bool +sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr) +{ + nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src); + nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src); + + if (val0 == NULL || val1 == NULL) + return false; + + const uint8_t *const swizzle0 = instr->src[0].swizzle; + const uint8_t *const swizzle1 = instr->src[1].swizzle; + const unsigned num_components = nir_dest_num_components(instr->dest.dest); + + if (instr->dest.dest.ssa.bit_size == 32) { + for (unsigned i = 0; i < num_components; i++) { + int exp0; + int exp1; + + frexpf(val0[swizzle0[i]].f32, &exp0); + frexpf(val1[swizzle1[i]].f32, &exp1); + + /* If the difference between exponents is >= 24, then A+B will always + * have the value of whichever of A and B has the larger absolute + * value. So, [0, 23] is the valid range. The smaller the limit + * value, the more precision will be maintained at a potential + * performance cost. Somewhat arbitrarily split the range in half. + */ + if (abs(exp0 - exp1) > (23 / 2)) + return false; + } + } else { + for (unsigned i = 0; i < num_components; i++) { + int exp0; + int exp1; + + frexp(val0[swizzle0[i]].f64, &exp0); + frexp(val1[swizzle1[i]].f64, &exp1); + + /* If the difference between exponents is >= 53, then A+B will always + * have the value of whichever of A and B has the larger absolute + * value. So, [0, 52] is the valid range. 
The smaller the limit + * value, the more precision will be maintained at a potential + * performance cost. Somewhat arbitrarily split the range in half. + */ + if (abs(exp0 - exp1) > (52 / 2)) + return false; + } + } + + return true; +} + static void convert_flrp_instruction(nir_builder *bld, struct u_vector *dead_flrp, @@ -197,6 +250,21 @@ convert_flrp_instruction(nir_builder *bld, return; } + /* + * - If x and y are both immediates and the relative magnitude of the + * values is similar (such that y-x does not lose too much precision): + * + * x + t(y - x) + * + * We rely on constant folding to eliminate y-x, and we rely on + * nir_opt_algebraic to possibly generate an FMA. The cost is either one + * FMA or two instructions. + */ + if (sources_are_constants_with_similar_magnitudes(alu)) { + replace_with_fast(bld, dead_flrp, alu); + return; + } + if (have_ffma) { if (always_precise) { replace_with_strict_ffma(bld, dead_flrp, alu); |