diff options
author | Ian Romanick <[email protected]> | 2019-02-27 19:52:12 -0800 |
---|---|---|
committer | Ian Romanick <[email protected]> | 2019-03-08 22:24:19 -0800 |
commit | 37ee462e036b9b3bd90bc2b50fc4b05ac9a63560 (patch) | |
tree | 19a6237e6ba48888d18e9870b5d96c2c7eb90dca /src/compiler/nir | |
parent | 8fdee457a4cb20f4587b5ec817aa1f9325bd5f1c (diff) |
nir/algebraic: Fix up extract_[iu]8 after loop unrolling
Skylake, Broadwell, and Haswell had similar results. (Skylake shown)
total instructions in shared programs: 15256840 -> 15256837 (<.01%)
instructions in affected programs: 4713 -> 4710 (-0.06%)
helped: 3
HURT: 0
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.06% max: 0.08% x̄: 0.06% x̃: 0.06%
total cycles in shared programs: 372286583 -> 372286583 (0.00%)
cycles in affected programs: 198516 -> 198516 (0.00%)
helped: 1
HURT: 1
helped stats (abs) min: 10 max: 10 x̄: 10.00 x̃: 10
helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01%
HURT stats (abs) min: 10 max: 10 x̄: 10.00 x̃: 10
HURT stats (rel) min: 0.01% max: 0.01% x̄: 0.01% x̃: 0.01%
No changes on any other Intel platform.
v2: Use a loop to generate patterns. Suggested by Jason.
Reviewed-by: Matt Turner <[email protected]> [v1]
Reviewed-by: Dylan Baker <[email protected]>
Acked-by: Jason Ekstrand <[email protected]>
Diffstat (limited to 'src/compiler/nir')
-rw-r--r-- | src/compiler/nir/nir_opt_algebraic.py | 22 |
1 file changed, 20 insertions, 2 deletions
```diff
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 5b2e7ee2405..ac6e5b99220 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -614,8 +614,26 @@ optimizations = [
    (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
    (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
    (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
-   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
+]
+
+# The ('extract_u8', a, 0) pattern, above, can trigger in cases where the
+# shift count is based on a loop induction variable. Once the loop is
+# unrolled, constant folding will generate patterns like those below.
+for op in ('ushr', 'ishr'):
+    optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))])
+    optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
+    optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
+
+optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
 
+# The ('extract_[iu]8', a, 3) patterns, above, can trigger in cases where the
+# shift count is based on a loop induction variable. Once the loop is
+# unrolled, constant folding will generate patterns like those below.
+for op in ('extract_u8', 'extract_i8'):
+    optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), ('extract_u8', a, i)) for i in range(2, -1, -1)])
+
+optimizations.extend([
    # Word extraction
    (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
    (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
@@ -798,7 +816,7 @@ optimizations = [
     'options->lower_unpack_snorm_4x8'),
 
    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
-]
+])
 
 # bit_size dependent lowerings
 for bit_size in [8, 16, 32, 64]:
```