diff options
author | Ian Romanick <[email protected]> | 2018-02-21 18:06:56 -0800 |
---|---|---|
committer | Ian Romanick <[email protected]> | 2018-03-08 15:26:26 -0800 |
commit | 52c7df1643ec9af119fd66f916f7fbdbcc798d2d (patch) | |
tree | 3784e5a34a93ed0b68b6ce790fedfba85c899c70 /src/intel/compiler | |
parent | 70de61594dcf99f24eb31ebf98d62f13e1f44c2e (diff) |
i965/fs: Merge CMP and SEL into CSEL on Gen8+
v2: Fix several problems handling inverted predicates. Add a much
bigger comment around the BRW_CONDITIONAL_NZ case.
v3: Allow uniforms and shader inputs as sources for the original SEL and
CMP instructions. This enables a LOT more shaders to receive CSEL
merging (5816 vs 8564 on SKL).
v4: Report progress.
Broadwell and Skylake had similar results. (Broadwell shown)
helped: 8527
HURT: 0
helped stats (abs) min: 1 max: 27 x̄: 2.44 x̃: 1
helped stats (rel) min: 0.03% max: 17.80% x̄: 1.12% x̃: 0.70%
95% mean confidence interval for instructions value: -2.51 -2.36
95% mean confidence interval for instructions %-change: -1.15% -1.10%
Instructions are helped.
total cycles in shared programs: 559442317 -> 558288357 (-0.21%)
cycles in affected programs: 372699860 -> 371545900 (-0.31%)
helped: 6748
HURT: 1450
helped stats (abs) min: 1 max: 32000 x̄: 182.41 x̃: 12
helped stats (rel) min: <.01% max: 66.08% x̄: 3.42% x̃: 0.70%
HURT stats (abs) min: 1 max: 2538 x̄: 53.08 x̃: 14
HURT stats (rel) min: <.01% max: 96.72% x̄: 3.32% x̃: 0.90%
95% mean confidence interval for cycles value: -179.01 -102.51
95% mean confidence interval for cycles %-change: -2.37% -2.08%
Cycles are helped.
LOST: 0
GAINED: 6
No changes on earlier platforms.
Signed-off-by: Ian Romanick <[email protected]>
Reviewed-by: Samuel Iglesias Gonsálvez <[email protected]> [v1]
Reviewed-by: Kenneth Graunke <[email protected]> [v3]
Reviewed-by: Matt Turner <[email protected]>
Diffstat (limited to 'src/intel/compiler')
-rw-r--r-- | src/intel/compiler/brw_fs.cpp | 106 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs.h | 1 |
2 files changed, 107 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 02a8ea0fd9d..422eedcf0af 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2844,6 +2844,106 @@ mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) } bool +fs_visitor::opt_peephole_csel() +{ + /* Peephole pass: replace a normally-predicated SEL (F, D or UD destination) with a single CSEL built from the SEL's sources and the float source of the nearest preceding flag-writing CMP-against-zero or MOV. Does nothing before Gen8. Returns true if at least one SEL was replaced; the flag-writing instruction itself is not removed here. */ if (devinfo->gen < 8) + return false; + + bool progress = false; + + foreach_block_reverse(block, cfg) { + /* NOTE(review): ip is only initialized and decremented in this function; no visible line reads it. */ int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + ip--; + + if (inst->opcode != BRW_OPCODE_SEL || + inst->predicate != BRW_PREDICATE_NORMAL || + (inst->dst.type != BRW_REGISTER_TYPE_F && + inst->dst.type != BRW_REGISTER_TYPE_D && + inst->dst.type != BRW_REGISTER_TYPE_UD)) + continue; + + /* Because it is a 3-src instruction, CSEL cannot have an immediate + * value as a source, but we can sometimes handle zero. + */ + if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR && + inst->src[0].file != UNIFORM) || + (inst->src[1].file != VGRF && inst->src[1].file != ATTR && + inst->src[1].file != UNIFORM && !inst->src[1].is_zero())) + continue; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + /* Skip instructions that do not write flags. The first flag writer found is either merged or ends the search: every path below it breaks out of this loop. */ if (!scan_inst->flags_written()) + continue; + + if ((scan_inst->opcode != BRW_OPCODE_CMP && + scan_inst->opcode != BRW_OPCODE_MOV) || + scan_inst->predicate != BRW_PREDICATE_NONE || + (scan_inst->src[0].file != VGRF && + scan_inst->src[0].file != ATTR && + scan_inst->src[0].file != UNIFORM) || + scan_inst->src[0].type != BRW_REGISTER_TYPE_F) + break; + + if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero()) + break; + + const brw::fs_builder ibld(this, block, inst); + + const enum brw_conditional_mod cond = + inst->predicate_inverse + ? 
brw_negate_cmod(scan_inst->conditional_mod) + : scan_inst->conditional_mod; + + fs_inst *csel_inst = NULL; + + if (inst->src[1].file != IMM) { + csel_inst = ibld.CSEL(inst->dst, + inst->src[0], + inst->src[1], + scan_inst->src[0], + cond); + } else if (cond == BRW_CONDITIONAL_NZ) { + /* Consider the sequence + * + * cmp.nz.f0 null<1>F g3<8,8,1>F 0F + * (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD + * + * The sel will pick the immediate value 0 if g3 is ±0.0. + * Therefore, this sequence is equivalent: + * + * cmp.nz.f0 null<1>F g3<8,8,1>F 0F + * (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F + * + * The abs ensures that the result is 0UD when g3 is -0.0F. + * By normal cmp-sel merging, this is also equivalent: + * + * csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F + */ + csel_inst = ibld.CSEL(inst->dst, + inst->src[0], + scan_inst->src[0], + scan_inst->src[0], + cond); + + csel_inst->src[1].abs = true; + } + + /* csel_inst stays NULL when src[1] is an immediate and cond is not NZ; the SEL is then left untouched. NOTE(review): no visible line copies inst->saturate onto csel_inst; confirm a saturating SEL cannot reach this point or that the modifier is carried over elsewhere. */ if (csel_inst != NULL) { + progress = true; + inst->remove(block); + } + + break; + } + } + } + + return progress; +} + +bool fs_visitor::compute_to_mrf() { bool progress = false; @@ -6078,6 +6178,12 @@ fs_visitor::optimize() OPT(compact_virtual_grfs); } while (progress); + /* Do this after cmod propagation has had every possible opportunity to + * propagate results into SEL instructions. + */ + if (OPT(opt_peephole_csel)) + OPT(dead_code_eliminate); + progress = false; pass_num = 0; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 1b7df844696..e384db809dc 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -191,6 +191,7 @@ public: fs_reg resolve_source_modifiers(const fs_reg &src); void emit_discard_jump(); bool opt_peephole_sel(); + bool opt_peephole_csel(); bool opt_peephole_predicated_break(); bool opt_saturate_propagation(); bool opt_cmod_propagation(); |