summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIan Romanick <[email protected]>2018-06-19 18:09:05 -0700
committerIan Romanick <[email protected]>2018-12-17 13:47:06 -0800
commit4cd1a0be76883c2b13aae8c97972e8f1404d06f7 (patch)
tree1089f35e137b29d400d723fef8880e6b3cc60dbe
parent9a83c3d3b3f147eb8ae17c9a40e518e77db41432 (diff)
i965/vec4: Propagate conditional modifiers from more compares to other compares
If there is a CMP.NZ that compares a single component (via a .zzzz
swizzle, for example) with 0, it can propagate its conditional modifier
back to a previous CMP that writes only that component.

The specific case that I saw was:

    cmp.l.f0(8)     g42<1>.xF       g61<4>.xF       (abs)g18<4>.zF
    ...
    cmp.nz.f0(8)    null<1>D        g42<4>.xD       0D

In this case we can just delete the second CMP.

No changes on Broadwell or Skylake because they do not use the vec4
backend.  Also no changes on GM45 or Iron Lake.

Sandy Bridge, Ivy Bridge, and Haswell had similar results. (Sandy Bridge shown)
total instructions in shared programs: 10856676 -> 10852569 (-0.04%)
instructions in affected programs: 228322 -> 224215 (-1.80%)
helped: 1331
HURT: 0
helped stats (abs) min: 1 max: 7 x̄: 3.09 x̃: 4
helped stats (rel) min: 0.11% max: 6.67% x̄: 1.88% x̃: 1.83%
95% mean confidence interval for instructions value: -3.19 -2.99
95% mean confidence interval for instructions %-change: -1.93% -1.83%
Instructions are helped.

total cycles in shared programs: 154788865 -> 154732047 (-0.04%)
cycles in affected programs: 2485892 -> 2429074 (-2.29%)
helped: 1097
HURT: 59
helped stats (abs) min: 2 max: 168 x̄: 51.96 x̃: 64
helped stats (rel) min: 0.12% max: 12.70% x̄: 3.44% x̃: 2.22%
HURT stats (abs)   min: 2 max: 16 x̄: 3.02 x̃: 2
HURT stats (rel)   min: 0.18% max: 0.83% x̄: 0.64% x̃: 0.71%
95% mean confidence interval for cycles value: -51.04 -47.26
95% mean confidence interval for cycles %-change: -3.40% -3.07%
Cycles are helped.

Signed-off-by: Ian Romanick <[email protected]>
Reviewed-by: Lionel Landwerlin <[email protected]>
-rw-r--r--src/intel/compiler/brw_vec4_cmod_propagation.cpp103
1 files changed, 100 insertions, 3 deletions
diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
index a1d46dc8dca..760327d559d 100644
--- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
@@ -47,7 +47,7 @@ writemasks_incompatible(const vec4_instruction *earlier,
}
static bool
-opt_cmod_propagation_local(bblock_t *block)
+opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
{
bool progress = false;
int ip = block->end_ip + 1;
@@ -146,12 +146,109 @@ opt_cmod_propagation_local(bblock_t *block)
scan_inst->dst, scan_inst->size_written)) {
if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
scan_inst->dst.offset != inst->src[0].offset ||
- writemasks_incompatible(scan_inst, inst) ||
scan_inst->exec_size != inst->exec_size ||
scan_inst->group != inst->group) {
break;
}
+ /* If scan_inst is a CMP that produces a single value and inst is
+ * a CMP.NZ that consumes only that value, remove inst.
+ */
+ if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+ (inst->src[0].type == BRW_REGISTER_TYPE_D ||
+ inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
+ (inst->opcode == BRW_OPCODE_CMP ||
+ inst->opcode == BRW_OPCODE_MOV) &&
+ scan_inst->opcode == BRW_OPCODE_CMP &&
+ ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
+ scan_inst->dst.writemask == WRITEMASK_X) ||
+ (inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
+ scan_inst->dst.writemask == WRITEMASK_Y) ||
+ (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
+ scan_inst->dst.writemask == WRITEMASK_Z) ||
+ (inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
+ scan_inst->dst.writemask == WRITEMASK_W))) {
+ if (inst->dst.writemask != scan_inst->dst.writemask) {
+ src_reg temp(v, glsl_type::vec4_type, 1);
+
+ /* Given a sequence like:
+ *
+ * cmp.ge.f0(8) g21<1>.xF g20<4>.xF g18<4>.xF
+ * ...
+ * cmp.nz.f0(8) null<1>D g21<4>.xD 0D
+ *
+ * Replace it with something like:
+ *
+ * cmp.ge.f0(8) g22<1>F g20<4>.xF g18<4>.xF
+ * mov(8) g21<1>.xF g22<1>.xxxxF
+ *
+ * The added MOV will most likely be removed later. In the
+ * worst case, it should be cheaper to schedule.
+ */
+ temp.swizzle = inst->src[0].swizzle;
+ temp.type = scan_inst->src[0].type;
+
+ vec4_instruction *mov = v->MOV(scan_inst->dst, temp);
+
+ /* Modify the source swizzles on scan_inst. If scan_inst
+ * was
+ *
+ * cmp.ge.f0(8) g21<1>.zF g20<4>.wzyxF g18<4>.yxwzF
+ *
+ * replace it with
+ *
+ * cmp.ge.f0(8) g21<1>.zF g20<4>.yyyyF g18<4>.wwwwF
+ */
+ unsigned src0_chan;
+ unsigned src1_chan;
+ switch (scan_inst->dst.writemask) {
+ case WRITEMASK_X:
+ src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
+ src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
+ break;
+ case WRITEMASK_Y:
+ src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
+ src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
+ break;
+ case WRITEMASK_Z:
+ src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
+ src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
+ break;
+ case WRITEMASK_W:
+ src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
+ src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
+ break;
+ default:
+ unreachable("Impossible writemask");
+ }
+
+ scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
+ src0_chan,
+ src0_chan,
+ src0_chan);
+
+ /* There's no swizzle on immediate value sources. */
+ if (scan_inst->src[1].file != IMM) {
+ scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
+ src1_chan,
+ src1_chan,
+ src1_chan);
+ }
+
+ scan_inst->dst = dst_reg(temp);
+ scan_inst->dst.writemask = inst->dst.writemask;
+
+ scan_inst->insert_after(block, mov);
+ }
+
+ inst->remove(block);
+ progress = true;
+ break;
+ }
+
+ if (writemasks_incompatible(scan_inst, inst))
+ break;
+
/* CMP's result is the same regardless of dest type. */
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
scan_inst->opcode == BRW_OPCODE_CMP &&
@@ -256,7 +353,7 @@ vec4_visitor::opt_cmod_propagation()
bool progress = false;
foreach_block_reverse(block, cfg) {
- progress = opt_cmod_propagation_local(block) || progress;
+ progress = opt_cmod_propagation_local(block, this) || progress;
}
if (progress)