i965/vs: Add a pass to set dependency control fields on instructions.

This is a more aggressive version of the old brw_optimize() path.

Reduces cycles spent in the vertex shader on Minecraft by 18.6% +/- 10.0% (n=15).

Reviewed-by: Kenneth Graunke <[email protected]>
author: Eric Anholt <[email protected]> 2012-11-30 18:29:34 -0800
committer: Eric Anholt <[email protected]> 2013-04-01 11:36:05 -0700
commit: 4fee05b020af72ee802d4349de76fbc36cdd53a9 (patch)
tree: ecfe2d3d96455b50e7f6ca2612392f2f78bead28 /src
parent: 229a51cdbe3128626fd359fe03722a55e40927d7 (diff)
3 files changed, 126 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 184eff9345b..c58fb444b94 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -22,6 +22,7 @@
  */
 
 #include "brw_vec4.h"
+#include "brw_cfg.h"
 #include "glsl/ir_print_visitor.h"
 
 extern "C" {
@@ -620,6 +621,112 @@ vec4_visitor::move_push_constants_to_pull_constants()
    pack_uniform_registers();
 }
 
+/**
+ * Sets the dependency control fields (no_dd_clear / no_dd_check) on
+ * instructions after register allocation and before the generator is run.
+ *
+ * When you have a sequence of instructions like:
+ *
+ * DP4 temp.x vertex uniform[0]
+ * DP4 temp.y vertex uniform[0]
+ * DP4 temp.z vertex uniform[0]
+ * DP4 temp.w vertex uniform[0]
+ *
+ * the hardware doesn't know that it can run the later instructions while the
+ * earlier ones are in flight, producing stalls.  Within each basic block, we
+ * flag successive writes to disjoint channels of the same register so it can.
+ */
+void
+vec4_visitor::opt_set_dependency_control()
+{
+   vec4_instruction *last_grf_write[BRW_MAX_GRF];
+   uint8_t grf_channels_written[BRW_MAX_GRF];
+   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
+   uint8_t mrf_channels_written[BRW_MAX_GRF];
+
+   cfg_t cfg(this);
+
+   assert(prog_data->total_grf || !"Must be called after register allocation");
+
+   for (int i = 0; i < cfg.num_blocks; i++) {
+      bblock_t *bblock = cfg.blocks[i];
+
+      memset(last_grf_write, 0, sizeof(last_grf_write));
+      memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+      for (vec4_instruction *inst = (vec4_instruction *)bblock->start;
+           inst != (vec4_instruction *)bblock->end->next;
+           inst = (vec4_instruction *)inst->next) {
+         /* Don't do dependency control across a read of a tracked register.
+          * A HW_REG source could be any GRF, so it drops all GRF tracking.
+          */
+         for (int j = 0; j < 3; j++) {
+            int reg = inst->src[j].reg + inst->src[j].reg_offset;
+            if (inst->src[j].file == GRF) {
+               last_grf_write[reg] = NULL;
+            } else if (inst->src[j].file == HW_REG) {
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+               break;
+            }
+            assert(inst->src[j].file != MRF);
+         }
+
+         /* In the presence of send messages, totally interrupt dependency
+          * control.  They're long enough that the chance of dependency
+          * control around them just doesn't matter.
+          */
+         if (inst->mlen) {
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* It looks like setting dependency control on a predicated
+          * instruction hangs the GPU.
+          */
+         if (inst->predicate) {
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* Now, see if we can do dependency control for this instruction
+          * against a previous one writing to its destination.
+          */
+         int reg = inst->dst.reg + inst->dst.reg_offset;
+         if (inst->dst.file == GRF) {
+            if (last_grf_write[reg] &&
+                !(inst->dst.writemask & grf_channels_written[reg])) {
+               last_grf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               grf_channels_written[reg] = 0;
+            }
+
+            last_grf_write[reg] = inst;
+            grf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == MRF) {
+            if (last_mrf_write[reg] &&
+                !(inst->dst.writemask & mrf_channels_written[reg])) {
+               last_mrf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               mrf_channels_written[reg] = 0;
+            }
+
+            last_mrf_write[reg] = inst;
+            mrf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == HW_REG) {
+            /* Fixed HW regs may alias any tracked register in that file. */
+            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
+               memset(last_mrf_write, 0, sizeof(last_mrf_write));
+         }
+      }
+   }
+}
+
 bool
 vec4_instruction::can_reswizzle_dst(int dst_writemask,
                                     int swizzle,
@@ -1355,6 +1462,8 @@ vec4_visitor::run()
          break;
    }
 
+   opt_set_dependency_control();
+
    /* If any state parameters were appended, then ParameterValues could have
     * been realloced, in which case the driver uniform storage set up by
     * _mesa_associate_uniform_storage() would point to freed memory.  Make
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 1f832d19cd8..8f130e15428 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -172,6 +172,7 @@ public:
 
    bool saturate;
    bool force_writemask_all;
+   bool no_dd_clear, no_dd_check;
 
    int conditional_mod; /**< BRW_CONDITIONAL_* */
 
@@ -337,6 +338,7 @@ public:
    bool opt_copy_propagation();
    bool opt_algebraic();
    bool opt_register_coalesce();
+   void opt_set_dependency_control();
 
    bool can_do_source_mods(vec4_instruction *inst);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index cb49a042390..e378f7fd5f0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -742,6 +742,8 @@ vec4_generator::generate_code(exec_list *instructions)
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
 
+      unsigned pre_emit_nr_insn = p->nr_insn;
+
       switch (inst->opcode) {
       case BRW_OPCODE_MOV:
 	 brw_MOV(p, dst, src[0]);
@@ -868,6 +870,19 @@ vec4_generator::generate_code(exec_list *instructions)
 	 break;
       }
 
+      if (inst->no_dd_clear || inst->no_dd_check) {
+         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
+                !"no_dd_check or no_dd_clear set for IR emitting more "
+                "than 1 instruction");
+
+         struct brw_instruction *last = &p->store[pre_emit_nr_insn];
+
+         if (inst->no_dd_clear)
+            last->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
+         if (inst->no_dd_check)
+            last->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
+      }
+
       if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
 	 brw_dump_compile(p, stdout,
 			  last_native_insn_offset, p->next_insn_offset);
author	Eric Anholt <[email protected]>	2012-11-30 18:29:34 -0800
committer	Eric Anholt <[email protected]>	2013-04-01 11:36:05 -0700
commit	4fee05b020af72ee802d4349de76fbc36cdd53a9 (patch)
tree	ecfe2d3d96455b50e7f6ca2612392f2f78bead28 /src
parent	229a51cdbe3128626fd359fe03722a55e40927d7 (diff)