1 files changed, 67 insertions, 15 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 8437a4e02f4..5093dd59782 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -68,6 +68,7 @@ public:
       this->child_count = 0;
       this->parent_count = 0;
       this->unblocked_time = 0;
+      this->cand_generation = 0;
 
       /* We can't measure Gen6 timings directly but expect them to be much
        * closer to Gen7 than Gen4.
@@ -91,6 +92,12 @@ public:
    int latency;
 
    /**
+    * Which iteration of pushing groups of children onto the candidates list
+    * this node was a part of.
+    */
+   unsigned cand_generation;
+
+   /**
     * This is the sum of the instruction's latency plus the maximum delay of
     * its children, or just the issue_time if it's a leaf node.
     */
@@ -973,26 +980,68 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
        * variables so that we can avoid register spilling, or get 16-wide
        * shaders which naturally do a better job of hiding instruction
        * latency.
-       *
-       * To do so, schedule our instructions in a roughly LIFO/depth-first
-       * order: when new instructions become available as a result of
-       * scheduling something, choose those first so that our result
-       * hopefully is consumed quickly.
-       *
-       * The exception is messages that generate more than one result
-       * register (AKA texturing).  In those cases, the LIFO search would
-       * normally tend to choose them quickly (because scheduling the
-       * previous message not only unblocked the children using its result,
-       * but also the MRF setup for the next sampler message, which in turn
-       * unblocks the next sampler message).
        */
       foreach_list(node, &instructions) {
          schedule_node *n = (schedule_node *)node;
          fs_inst *inst = (fs_inst *)n->inst;
 
-         chosen = n;
-         if (v->brw->gen >= 7 || inst->regs_written <= 1)
-            break;
+         if (!chosen) {
+            chosen = n;
+            continue;
+         }
+
+         /* Prefer instructions that recently became available for scheduling.
+          * These are the things that are most likely to (eventually) make a
+          * variable dead and reduce register pressure.  Typical register
+          * pressure estimates don't work for us because most of our pressure
+          * comes from texturing, where no single instruction to schedule will
+          * make a vec4 value dead.
+          */
+         if (n->cand_generation > chosen->cand_generation) {
+            chosen = n;
+            continue;
+         } else if (n->cand_generation < chosen->cand_generation) {
+            continue;
+         }
+
+         /* On MRF-using chips, prefer non-SEND instructions.  If we don't do
+          * this, then because we prefer instructions that just became
+          * candidates, we'll end up in a pattern of scheduling a SEND, then
+          * the MRFs for the next SEND, then the next SEND, then the MRFs,
+          * etc., without ever consuming the results of a send.
+          */
+         if (v->brw->gen < 7) {
+            fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+            /* We use regs_written > 1 as our test for the kind of send
+             * instruction to avoid -- only sends generate many regs, and a
+             * single-result send is probably actually reducing register
+             * pressure.
+             */
+            if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+               chosen = n;
+               continue;
+            } else if (inst->regs_written > chosen_inst->regs_written) {
+               continue;
+            }
+         }
+
+         /* For instructions pushed on the cands list at the same time, prefer
+          * the one with the highest delay to the end of the program.  This is
+          * most likely to have its values able to be consumed first (such as
+          * for a large tree of lowered ubo loads, which appear reversed in
+          * the instruction stream with respect to when they can be consumed).
+          */
+         if (n->delay > chosen->delay) {
+            chosen = n;
+            continue;
+         } else if (n->delay < chosen->delay) {
+            continue;
+         }
+
+         /* If all other metrics are equal, we prefer the first instruction in
+          * the list (program execution).
+          */
       }
    }
 
@@ -1048,6 +1097,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
 	 n->remove();
    }
 
+   unsigned cand_generation = 1;
    while (!instructions.is_empty()) {
       schedule_node *chosen = choose_instruction_to_schedule();
 
@@ -1090,6 +1140,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
             bv->dump_instruction(child->inst);
          }
 
+         child->cand_generation = cand_generation;
 	 child->parent_count--;
 	 if (child->parent_count == 0) {
             if (debug) {
@@ -1098,6 +1149,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
 	    instructions.push_head(child);
 	 }
       }
+      cand_generation++;
 
       /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
        * but it's more limited pre-gen6, so if we send something off to it then