summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp82
1 files changed, 67 insertions, 15 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 8437a4e02f4..5093dd59782 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -68,6 +68,7 @@ public:
this->child_count = 0;
this->parent_count = 0;
this->unblocked_time = 0;
+ this->cand_generation = 0;
/* We can't measure Gen6 timings directly but expect them to be much
* closer to Gen7 than Gen4.
@@ -91,6 +92,12 @@ public:
int latency;
/**
+ * Which iteration of pushing groups of children onto the candidates list
+ * this node was a part of.
+ */
+ unsigned cand_generation;
+
+ /**
* This is the sum of the instruction's latency plus the maximum delay of
* its children, or just the issue_time if it's a leaf node.
*/
@@ -973,26 +980,68 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
* variables so that we can avoid register spilling, or get 16-wide
* shaders which naturally do a better job of hiding instruction
* latency.
- *
- * To do so, schedule our instructions in a roughly LIFO/depth-first
- * order: when new instructions become available as a result of
- * scheduling something, choose those first so that our result
- * hopefully is consumed quickly.
- *
- * The exception is messages that generate more than one result
- * register (AKA texturing). In those cases, the LIFO search would
- * normally tend to choose them quickly (because scheduling the
- * previous message not only unblocked the children using its result,
- * but also the MRF setup for the next sampler message, which in turn
- * unblocks the next sampler message).
*/
foreach_list(node, &instructions) {
schedule_node *n = (schedule_node *)node;
fs_inst *inst = (fs_inst *)n->inst;
- chosen = n;
- if (v->brw->gen >= 7 || inst->regs_written <= 1)
- break;
+ if (!chosen) {
+ chosen = n;
+ continue;
+ }
+
+ /* Prefer instructions that recently became available for scheduling.
+ * These are the things that are most likely to (eventually) make a
+ * variable dead and reduce register pressure. Typical register
+ * pressure estimates don't work for us because most of our pressure
+ * comes from texturing, where no single instruction to schedule will
+ * make a vec4 value dead.
+ */
+ if (n->cand_generation > chosen->cand_generation) {
+ chosen = n;
+ continue;
+ } else if (n->cand_generation < chosen->cand_generation) {
+ continue;
+ }
+
+ /* On MRF-using chips, prefer non-SEND instructions. If we don't do
+ * this, then because we prefer instructions that just became
+ * candidates, we'll end up in a pattern of scheduling a SEND, then
+ * the MRFs for the next SEND, then the next SEND, then the MRFs,
+ * etc., without ever consuming the results of a send.
+ */
+ if (v->brw->gen < 7) {
+ fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+ /* We use regs_written > 1 as our test for the kind of send
+ * instruction to avoid -- only sends generate many regs, and a
+ * single-result send is probably actually reducing register
+ * pressure.
+ */
+ if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+ chosen = n;
+ continue;
+ } else if (inst->regs_written > chosen_inst->regs_written) {
+ continue;
+ }
+ }
+
+ /* For instructions pushed on the cands list at the same time, prefer
+ * the one with the highest delay to the end of the program. This is
+ * most likely to have its values able to be consumed first (such as
+ * for a large tree of lowered ubo loads, which appear reversed in
+ * the instruction stream with respect to when they can be consumed).
+ */
+ if (n->delay > chosen->delay) {
+ chosen = n;
+ continue;
+ } else if (n->delay < chosen->delay) {
+ continue;
+ }
+
+ /* If all other metrics are equal, we prefer the first instruction in
+ * the list (program execution).
+ */
}
}
@@ -1048,6 +1097,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
n->remove();
}
+ unsigned cand_generation = 1;
while (!instructions.is_empty()) {
schedule_node *chosen = choose_instruction_to_schedule();
@@ -1090,6 +1140,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
bv->dump_instruction(child->inst);
}
+ child->cand_generation = cand_generation;
child->parent_count--;
if (child->parent_count == 0) {
if (debug) {
@@ -1098,6 +1149,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
instructions.push_head(child);
}
}
+ cand_generation++;
/* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
* but it's more limited pre-gen6, so if we send something off to it then