-rw-r--r--  src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 82
1 file changed, 67 insertions, 15 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 8437a4e02f4..5093dd59782 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -68,6 +68,7 @@ public:
       this->child_count = 0;
       this->parent_count = 0;
       this->unblocked_time = 0;
+      this->cand_generation = 0;
 
       /* We can't measure Gen6 timings directly but expect them to be much
        * closer to Gen7 than Gen4.
@@ -91,6 +92,12 @@ public:
    int latency;
 
    /**
+    * Which iteration of pushing groups of children onto the candidates list
+    * this node was a part of.
+    */
+   unsigned cand_generation;
+
+   /**
     * This is the sum of the instruction's latency plus the maximum delay of
     * its children, or just the issue_time if it's a leaf node.
     */
@@ -973,26 +980,68 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
        * variables so that we can avoid register spilling, or get 16-wide
        * shaders which naturally do a better job of hiding instruction
        * latency.
-       *
-       * To do so, schedule our instructions in a roughly LIFO/depth-first
-       * order: when new instructions become available as a result of
-       * scheduling something, choose those first so that our result
-       * hopefully is consumed quickly.
-       *
-       * The exception is messages that generate more than one result
-       * register (AKA texturing).  In those cases, the LIFO search would
-       * normally tend to choose them quickly (because scheduling the
-       * previous message not only unblocked the children using its result,
-       * but also the MRF setup for the next sampler message, which in turn
-       * unblocks the next sampler message).
        */
       foreach_list(node, &instructions) {
         schedule_node *n = (schedule_node *)node;
         fs_inst *inst = (fs_inst *)n->inst;
 
-         chosen = n;
-         if (v->brw->gen >= 7 || inst->regs_written <= 1)
-            break;
+         if (!chosen) {
+            chosen = n;
+            continue;
+         }
+
+         /* Prefer instructions that recently became available for scheduling.
+          * These are the things that are most likely to (eventually) make a
+          * variable dead and reduce register pressure.  Typical register
+          * pressure estimates don't work for us because most of our pressure
+          * comes from texturing, where no single instruction to schedule will
+          * make a vec4 value dead.
+          */
+         if (n->cand_generation > chosen->cand_generation) {
+            chosen = n;
+            continue;
+         } else if (n->cand_generation < chosen->cand_generation) {
+            continue;
+         }
+
+         /* On MRF-using chips, prefer non-SEND instructions.  If we don't do
+          * this, then because we prefer instructions that just became
+          * candidates, we'll end up in a pattern of scheduling a SEND, then
+          * the MRFs for the next SEND, then the next SEND, then the MRFs,
+          * etc., without ever consuming the results of a send.
+          */
+         if (v->brw->gen < 7) {
+            fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+            /* We use regs_written > 1 as our test for the kind of send
+             * instruction to avoid -- only sends generate many regs, and a
+             * single-result send is probably actually reducing register
+             * pressure.
+             */
+            if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+               chosen = n;
+               continue;
+            } else if (inst->regs_written > chosen_inst->regs_written) {
+               continue;
+            }
+         }
+
+         /* For instructions pushed on the cands list at the same time, prefer
+          * the one with the highest delay to the end of the program.  This is
+          * most likely to have its values able to be consumed first (such as
+          * for a large tree of lowered ubo loads, which appear reversed in
+          * the instruction stream with respect to when they can be consumed).
+          */
+         if (n->delay > chosen->delay) {
+            chosen = n;
+            continue;
+         } else if (n->delay < chosen->delay) {
+            continue;
+         }
+
+         /* If all other metrics are equal, we prefer the first instruction in
+          * the list (program execution).
+          */
       }
    }
 
@@ -1048,6 +1097,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
       n->remove();
    }
 
+   unsigned cand_generation = 1;
   while (!instructions.is_empty()) {
      schedule_node *chosen = choose_instruction_to_schedule();
 
@@ -1090,6 +1140,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
            bv->dump_instruction(child->inst);
         }
 
+         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
            if (debug) {
@@ -1098,6 +1149,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
            instructions.push_head(child);
         }
      }
+      cand_generation++;
 
      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
       * but it's more limited pre-gen6, so if we send something off to it then
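
For readers outside the Mesa tree, here is a minimal standalone sketch of the same priority cascade that the new choose_instruction_to_schedule() loop implements. The cand_node struct and pick_candidate() are made-up illustrations, not the driver's API: regs_written and delay stand in for the corresponding fields on fs_inst and schedule_node, and the mrf_chip flag plays the role of the v->brw->gen < 7 check.

/* Standalone sketch (assumed names, not Mesa code): the pre-register-
 * allocation candidate-selection cascade over a simplified node type.
 */
#include <vector>

struct cand_node {
   unsigned cand_generation; /* scheduling iteration that made this node a candidate */
   int regs_written;         /* >1 approximates a multi-result SEND (texturing) */
   int delay;                /* latency from this node to the end of the program */
};

/* Return the index of the candidate to schedule next, or -1 if cands is empty.
 * mrf_chip corresponds to the gen < 7 check in the real scheduler.
 */
static int
pick_candidate(const std::vector<cand_node> &cands, bool mrf_chip)
{
   int chosen = -1;

   for (int i = 0; i < (int)cands.size(); i++) {
      if (chosen == -1) {
         chosen = i;
         continue;
      }

      const cand_node &n = cands[i];
      const cand_node &c = cands[chosen];

      /* 1. Prefer the newest candidate generation: it is the most likely to
       *    make a value dead soon and reduce register pressure.
       */
      if (n.cand_generation > c.cand_generation) {
         chosen = i;
         continue;
      } else if (n.cand_generation < c.cand_generation) {
         continue;
      }

      /* 2. On MRF-using chips, avoid picking another multi-register SEND
       *    before the results of the previous one have been consumed.
       */
      if (mrf_chip) {
         if (n.regs_written <= 1 && c.regs_written > 1) {
            chosen = i;
            continue;
         } else if (n.regs_written > c.regs_written) {
            continue;
         }
      }

      /* 3. Among candidates of the same generation, the one with the largest
       *    delay to the end of the program goes first.
       */
      if (n.delay > c.delay) {
         chosen = i;
         continue;
      } else if (n.delay < c.delay) {
         continue;
      }

      /* 4. All metrics equal: keep the earlier instruction (program order). */
   }

   return chosen;
}

In the patch itself, cand_generation is stamped onto every child of the instruction that was just scheduled, a child joins the candidates list once its parent_count drops to zero, and the counter is bumped once per scheduled instruction (the cand_generation++ at the bottom of the main loop), so "newest generation" effectively means "unblocked by the most recently scheduled instruction".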