diff options
author | Eric Anholt <[email protected]> | 2014-12-09 14:05:52 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2014-12-09 18:32:36 -0800 |
commit | c5b544403fbc955dd441fb5a2e11f0de2a75e9e4 (patch) | |
tree | 112b865dd1f4b5fadb92f4718688d8fe42adb2bf /src/gallium/drivers/vc4 | |
parent | 45a89237711acff7ee31c854361f8f580ccdcc9f (diff) |
vc4: Populate the delay field better, and schedule high delay first.
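This is a standard scheduling heuristic (prefer the ready instruction with the longest remaining dependency chain to the end of the program), and it clearly helps: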
total instructions in shared programs: 46418 -> 44467 (-4.20%)
instructions in affected programs: 42531 -> 40580 (-4.59%)
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_schedule.c | 50 |
1 files changed, 49 insertions, 1 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 8df816fa211..c733e6e7f30 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -49,7 +49,19 @@ struct schedule_node {
         uint32_t child_count;
         uint32_t child_array_size;
         uint32_t parent_count;
+
+        /**
+         * Minimum number of cycles from scheduling this instruction until the
+         * end of the program, based on the slowest dependency chain through
+         * the children.
+         */
         uint32_t delay;
+
+        /**
+         * cycles between this instruction being scheduled and when its result
+         * can be consumed.
+         */
+        uint32_t latency;
 };
 
 struct schedule_node_child {
@@ -548,6 +560,13 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                 } else if (prio < chosen_prio) {
                         continue;
                 }
+
+                if (n->delay > chosen->delay) {
+                        chosen = n;
+                        chosen_prio = prio;
+                } else if (n->delay < chosen->delay) {
+                        continue;
+                }
         }
 
         return chosen;
@@ -612,7 +631,7 @@ compute_delay(struct schedule_node *n)
                         if (!n->children[i].node->delay)
                                 compute_delay(n->children[i].node);
                         n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + 1);
+                                        n->children[i].node->delay + n->latency);
                 }
         }
 }
@@ -734,6 +753,33 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
         }
 }
 
+static uint32_t waddr_latency(uint32_t waddr)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Some huge number, really. */
+        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
+                return 10;
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+static uint32_t
+instruction_latency(uint64_t inst)
+{
+        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
+                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
+}
+
 void
 qpu_schedule_instructions(struct vc4_compile *c)
 {
@@ -761,6 +807,8 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
 
                 n->inst = inst;
+                n->latency = instruction_latency(inst->inst);
+
                 remove_from_list(&inst->link);
                 insert_at_tail(&schedule_list, &n->link);
         }