diff options
author | Jonas Pfeil <[email protected]> | 2016-11-20 20:45:13 +0100 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2016-12-29 14:41:09 -0800 |
commit | d82dbc4cde1415560e259b5aac36f36175e8939a (patch) | |
tree | e55b49fedf24303dd2c8a217de2c30cbb1a38d92 | |
parent | 63e7671c7e65f9df1678d3d79c92f358ae0bdc82 (diff) |
vc4: Fill thread switching delay slots
Scan for instructions without a signal set in front of the switching
instruction and move the signal up there.
shader-db results:
total instructions in shared programs: 94494 -> 93027 (-1.55%)
instructions in affected programs: 23545 -> 22078 (-6.23%)
v2: Fix re-emitting of the instruction in the loop trying to emit NOPs,
drop a scheduling change from branch delay slots. (by anholt)
Signed-off-by: Jonas Pfeil <[email protected]>
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_schedule.c | 45 |
1 files changed, 38 insertions, 7 deletions
```diff
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 4b2cb9dbd37..cf916198334 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -830,6 +830,7 @@ schedule_instructions(struct vc4_compile *c,
                       uint32_t *next_uniform)
 {
         uint32_t time = 0;
+        uint32_t last_thread_switch = 0;
 
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
@@ -944,14 +945,44 @@ schedule_instructions(struct vc4_compile *c,
                         qpu_serialize_one_inst(c, inst);
                 } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                            QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
-                        /* The thread switch occurs after two delay slots.  We
-                         * should fit things in these slots, but we don't
-                         * currently.
+                        int last = c->qpu_inst_count - 1;
+
+                        /* The thread switch occurs after two delay slots.
+                         * Shift the signal upwards, if there is an
+                         * instruction without a signal there. Watch out for
+                         * the last thread switch as theoretically it could be
+                         * only two instructions away.
                          */
-                        inst = qpu_NOP();
-                        update_scoreboard_for_chosen(scoreboard, inst);
-                        qpu_serialize_one_inst(c, inst);
-                        qpu_serialize_one_inst(c, inst);
+
+                        /* Remove sig from the instruction */
+                        enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG);
+                        c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last],
+                                                              QPU_SIG_NONE,
+                                                              QPU_SIG);
+                        /* Compute how far we can shift */
+                        int max_shift = MIN2(last - last_thread_switch, 2);
+                        /* If both instructions in front have a signal set,
+                         * reset the signal on the current instruction.*/
+                        int shift;
+                        for (shift = max_shift; shift >= 0; --shift) {
+                                int ip = last - shift;
+                                if (QPU_GET_FIELD(c->qpu_insts[ip],
+                                                  QPU_SIG) == QPU_SIG_NONE) {
+                                        c->qpu_insts[ip] =
+                                                QPU_UPDATE_FIELD(
+                                                        c->qpu_insts[ip],
+                                                        sig, QPU_SIG);
+                                        break;
+                                }
+                        }
+                        /* If necessarry, add filling NOPs*/
+                        for (int i = 0; i < 2 - shift; ++i) {
+                                update_scoreboard_for_chosen(scoreboard,
+                                                             qpu_NOP());
+                                qpu_serialize_one_inst(c, qpu_NOP());
+                        }
+                        /* Avoid branching in a thread switch*/
+                        last_thread_switch = c->qpu_inst_count - 1;
                 }
         }
 
```