summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jonas Pfeil <[email protected]> 2016-11-20 20:45:13 +0100
committer Eric Anholt <[email protected]> 2016-12-29 14:41:09 -0800
commit d82dbc4cde1415560e259b5aac36f36175e8939a (patch)
tree e55b49fedf24303dd2c8a217de2c30cbb1a38d92
parent 63e7671c7e65f9df1678d3d79c92f358ae0bdc82 (diff)
vc4: Fill thread switching delay slots
Scan for instructions without a signal set in front of the switching instruction and move the signal up there.

shader-db results:
total instructions in shared programs: 94494 -> 93027 (-1.55%)
instructions in affected programs: 23545 -> 22078 (-6.23%)

v2: Fix re-emitting of the instruction in the loop trying to emit NOPs, drop a scheduling change from branch delay slots. (by anholt)

Signed-off-by: Jonas Pfeil <[email protected]>
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu_schedule.c 45
1 files changed, 38 insertions, 7 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 4b2cb9dbd37..cf916198334 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -830,6 +830,7 @@ schedule_instructions(struct vc4_compile *c,
uint32_t *next_uniform)
{
uint32_t time = 0;
+ uint32_t last_thread_switch = 0;
if (debug) {
fprintf(stderr, "initial deps:\n");
@@ -944,14 +945,44 @@ schedule_instructions(struct vc4_compile *c,
qpu_serialize_one_inst(c, inst);
} else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
- /* The thread switch occurs after two delay slots. We
- * should fit things in these slots, but we don't
- * currently.
+ int last = c->qpu_inst_count - 1;
+
+ /* The thread switch occurs after two delay slots.
+ * Shift the signal upwards, if there is an
+ * instruction without a signal there. Watch out for
+ * the last thread switch as theoretically it could be
+ * only two instructions away.
*/
- inst = qpu_NOP();
- update_scoreboard_for_chosen(scoreboard, inst);
- qpu_serialize_one_inst(c, inst);
- qpu_serialize_one_inst(c, inst);
+
+ /* Remove sig from the instruction */
+ enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG);
+ c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last],
+ QPU_SIG_NONE,
+ QPU_SIG);
+ /* Compute how far we can shift */
+ int max_shift = MIN2(last - last_thread_switch, 2);
+ /* If both instructions in front have a signal set,
+ * reset the signal on the current instruction.*/
+ int shift;
+ for (shift = max_shift; shift >= 0; --shift) {
+ int ip = last - shift;
+ if (QPU_GET_FIELD(c->qpu_insts[ip],
+ QPU_SIG) == QPU_SIG_NONE) {
+ c->qpu_insts[ip] =
+ QPU_UPDATE_FIELD(
+ c->qpu_insts[ip],
+ sig, QPU_SIG);
+ break;
+ }
+ }
+ /* If necessary, add filling NOPs*/
+ for (int i = 0; i < 2 - shift; ++i) {
+ update_scoreboard_for_chosen(scoreboard,
+ qpu_NOP());
+ qpu_serialize_one_inst(c, qpu_NOP());
+ }
+ /* Avoid branching in a thread switch*/
+ last_thread_switch = c->qpu_inst_count - 1;
}
}