diff options
author | Eric Anholt <[email protected]> | 2016-04-27 12:14:07 -0700 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2016-07-13 23:54:15 -0700 |
commit | 44df061aaad96fc5db630ae69fb2fe2a03bb5659 (patch) | |
tree | fca4235e0746387a22f7ab33dcf385015a904cab /src/gallium/drivers/vc4/vc4_qpu_schedule.c | |
parent | a59da513d3229c883809ac2088c9612abcec1470 (diff) |
vc4: Add support for scheduling of branch instructions.
For now we don't fill the delay slots, and instead just drop in NOPs.
Diffstat (limited to 'src/gallium/drivers/vc4/vc4_qpu_schedule.c')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_schedule.c | 120 |
1 file changed, 103 insertions, 17 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index fad10e509e2..a55b0351402 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) if (sig != QPU_SIG_LOAD_IMM) { process_raddr_deps(state, n, raddr_a, true); - if (sig != QPU_SIG_SMALL_IMM) + if (sig != QPU_SIG_SMALL_IMM && + sig != QPU_SIG_BRANCH) process_raddr_deps(state, n, raddr_b, false); } @@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_read_dep(state, state->last_tlb, n); break; + case QPU_SIG_BRANCH: + add_read_dep(state, state->last_sf, n); + break; + case QPU_SIG_PROG_END: case QPU_SIG_WAIT_FOR_SCOREBOARD: case QPU_SIG_SCOREBOARD_UNLOCK: case QPU_SIG_COVERAGE_LOAD: case QPU_SIG_COLOR_LOAD_END: case QPU_SIG_ALPHA_MASK_LOAD: - case QPU_SIG_BRANCH: fprintf(stderr, "Unhandled signal bits %d\n", sig); abort(); } process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD)); process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL)); - if (inst & QPU_SF) + if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH) add_write_dep(state, &state->last_sf, n); } @@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + /* Don't choose the branch instruction until it's the last one + * left. XXX: We could potentially choose it before it's the + * last one, if the remaining instructions fit in the delay + * slots. + */ + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + !list_is_singular(schedule_list)) { + continue; + } + /* "An instruction must not read from a location in physical * regfile A or B that was written to by the previous * instruction." 
@@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list, } static uint32_t -schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, +schedule_instructions(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + struct list_head *schedule_list, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) { - struct choose_scoreboard scoreboard; uint32_t time = 0; - memset(&scoreboard, 0, sizeof(scoreboard)); - scoreboard.last_waddr_a = ~0; - scoreboard.last_waddr_b = ~0; - scoreboard.last_sfu_write_tick = -10; - if (debug) { fprintf(stderr, "initial deps:\n"); dump_state(schedule_list); @@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, while (!list_empty(schedule_list)) { struct schedule_node *chosen = - choose_instruction_to_schedule(&scoreboard, + choose_instruction_to_schedule(scoreboard, schedule_list, NULL); struct schedule_node *merge = NULL; @@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, (*next_uniform)++; } - merge = choose_instruction_to_schedule(&scoreboard, + merge = choose_instruction_to_schedule(scoreboard, schedule_list, chosen); if (merge) { @@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, qpu_serialize_one_inst(c, inst); - update_scoreboard_for_chosen(&scoreboard, inst); + update_scoreboard_for_chosen(scoreboard, inst); /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to @@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); - scoreboard.tick++; + scoreboard->tick++; time++; + + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) { + 
block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + inst = qpu_NOP(); + update_scoreboard_for_chosen(scoreboard, inst); + qpu_serialize_one_inst(c, inst); + qpu_serialize_one_inst(c, inst); + qpu_serialize_one_inst(c, inst); + } } return time; } static uint32_t -qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, +qpu_schedule_instructions_block(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) @@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, compute_delay(n); } - uint32_t cycles = schedule_instructions(c, &schedule_list, + uint32_t cycles = schedule_instructions(c, scoreboard, block, + &schedule_list, orig_uniform_contents, orig_uniform_data, next_uniform); @@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, return cycles; } +static void +qpu_set_branch_targets(struct vc4_compile *c) +{ + qir_for_each_block(block, c) { + /* The end block of the program has no branch. */ + if (!block->successors[0]) + continue; + + /* If there was no branch instruction, then the successor + * block must follow immediately after this one. + */ + if (block->branch_qpu_ip == ~0) { + assert(block->end_qpu_ip + 1 == + block->successors[0]->start_qpu_ip); + continue; + } + + /* Set the branch target for the block that doesn't follow + * immediately after ours. 
+ */ + uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip]; + assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH); + assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0); + + uint32_t branch_target = + (block->successors[0]->start_qpu_ip - + (block->branch_qpu_ip + 4)) * sizeof(uint64_t); + *branch_inst = (*branch_inst | + QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET)); + + /* Make sure that the if-we-don't-jump successor was scheduled + * just after the delay slots. + */ + if (block->successors[1]) { + assert(block->successors[1]->start_qpu_ip == + block->branch_qpu_ip + 4); + } + } +} + uint32_t qpu_schedule_instructions(struct vc4_compile *c) { @@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c) c->uniform_array_size = c->num_uniforms; uint32_t next_uniform = 0; + struct choose_scoreboard scoreboard; + memset(&scoreboard, 0, sizeof(scoreboard)); + scoreboard.last_waddr_a = ~0; + scoreboard.last_waddr_b = ~0; + scoreboard.last_sfu_write_tick = -10; + if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); qir_for_each_block(block, c) { @@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c) uint32_t cycles = 0; qir_for_each_block(block, c) { - cycles += qpu_schedule_instructions_block(c, block, + block->start_qpu_ip = c->qpu_inst_count; + block->branch_qpu_ip = ~0; + + cycles += qpu_schedule_instructions_block(c, + &scoreboard, + block, uniform_contents, uniform_data, &next_uniform); + + block->end_qpu_ip = c->qpu_inst_count - 1; } + qpu_set_branch_targets(c); + assert(next_uniform == c->num_uniforms); if (debug) { |