summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2016-04-27 12:14:07 -0700
committer: Eric Anholt <[email protected]> 2016-07-13 23:54:15 -0700
commit: 44df061aaad96fc5db630ae69fb2fe2a03bb5659 (patch)
tree: fca4235e0746387a22f7ab33dcf385015a904cab
parent: a59da513d3229c883809ac2088c9612abcec1470 (diff)
vc4: Add support for scheduling of branch instructions.
For now we don't fill the delay slots, and instead just drop in NOPs.
-rw-r--r-- src/gallium/drivers/vc4/vc4_qir.h | 11
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 120
2 files changed, 114 insertions, 17 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index fabdf10e24d..e7ddfaa1fcb 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -362,6 +362,17 @@ struct qblock {
int index;
+ /* Instruction IPs for the first and last instruction of the block.
+ * Set by vc4_qpu_schedule.c.
+ */
+ uint32_t start_qpu_ip;
+ uint32_t end_qpu_ip;
+
+ /* Instruction IP for the branch instruction of the block. Set by
+ * vc4_qpu_schedule.c.
+ */
+ uint32_t branch_qpu_ip;
+
/** @{ used by vc4_qir_live_variables.c */
BITSET_WORD *def;
BITSET_WORD *use;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index fad10e509e2..a55b0351402 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
if (sig != QPU_SIG_LOAD_IMM) {
process_raddr_deps(state, n, raddr_a, true);
- if (sig != QPU_SIG_SMALL_IMM)
+ if (sig != QPU_SIG_SMALL_IMM &&
+ sig != QPU_SIG_BRANCH)
process_raddr_deps(state, n, raddr_b, false);
}
@@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_read_dep(state, state->last_tlb, n);
break;
+ case QPU_SIG_BRANCH:
+ add_read_dep(state, state->last_sf, n);
+ break;
+
case QPU_SIG_PROG_END:
case QPU_SIG_WAIT_FOR_SCOREBOARD:
case QPU_SIG_SCOREBOARD_UNLOCK:
case QPU_SIG_COVERAGE_LOAD:
case QPU_SIG_COLOR_LOAD_END:
case QPU_SIG_ALPHA_MASK_LOAD:
- case QPU_SIG_BRANCH:
fprintf(stderr, "Unhandled signal bits %d\n", sig);
abort();
}
process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
- if (inst & QPU_SF)
+ if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
add_write_dep(state, &state->last_sf, n);
}
@@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
list_for_each_entry(struct schedule_node, n, schedule_list, link) {
uint64_t inst = n->inst->inst;
+ /* Don't choose the branch instruction until it's the last one
+ * left. XXX: We could potentially choose it before it's the
+ * last one, if the remaining instructions fit in the delay
+ * slots.
+ */
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
+ !list_is_singular(schedule_list)) {
+ continue;
+ }
+
/* "An instruction must not read from a location in physical
* regfile A or B that was written to by the previous
* instruction."
@@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list,
}
static uint32_t
-schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
+schedule_instructions(struct vc4_compile *c,
+ struct choose_scoreboard *scoreboard,
+ struct qblock *block,
+ struct list_head *schedule_list,
enum quniform_contents *orig_uniform_contents,
uint32_t *orig_uniform_data,
uint32_t *next_uniform)
{
- struct choose_scoreboard scoreboard;
uint32_t time = 0;
- memset(&scoreboard, 0, sizeof(scoreboard));
- scoreboard.last_waddr_a = ~0;
- scoreboard.last_waddr_b = ~0;
- scoreboard.last_sfu_write_tick = -10;
-
if (debug) {
fprintf(stderr, "initial deps:\n");
dump_state(schedule_list);
@@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
while (!list_empty(schedule_list)) {
struct schedule_node *chosen =
- choose_instruction_to_schedule(&scoreboard,
+ choose_instruction_to_schedule(scoreboard,
schedule_list,
NULL);
struct schedule_node *merge = NULL;
@@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
(*next_uniform)++;
}
- merge = choose_instruction_to_schedule(&scoreboard,
+ merge = choose_instruction_to_schedule(scoreboard,
schedule_list,
chosen);
if (merge) {
@@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
qpu_serialize_one_inst(c, inst);
- update_scoreboard_for_chosen(&scoreboard, inst);
+ update_scoreboard_for_chosen(scoreboard, inst);
/* Now that we've scheduled a new instruction, some of its
* children can be promoted to the list of instructions ready to
@@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
mark_instruction_scheduled(schedule_list, time, chosen, false);
mark_instruction_scheduled(schedule_list, time, merge, false);
- scoreboard.tick++;
+ scoreboard->tick++;
time++;
+
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
+ block->branch_qpu_ip = c->qpu_inst_count - 1;
+ /* Fill the delay slots.
+ *
+ * We should fill these with actual instructions,
+ * instead, but that will probably need to be done
+ * after this, once we know what the leading
+ * instructions of the successors are (so we can
+ * handle A/B register file write latency)
+ */
+ inst = qpu_NOP();
+ update_scoreboard_for_chosen(scoreboard, inst);
+ qpu_serialize_one_inst(c, inst);
+ qpu_serialize_one_inst(c, inst);
+ qpu_serialize_one_inst(c, inst);
+ }
}
return time;
}
static uint32_t
-qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
+qpu_schedule_instructions_block(struct vc4_compile *c,
+ struct choose_scoreboard *scoreboard,
+ struct qblock *block,
enum quniform_contents *orig_uniform_contents,
uint32_t *orig_uniform_data,
uint32_t *next_uniform)
@@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
compute_delay(n);
}
- uint32_t cycles = schedule_instructions(c, &schedule_list,
+ uint32_t cycles = schedule_instructions(c, scoreboard, block,
+ &schedule_list,
orig_uniform_contents,
orig_uniform_data,
next_uniform);
@@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
return cycles;
}
+static void
+qpu_set_branch_targets(struct vc4_compile *c)
+{
+ qir_for_each_block(block, c) {
+ /* The end block of the program has no branch. */
+ if (!block->successors[0])
+ continue;
+
+ /* If there was no branch instruction, then the successor
+ * block must follow immediately after this one.
+ */
+ if (block->branch_qpu_ip == ~0) {
+ assert(block->end_qpu_ip + 1 ==
+ block->successors[0]->start_qpu_ip);
+ continue;
+ }
+
+ /* Set the branch target for the block that doesn't follow
+ * immediately after ours.
+ */
+ uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
+ assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
+ assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
+
+ uint32_t branch_target =
+ (block->successors[0]->start_qpu_ip -
+ (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
+ *branch_inst = (*branch_inst |
+ QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
+
+ /* Make sure that the if-we-don't-jump successor was scheduled
+ * just after the delay slots.
+ */
+ if (block->successors[1]) {
+ assert(block->successors[1]->start_qpu_ip ==
+ block->branch_qpu_ip + 4);
+ }
+ }
+}
+
uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
@@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c)
c->uniform_array_size = c->num_uniforms;
uint32_t next_uniform = 0;
+ struct choose_scoreboard scoreboard;
+ memset(&scoreboard, 0, sizeof(scoreboard));
+ scoreboard.last_waddr_a = ~0;
+ scoreboard.last_waddr_b = ~0;
+ scoreboard.last_sfu_write_tick = -10;
+
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
qir_for_each_block(block, c) {
@@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c)
uint32_t cycles = 0;
qir_for_each_block(block, c) {
- cycles += qpu_schedule_instructions_block(c, block,
+ block->start_qpu_ip = c->qpu_inst_count;
+ block->branch_qpu_ip = ~0;
+
+ cycles += qpu_schedule_instructions_block(c,
+ &scoreboard,
+ block,
uniform_contents,
uniform_data,
&next_uniform);
+
+ block->end_qpu_ip = c->qpu_inst_count - 1;
}
+ qpu_set_branch_targets(c);
+
assert(next_uniform == c->num_uniforms);
if (debug) {