diff options
author | Eric Anholt <[email protected]> | 2018-01-08 11:55:31 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2018-01-12 21:55:30 -0800 |
commit | 90269ba353333be13e54549ecff3adb8803661db (patch) | |
tree | 228c677b3b355ea9e0dee929e6ed8d0afb85dd0c /src/broadcom/compiler/qpu_schedule.c | |
parent | 86a12b4d5a49c68f4613513d2846c5eb8e56a677 (diff) |
broadcom/vc5: Use THRSW to enable multi-threaded shaders.
This is a major performance boost on all of V3D, but is required on V3D
4.x where shaders are always either 2- or 4-threaded.
Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c | 96 |
1 file changed, 56 insertions, 40 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index fdec5252b1f..c3b88c345d1 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -1097,13 +1097,30 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, } static bool -valid_thrend_sequence(struct v3d_compile *c, - struct qinst *qinst, int instructions_in_sequence) +valid_thrsw_sequence(struct v3d_compile *c, + struct qinst *qinst, int instructions_in_sequence, + bool is_thrend) { for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) + /* No scheduling SFU when the result would land in the other + * thread. The simulator complains for safety, though it + * would only occur for dead code in our case. + */ + if (slot > 0 && + qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || + v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { + return false; + } + + if (slot > 0 && qinst->qpu.sig.ldvary) return false; + if (is_thrend && + !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) { + return false; + } + /* Note that the list is circular, so we can only do this up * to instructions_in_sequence. 
*/ @@ -1121,7 +1138,8 @@ static int emit_thrsw(struct v3d_compile *c, struct qblock *block, struct choose_scoreboard *scoreboard, - struct qinst *inst) + struct qinst *inst, + bool is_thrend) { int time = 0; @@ -1143,20 +1161,25 @@ emit_thrsw(struct v3d_compile *c, if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) break; - if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1)) + if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1, + is_thrend)) { break; + } merge_inst = prev_inst; if (++slots_filled == 3) break; } + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; + needs_free = true; } else { insert_scheduled_instruction(c, block, scoreboard, inst); time++; slots_filled++; + merge_inst = inst; } /* Insert any extra delay slot NOPs we need. */ @@ -1165,10 +1188,19 @@ emit_thrsw(struct v3d_compile *c, time++; } + /* If we're emitting the last THRSW (other than program end), then + * signal that to the HW by emitting two THRSWs in a row. + */ + if (inst->is_last_thrsw) { + struct qinst *second_inst = + (struct qinst *)merge_inst->link.next; + second_inst->qpu.sig.thrsw = true; + } + /* If we put our THRSW into another instruction, free up the * instruction that didn't end up scheduled into the list. */ - if (merge_inst) + if (needs_free) free(inst); return time; @@ -1293,40 +1325,24 @@ schedule_instructions(struct v3d_compile *c, free(merge->inst); } - if (0 && inst->sig.thrsw) { - /* XXX emit_thrsw(c, scoreboard, qinst); */ + if (inst->sig.thrsw) { + time += emit_thrsw(c, block, scoreboard, qinst, false); } else { - c->qpu_inst_count++; - list_addtail(&qinst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, inst); - } - - scoreboard->tick++; - time++; - - if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH || - inst->sig.thrsw /* XXX */) { - block->branch_qpu_ip = c->qpu_inst_count - 1; - /* Fill the delay slots. 
- * - * We should fill these with actual instructions, - * instead, but that will probably need to be done - * after this, once we know what the leading - * instructions of the successors are (so we can - * handle A/B register file write latency) - */ - /* XXX: scoreboard */ - int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ? - 3 : 2); - for (int i = 0; i < slots; i++) { - struct qinst *nop = vir_nop(); - list_addtail(&nop->link, &block->instructions); - - update_scoreboard_for_chosen(scoreboard, - &nop->qpu); - c->qpu_inst_count++; - scoreboard->tick++; - time++; + insert_scheduled_instruction(c, block, + scoreboard, qinst); + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { + block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + for (int i = 0; i < 3; i++) + emit_nop(c, block, scoreboard); } } } @@ -1488,7 +1504,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) /* Emit the program-end THRSW instruction. */; struct qinst *thrsw = vir_nop(); thrsw->qpu.sig.thrsw = true; - emit_thrsw(c, end_block, &scoreboard, thrsw); + emit_thrsw(c, end_block, &scoreboard, thrsw, true); qpu_set_branch_targets(c); |