diff options
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_schedule.c | 121 |
1 files changed, 75 insertions, 46 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index cf916198334..9141396c872 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -575,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, struct schedule_node *chosen = NULL; int chosen_prio = 0; + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. + */ + if (prev_inst) { + uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst, + QPU_SIG); + if (prev_sig == QPU_SIG_THREAD_SWITCH || + prev_sig == QPU_SIG_LAST_THREAD_SWITCH) { + return NULL; + } + } + list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); /* Don't choose the branch instruction until it's the last one * left. XXX: We could potentially choose it before it's the * last one, if the remaining instructions fit in the delay * slots. */ - if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + if (sig == QPU_SIG_BRANCH && !list_is_singular(schedule_list)) { continue; } @@ -607,6 +620,14 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, * that they're compatible. */ if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. + */ + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + continue; + } + if (prev_inst->uniform != -1 && n->uniform != -1) continue; @@ -820,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list, } } +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP); + assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. */ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); + } +} + static uint32_t schedule_instructions(struct vc4_compile *c, struct choose_scoreboard *scoreboard, @@ -830,7 +896,6 @@ schedule_instructions(struct vc4_compile *c, uint32_t *next_uniform) { uint32_t time = 0; - uint32_t last_thread_switch = 0; if (debug) { fprintf(stderr, "initial deps:\n"); @@ -913,10 +978,6 @@ schedule_instructions(struct vc4_compile *c, fprintf(stderr, "\n"); } - qpu_serialize_one_inst(c, inst); - - update_scoreboard_for_chosen(scoreboard, inst); - /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to * be scheduled. Update the children's unblocked time for this @@ -925,6 +986,14 @@ schedule_instructions(struct vc4_compile *c, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || + QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { + emit_thrsw(c, scoreboard, inst); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + scoreboard->tick++; time++; @@ -943,46 +1012,6 @@ schedule_instructions(struct vc4_compile *c, qpu_serialize_one_inst(c, inst); qpu_serialize_one_inst(c, inst); qpu_serialize_one_inst(c, inst); - } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || - QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { - int last = c->qpu_inst_count - 1; - - /* The thread switch occurs after two delay slots. - * Shift the signal upwards, if there is an - * instruction without a signal there. Watch out for - * the last thread switch as theoretically it could be - * only two instructions away. - */ - - /* Remove sig from the instruction */ - enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG); - c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last], - QPU_SIG_NONE, - QPU_SIG); - /* Compute how far we can shift */ - int max_shift = MIN2(last - last_thread_switch, 2); - /* If both instructions in front have a signal set, - * reset the signal on the current instruction.*/ - int shift; - for (shift = max_shift; shift >= 0; --shift) { - int ip = last - shift; - if (QPU_GET_FIELD(c->qpu_insts[ip], - QPU_SIG) == QPU_SIG_NONE) { - c->qpu_insts[ip] = - QPU_UPDATE_FIELD( - c->qpu_insts[ip], - sig, QPU_SIG); - break; - } - } - /* If necessarry, add filling NOPs*/ - for (int i = 0; i < 2 - shift; ++i) { - update_scoreboard_for_chosen(scoreboard, - qpu_NOP()); - qpu_serialize_one_inst(c, qpu_NOP()); - } - /* Avoid branching in a thread switch*/ - last_thread_switch = c->qpu_inst_count - 1; } } |