diff options
Diffstat (limited to 'src/broadcom')
-rw-r--r-- | src/broadcom/cle/v3d_packet_v33.xml | 8 | ||||
-rw-r--r-- | src/broadcom/compiler/nir_to_vir.c | 111 | ||||
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c | 96 | ||||
-rw-r--r-- | src/broadcom/compiler/qpu_validate.c | 70 | ||||
-rw-r--r-- | src/broadcom/compiler/v3d_compiler.h | 22 | ||||
-rw-r--r-- | src/broadcom/compiler/vir.c | 6 | ||||
-rw-r--r-- | src/broadcom/compiler/vir_register_allocate.c | 46 | ||||
-rw-r--r-- | src/broadcom/compiler/vir_to_qpu.c | 4 |
8 files changed, 285 insertions, 78 deletions
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml index 094ee00cf12..6be632112a2 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet_v33.xml @@ -700,13 +700,17 @@ <field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/> <field name="Address of default attribute values" size="32" start="8b" type="address"/> <field name="Fragment Shader Code Address" size="29" start="99" type="address"/> - <field name="2-way threadable" size="1" start="96" type="bool"/> - <field name="4-way threadable" size="1" start="97" type="bool"/> + <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/> + <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/> <field name="Propagate NaNs" size="1" start="98" type="bool"/> <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/> <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/> + <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/> + <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/> <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/> <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/> + <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/> + <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/> <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/> </struct> diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 1882c5ace7e..0400a683b71 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -65,6 +65,23 @@ resize_qreg_array(struct v3d_compile *c, (*regs)[i] = c->undef; } +static void +vir_emit_thrsw(struct v3d_compile *c) +{ + if (c->threads == 1) + return; + + 
/* Always thread switch after each texture operation for now. + * + * We could do better by batching a bunch of texture fetches up and + * then doing one thread switch and collecting all their results + * afterward. + */ + c->last_thrsw = vir_NOP(c); + c->last_thrsw->qpu.sig.thrsw = true; + c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); +} + static struct qreg vir_SFU(struct v3d_compile *c, int waddr, struct qreg src) { @@ -118,6 +135,7 @@ indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr) vir_uniform(c, QUNIFORM_UBO_ADDR, 0), indirect_offset); + vir_emit_thrsw(c); return vir_LDTMU(c); } @@ -488,6 +506,8 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) } } + vir_emit_thrsw(c); + struct qreg return_values[4]; for (int i = 0; i < 4; i++) { /* Swizzling .zw of an RG texture should give undefined @@ -1685,6 +1705,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_get_src(c, instr->src[1], 0), vir_uniform_ui(c, i * 4))); + vir_emit_thrsw(c); + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); } break; @@ -2124,6 +2146,62 @@ count_nir_instrs(nir_shader *nir) } #endif +/** + * When demoting a shader down to single-threaded, removes the THRSW + * instructions (one will still be inserted at v3d_vir_to_qpu() for the + * program end). + */ +static void +vir_remove_thrsw(struct v3d_compile *c) +{ + vir_for_each_block(block, c) { + vir_for_each_inst_safe(inst, block) { + if (inst->qpu.sig.thrsw) + vir_remove_instruction(c, inst); + } + } + + c->last_thrsw = NULL; +} + +static void +vir_emit_last_thrsw(struct v3d_compile *c) +{ + /* On V3D before 4.1, we need a TMU op to be outstanding when thread + * switching, so disable threads if we didn't do any TMU ops (each of + * which would have emitted a THRSW). 
+ */ + if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { + c->threads = 1; + if (c->last_thrsw) + vir_remove_thrsw(c); + return; + } + + /* If we're threaded and the last THRSW was in conditional code, then + * we need to emit another one so that we can flag it as the last + * thrsw. + */ + if (c->last_thrsw && !c->last_thrsw_at_top_level) { + assert(c->devinfo->ver >= 41); + vir_emit_thrsw(c); + } + + /* If we're threaded, then we need to mark the last THRSW instruction + * so we can emit a pair of them at QPU emit time. + * + * For V3D 4.x, we can spawn the non-fragment shaders already in the + * post-last-THRSW state, so we can skip this. + */ + if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { + assert(c->devinfo->ver >= 41); + vir_emit_thrsw(c); + } + + if (c->last_thrsw) + c->last_thrsw->is_last_thrsw = true; +} + void v3d_nir_to_vir(struct v3d_compile *c) { @@ -2137,6 +2215,9 @@ v3d_nir_to_vir(struct v3d_compile *c) nir_to_vir(c); + /* Emit the last THRSW before STVPM and TLB writes. */ + vir_emit_last_thrsw(c); + switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: emit_frag_end(c); @@ -2171,5 +2252,33 @@ v3d_nir_to_vir(struct v3d_compile *c) fprintf(stderr, "\n"); } - v3d_vir_to_qpu(c); + /* Compute the live ranges so we can figure out interference. */ + vir_calculate_live_intervals(c); + + /* Attempt to allocate registers for the temporaries. If we fail, + * reduce thread count and try again. + */ + int min_threads = (c->devinfo->ver >= 41) ? 
2 : 1; + struct qpu_reg *temp_registers; + while (true) { + temp_registers = v3d_register_allocate(c); + + if (temp_registers) + break; + + if (c->threads == min_threads) { + fprintf(stderr, "Failed to register allocate at %d threads:\n", + c->threads); + vir_dump(c); + c->failed = true; + return; + } + + c->threads /= 2; + + if (c->threads == 1) + vir_remove_thrsw(c); + } + + v3d_vir_to_qpu(c, temp_registers); } diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index fdec5252b1f..c3b88c345d1 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -1097,13 +1097,30 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, } static bool -valid_thrend_sequence(struct v3d_compile *c, - struct qinst *qinst, int instructions_in_sequence) +valid_thrsw_sequence(struct v3d_compile *c, + struct qinst *qinst, int instructions_in_sequence, + bool is_thrend) { for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) + /* No scheduling SFU when the result would land in the other + * thread. The simulator complains for safety, though it + * would only occur for dead code in our case. + */ + if (slot > 0 && + qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || + v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { + return false; + } + + if (slot > 0 && qinst->qpu.sig.ldvary) return false; + if (is_thrend && + !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) { + return false; + } + /* Note that the list is circular, so we can only do this up * to instructions_in_sequence. 
*/ @@ -1121,7 +1138,8 @@ static int emit_thrsw(struct v3d_compile *c, struct qblock *block, struct choose_scoreboard *scoreboard, - struct qinst *inst) + struct qinst *inst, + bool is_thrend) { int time = 0; @@ -1143,20 +1161,25 @@ emit_thrsw(struct v3d_compile *c, if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) break; - if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1)) + if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1, + is_thrend)) { break; + } merge_inst = prev_inst; if (++slots_filled == 3) break; } + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; + needs_free = true; } else { insert_scheduled_instruction(c, block, scoreboard, inst); time++; slots_filled++; + merge_inst = inst; } /* Insert any extra delay slot NOPs we need. */ @@ -1165,10 +1188,19 @@ emit_thrsw(struct v3d_compile *c, time++; } + /* If we're emitting the last THRSW (other than program end), then + * signal that to the HW by emitting two THRSWs in a row. + */ + if (inst->is_last_thrsw) { + struct qinst *second_inst = + (struct qinst *)merge_inst->link.next; + second_inst->qpu.sig.thrsw = true; + } + /* If we put our THRSW into another instruction, free up the * instruction that didn't end up scheduled into the list. */ - if (merge_inst) + if (needs_free) free(inst); return time; @@ -1293,40 +1325,24 @@ schedule_instructions(struct v3d_compile *c, free(merge->inst); } - if (0 && inst->sig.thrsw) { - /* XXX emit_thrsw(c, scoreboard, qinst); */ + if (inst->sig.thrsw) { + time += emit_thrsw(c, block, scoreboard, qinst, false); } else { - c->qpu_inst_count++; - list_addtail(&qinst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, inst); - } - - scoreboard->tick++; - time++; - - if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH || - inst->sig.thrsw /* XXX */) { - block->branch_qpu_ip = c->qpu_inst_count - 1; - /* Fill the delay slots. 
- * - * We should fill these with actual instructions, - * instead, but that will probably need to be done - * after this, once we know what the leading - * instructions of the successors are (so we can - * handle A/B register file write latency) - */ - /* XXX: scoreboard */ - int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ? - 3 : 2); - for (int i = 0; i < slots; i++) { - struct qinst *nop = vir_nop(); - list_addtail(&nop->link, &block->instructions); - - update_scoreboard_for_chosen(scoreboard, - &nop->qpu); - c->qpu_inst_count++; - scoreboard->tick++; - time++; + insert_scheduled_instruction(c, block, + scoreboard, qinst); + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { + block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + for (int i = 0; i < 3; i++) + emit_nop(c, block, scoreboard); } } } @@ -1488,7 +1504,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) /* Emit the program-end THRSW instruction. 
*/; struct qinst *thrsw = vir_nop(); thrsw->qpu.sig.thrsw = true; - emit_thrsw(c, end_block, &scoreboard, thrsw); + emit_thrsw(c, end_block, &scoreboard, thrsw, true); qpu_set_branch_targets(c); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index 3b2c10eabc6..4ef587c1d52 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -39,6 +39,10 @@ struct v3d_qpu_validate_state { const struct v3d_qpu_instr *last; int ip; int last_sfu_write; + int last_branch_ip; + int last_thrsw_ip; + bool last_thrsw_found; + int thrsw_count; }; static void @@ -63,6 +67,18 @@ fail_instr(struct v3d_qpu_validate_state *state, const char *msg) } static bool +in_branch_delay_slots(struct v3d_qpu_validate_state *state) +{ + return (state->ip - state->last_branch_ip) < 3; +} + +static bool +in_thrsw_delay_slots(struct v3d_qpu_validate_state *state) +{ + return (state->ip - state->last_thrsw_ip) < 3; +} + +static bool qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst, bool (*predicate)(enum v3d_qpu_waddr waddr)) { @@ -136,6 +152,19 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) } } + if (in_thrsw_delay_slots(state)) { + /* There's no way you want to start SFU during the THRSW delay + * slots, since the result would land in the other thread. + */ + if (sfu_writes) { + fail_instr(state, + "SFU write started during THRSW delay slots "); + } + + if (inst->sig.ldvary) + fail_instr(state, "LDVARY during THRSW delay slots"); + } + (void)qpu_magic_waddr_matches; /* XXX */ /* SFU r4 results come back two instructions later. 
No doing @@ -170,6 +199,35 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) if (sfu_writes) state->last_sfu_write = state->ip; + + if (inst->sig.thrsw) { + if (in_branch_delay_slots(state)) + fail_instr(state, "THRSW in a branch delay slot."); + + if (state->last_thrsw_ip == state->ip - 1) { + /* If it's the second THRSW in a row, then it's just a + * last-thrsw signal. + */ + if (state->last_thrsw_found) + fail_instr(state, "Two last-THRSW signals"); + state->last_thrsw_found = true; + } else { + if (in_thrsw_delay_slots(state)) { + fail_instr(state, + "THRSW too close to another THRSW."); + } + state->thrsw_count++; + state->last_thrsw_ip = state->ip; + } + } + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { + if (in_branch_delay_slots(state)) + fail_instr(state, "branch in a branch delay slot."); + if (in_thrsw_delay_slots(state)) + fail_instr(state, "branch in a THRSW delay slot."); + state->last_branch_ip = state->ip; + } } static void @@ -201,10 +259,22 @@ qpu_validate(struct v3d_compile *c) struct v3d_qpu_validate_state state = { .c = c, .last_sfu_write = -10, + .last_thrsw_ip = -10, + .last_branch_ip = -10, .ip = 0, }; vir_for_each_block(block, c) { qpu_validate_block(&state, block); } + + if (state.thrsw_count > 1 && !state.last_thrsw_found) { + fail_instr(&state, + "thread switch found without last-THRSW in program"); + } + + if (state.thrsw_count == 0 || + (state.last_thrsw_found && state.thrsw_count == 1)) { + fail_instr(&state, "No program-end THRSW found"); + } } diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index e17a108233f..cb3614edcb6 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -134,6 +134,7 @@ struct qinst { struct qreg src[3]; bool cond_is_exec_mask; bool has_implicit_uniform; + bool is_last_thrsw; /* After vir_to_qpu.c: If instr reads a uniform, which uniform from * the uncompiled stream it is. 
@@ -522,12 +523,16 @@ struct v3d_compile { uint32_t program_id; uint32_t variant_id; - /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH - * is used to hide texturing latency at the cost of limiting ourselves - * to the bottom half of physical reg space. + /* Set to compile program in 1x, 2x, or 4x threaded mode, where + * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of + * limiting ourselves to part of the physical reg space. + * + * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On + * V3D 4.x, all shaders are 2x threaded, and 4x only divides the + * physical reg space in half. */ - bool fs_threaded; - + uint8_t threads; + struct qinst *last_thrsw; bool last_thrsw_at_top_level; bool failed; @@ -547,7 +552,12 @@ struct v3d_prog_data { uint32_t ubo_size; uint8_t num_inputs; + uint8_t threads; + /* For threads > 1, whether the program should be dispatched in the + * after-final-THRSW state. + */ + bool single_seg; }; struct v3d_vs_prog_data { @@ -674,7 +684,7 @@ void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); void vir_lower_uniforms(struct v3d_compile *c); -void v3d_vir_to_qpu(struct v3d_compile *c); +void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index da4ece2cffe..a063ebc5d53 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -109,7 +109,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) } } - if (inst->qpu.sig.ldtmu) + if (inst->qpu.sig.ldtmu || inst->qpu.sig.thrsw) return true; return false; @@ -528,6 +528,7 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; 
c->variant_id = variant_id; + c->threads = 4; s = nir_shader_clone(c, s); c->s = s; @@ -637,6 +638,9 @@ static void v3d_set_prog_data(struct v3d_compile *c, struct v3d_prog_data *prog_data) { + prog_data->threads = c->threads; + prog_data->single_seg = !c->last_thrsw; + v3d_set_prog_data_uniforms(c, prog_data); v3d_set_prog_data_ubo(c, prog_data); } diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index ff30101ce41..ab3a4e257ff 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -23,6 +23,7 @@ #include "util/ralloc.h" #include "util/register_allocate.h" +#include "common/v3d_device_info.h" #include "v3d_compiler.h" #define QPU_R(i) { .magic = false, .index = i } @@ -35,15 +36,17 @@ bool vir_init_reg_sets(struct v3d_compiler *compiler) { + /* Allocate up to 3 regfile classes, for the ways the physical + * register file can be divided up for fragment shader threading. + */ + int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3); + compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, true); if (!compiler->regs) return false; - /* Allocate 3 regfile classes, for the ways the physical register file - * can be divided up for fragment shader threading. - */ - for (int threads = 0; threads < 3; threads++) { + for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_phys_or_acc[threads] = ra_alloc_reg_class(compiler->regs); compiler->reg_class_phys[threads] = @@ -105,6 +108,16 @@ v3d_register_allocate(struct v3d_compile *c) struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, c->num_temps + ARRAY_SIZE(acc_nodes)); + /* Convert 1, 2, 4 threads to 0, 1, 2 index. + * + * V3D 4.x has double the physical register space, so 64 physical regs + * are available at both 1x and 2x threading, and 4x has 32. 
+ */ + int thread_index = ffs(c->threads) - 1; + if (c->devinfo->ver >= 40) { + if (thread_index >= 1) + thread_index--; + } /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread @@ -117,9 +130,6 @@ v3d_register_allocate(struct v3d_compile *c) ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); } - /* Compute the live ranges so we can figure out interference. */ - vir_calculate_live_intervals(c); - for (uint32_t i = 0; i < c->num_temps; i++) { map[i].temp = i; map[i].priority = c->temp_end[i] - c->temp_start[i]; @@ -204,23 +214,15 @@ v3d_register_allocate(struct v3d_compile *c) } } -#if 0 - switch (inst->op) { - case QOP_THRSW: + if (inst->qpu.sig.thrsw) { /* All accumulators are invalidated across a thread * switch. */ for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] < ip && c->temp_end[i] > ip) - class_bits[i] &= ~(CLASS_BIT_R0_R3 | - CLASS_BIT_R4); + class_bits[i] &= CLASS_BIT_PHYS; } - break; - - default: - break; } -#endif ip++; } @@ -228,14 +230,14 @@ v3d_register_allocate(struct v3d_compile *c) for (uint32_t i = 0; i < c->num_temps; i++) { if (class_bits[i] == CLASS_BIT_PHYS) { ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys[c->fs_threaded]); + c->compiler->reg_class_phys[thread_index]); } else { assert(class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4)); ra_set_node_class(g, temp_to_node[i], - c->compiler->reg_class_phys_or_acc[c->fs_threaded]); + c->compiler->reg_class_phys_or_acc[thread_index]); } } @@ -252,12 +254,6 @@ v3d_register_allocate(struct v3d_compile *c) bool ok = ra_allocate(g); if (!ok) { - if (!c->fs_threaded) { - fprintf(stderr, "Failed to register allocate:\n"); - vir_dump(c); - } - - c->failed = true; free(temp_registers); return NULL; } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index 955eb96a87e..9229fa5ba47 100644 --- 
a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -319,10 +319,8 @@ v3d_dump_qpu(struct v3d_compile *c) } void -v3d_vir_to_qpu(struct v3d_compile *c) +v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) { - struct qpu_reg *temp_registers = v3d_register_allocate(c); - /* Reset the uniform count to how many will be actually loaded by the * generated QPU code. */ |