diff options
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_context.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_opt_copy_propagation.c | 5 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_opt_cse.c | 16 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_program.c | 19 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qir.c | 18 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qir.h | 13 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_emit.c | 58 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_register_allocate.c | 83 |
8 files changed, 106 insertions, 107 deletions
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index 30fb285eefe..654c46f3c0d 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -270,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_r4_or_a; unsigned int reg_class_a; uint8_t prim_mode; diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d6d2fbf257f..a755de9aa41 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c) if (inst->op == QOP_MOV && inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM && - !(inst->src[0].file == QFILE_TEMP && - (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT || - c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) { + inst->src[0].file != QFILE_VPM) { movs[inst->dst.index] = inst->src[0]; } } diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c index 51a56504e5e..0e5480ea781 100644 --- a/src/gallium/drivers/vc4/vc4_opt_cse.c +++ b/src/gallium/drivers/vc4/vc4_opt_cse.c @@ -46,8 +46,7 @@ struct inst_key { struct qreg src[4]; /** * If the instruction depends on the flags, how many SFs have been - * seen before this instruction, or if it depends on r4, how many r4 - * writes have been seen. + * seen before this instruction. */ uint32_t implicit_arg_update_count; }; @@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b) static struct qinst * vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, - struct qinst *inst, uint32_t sf_count, - uint32_t r4_count) + struct qinst *inst, uint32_t sf_count) { if (inst->dst.file != QFILE_TEMP || inst->op == QOP_MOV || @@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht, qir_get_op_nsrc(inst->op) * sizeof(key.src[0])); if (qir_depends_on_flags(inst)) key.implicit_arg_update_count = sf_count; - if (qir_reads_r4(inst)) - key.implicit_arg_update_count = r4_count; uint32_t hash = _mesa_hash_data(&key, sizeof(key)); struct hash_entry *entry = @@ -121,7 +117,7 @@ bool qir_opt_cse(struct vc4_compile *c) { bool progress = false; - uint32_t sf_count = 0, r4_count = 0; + uint32_t sf_count = 0; struct hash_table *ht = _mesa_hash_table_create(NULL, NULL, inst_key_equals); @@ -138,8 +134,7 @@ qir_opt_cse(struct vc4_compile *c) if (inst->sf) { sf_count++; } else { - struct qinst *cse = vc4_find_cse(c, ht, inst, - sf_count, r4_count); + struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count); if (cse) { inst->src[0] = cse->dst; for (int i = 1; i < qir_get_op_nsrc(inst->op); @@ -155,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c) } } } - - if (qir_writes_r4(inst)) - r4_count++; } ralloc_free(ht); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index f2742986beb..5e2a3f448a0 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -105,9 +105,8 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) range->size - 4))); qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); - struct qreg r4 = qir_TEX_RESULT(c); c->num_texture_samples++; - return qir_MOV(c, r4); + return qir_TEX_RESULT(c); } static struct qreg * @@ -360,13 +359,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) qir_TEX_S(c, s, texture_u[next_texture_u++]); c->num_texture_samples++; - struct qreg r4 = qir_TEX_RESULT(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; struct qreg unpacked[4]; if (util_format_is_depth_or_stencil(format)) { - struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4, + struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex, qir_uniform_ui(c, 8))); struct qreg normalized = qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); @@ -418,7 +417,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unpacked[i] = depth_output; } else { for (int i = 0; i < 4; i++) - unpacked[i] = qir_R4_UNPACK(c, r4, i); + unpacked[i] = qir_UNPACK_8_F(c, tex, i); } const uint8_t *format_swiz = vc4_get_format_swizzle(format); @@ -1305,9 +1304,10 @@ blend_pipeline(struct vc4_compile *c) if (c->fs_key->blend.blend_enable || c->fs_key->blend.colormask != 0xf || c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - struct qreg r4 = qir_TLB_COLOR_READ(c); + packed_dst_color = qir_TLB_COLOR_READ(c); for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_R4_UNPACK(c, r4, i); + tlb_read_color[i] = qir_UNPACK_8_F(c, + packed_dst_color, i); for (int i = 0; i < 4; i++) { dst_color[i] = get_swizzled_channel(c, tlb_read_color, @@ -1319,11 +1319,6 @@ blend_pipeline(struct vc4_compile *c) linear_dst_color[i] = dst_color[i]; } } - - /* Save the packed value for logic ops. Can't reuse r4 - * because other things might smash it (like sRGB) - */ - packed_dst_color = qir_MOV(c, r4); } struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef }; diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 1c96ef4795f..254140a72f5 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 }, - [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 }, - [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 }, - [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 }, [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, @@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst) } } -bool -qir_reads_r4(struct qinst *inst) -{ - switch (inst->op) { - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - return true; - default: - return false; - } -} - static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index e2d2574f1b1..7a74018d9af 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -158,10 +158,6 @@ enum qop { * the destination */ QOP_TEX_RESULT, - QOP_R4_UNPACK_A, - QOP_R4_UNPACK_B, - QOP_R4_UNPACK_C, - QOP_R4_UNPACK_D }; struct queued_qpu_inst { @@ -442,7 +438,6 @@ bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_reads_r4(struct qinst *inst); bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); @@ -579,14 +574,6 @@ QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg -qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef)); - return t; -} - -static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { struct qreg t = qir_get_temp(c); diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index e1b3f3ce99a..f324056258c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -320,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - queue(c, qpu_a_MOV(dst, qpu_r4())); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; @@ -403,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_TLB_COLOR_WRITE: @@ -452,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - - break; - - case QOP_R4_UNPACK_A: - case QOP_R4_UNPACK_B: - case QOP_R4_UNPACK_C: - case QOP_R4_UNPACK_D: - assert(src[0].mux == QPU_MUX_R4); - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_R4_UNPACK_A), - QPU_UNPACK); - + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); break; case QOP_UNPACK_8A_F: @@ -475,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8D_F: case QOP_UNPACK_16A_F: case QOP_UNPACK_16B_F: { - assert(src[0].mux == QPU_MUX_A); - - /* Since we're setting the pack bits, if the - * destination is in A it would get re-packed. - */ - queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? - qpu_rb(31) : dst), - src[0], src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); + if (src[0].mux == QPU_MUX_R4) { + queue(c, qpu_a_MOV(dst, src[0])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + + (qinst->op - + QOP_UNPACK_8A_F), + QPU_UNPACK); + } else { + assert(src[0].mux == QPU_MUX_A); - if (dst.mux == QPU_MUX_A) { - queue(c, qpu_a_MOV(dst, qpu_rb(31))); + /* Since we're setting the pack bits, if the + * destination is in A it would get re-packed. + */ + queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ? + qpu_rb(31) : dst), + src[0], src[0])); + *last_inst(c) |= + QPU_SET_FIELD(unpack_map[qinst->op - + QOP_UNPACK_8A_F], + QPU_UNPACK); + + if (dst.mux == QPU_MUX_A) { + queue(c, qpu_a_MOV(dst, qpu_rb(31))); + } } } break; diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 73964b48dca..a29db1f3abe 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c @@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4) /* R4 can't be written as a general purpose register. (it's * TMU_NOSWAP as a write address). */ - if (vc4_regs[i].mux == QPU_MUX_R4) + if (vc4_regs[i].mux == QPU_MUX_R4) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); continue; + } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); } - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + } ra_set_finalize(vc4->regs, NULL); } @@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b) return a->priority - b->priority; } +#define CLASS_BIT_A (1 << 0) +#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_R4 (1 << 2) + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * @@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; + uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); memset(def, 0, sizeof(def)); @@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(g, i, vc4->reg_class_any); - } - /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; @@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) temp_to_node[map[i].temp] = i; } - /* Figure out our register classes and preallocated registers*/ + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + sizeof(class_bits)); + + ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (qir_writes_r4(inst)) { + /* This instruction writes r4 (and optionally moves + * its result to a temp), so nothing else can be + * stored in r4 across it. + */ + for (int i = 0; i < c->num_temps; i++) { + if (def[i] < ip && use[i] > ip) + class_bits[i] &= ~CLASS_BIT_R4; + } + } else { + /* R4 can't be written as a general purpose + * register. (it's TMU_NOSWAP as a write address). + */ + if (inst->dst.file == QFILE_TEMP) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; + } + switch (inst->op) { case QOP_FRAG_Z: ra_set_node_reg(g, temp_to_node[inst->dst.index], @@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, temp_to_node[inst->dst.index], - ACC_INDEX + 4); - break; - case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, temp_to_node[inst->dst.index], - vc4->reg_class_a); + class_bits[inst->dst.index] &= CLASS_BIT_A; break; default: @@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) } if (qir_src_needs_a_file(inst)) { - ra_set_node_class(g, temp_to_node[inst->src[0].index], - vc4->reg_class_a); + class_bits[inst->src[0].index] &= CLASS_BIT_A; + } + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + switch (class_bits[i]) { + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_any); + break; + case CLASS_BIT_A | CLASS_BIT_R4: + ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + break; + case CLASS_BIT_A: + ra_set_node_class(g, node, vc4->reg_class_a); + break; + default: + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", + i, class_bits[i]); + abort(); + break; } } |