summary | refs | log | tree | commit | diff | stats
path: root/src/gallium/drivers/vc4
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r-- src/gallium/drivers/vc4/vc4_context.h 1
-rw-r--r-- src/gallium/drivers/vc4/vc4_opt_copy_propagation.c 5
-rw-r--r-- src/gallium/drivers/vc4/vc4_opt_cse.c 16
-rw-r--r-- src/gallium/drivers/vc4/vc4_program.c 19
-rw-r--r-- src/gallium/drivers/vc4/vc4_qir.c 18
-rw-r--r-- src/gallium/drivers/vc4/vc4_qir.h 13
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu_emit.c 58
-rw-r--r-- src/gallium/drivers/vc4/vc4_register_allocate.c 83
8 files changed, 106 insertions, 107 deletions
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 30fb285eefe..654c46f3c0d 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
struct ra_regs *regs;
unsigned int reg_class_any;
+ unsigned int reg_class_r4_or_a;
unsigned int reg_class_a;
uint8_t prim_mode;
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index d6d2fbf257f..a755de9aa41 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c)
if (inst->op == QOP_MOV &&
inst->dst.file == QFILE_TEMP &&
- inst->src[0].file != QFILE_VPM &&
- !(inst->src[0].file == QFILE_TEMP &&
- (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT ||
- c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) {
+ inst->src[0].file != QFILE_VPM) {
movs[inst->dst.index] = inst->src[0];
}
}
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 51a56504e5e..0e5480ea781 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -46,8 +46,7 @@ struct inst_key {
struct qreg src[4];
/**
* If the instruction depends on the flags, how many SFs have been
- * seen before this instruction, or if it depends on r4, how many r4
- * writes have been seen.
+ * seen before this instruction.
*/
uint32_t implicit_arg_update_count;
};
@@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b)
static struct qinst *
vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
- struct qinst *inst, uint32_t sf_count,
- uint32_t r4_count)
+ struct qinst *inst, uint32_t sf_count)
{
if (inst->dst.file != QFILE_TEMP ||
inst->op == QOP_MOV ||
@@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
qir_get_op_nsrc(inst->op) * sizeof(key.src[0]));
if (qir_depends_on_flags(inst))
key.implicit_arg_update_count = sf_count;
- if (qir_reads_r4(inst))
- key.implicit_arg_update_count = r4_count;
uint32_t hash = _mesa_hash_data(&key, sizeof(key));
struct hash_entry *entry =
@@ -121,7 +117,7 @@ bool
qir_opt_cse(struct vc4_compile *c)
{
bool progress = false;
- uint32_t sf_count = 0, r4_count = 0;
+ uint32_t sf_count = 0;
struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
inst_key_equals);
@@ -138,8 +134,7 @@ qir_opt_cse(struct vc4_compile *c)
if (inst->sf) {
sf_count++;
} else {
- struct qinst *cse = vc4_find_cse(c, ht, inst,
- sf_count, r4_count);
+ struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count);
if (cse) {
inst->src[0] = cse->dst;
for (int i = 1; i < qir_get_op_nsrc(inst->op);
@@ -155,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c)
}
}
}
-
- if (qir_writes_r4(inst))
- r4_count++;
}
ralloc_free(ht);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f2742986beb..5e2a3f448a0 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -105,9 +105,8 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
range->size - 4)));
qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
- struct qreg r4 = qir_TEX_RESULT(c);
c->num_texture_samples++;
- return qir_MOV(c, r4);
+ return qir_TEX_RESULT(c);
}
static struct qreg *
@@ -360,13 +359,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
qir_TEX_S(c, s, texture_u[next_texture_u++]);
c->num_texture_samples++;
- struct qreg r4 = qir_TEX_RESULT(c);
+ struct qreg tex = qir_TEX_RESULT(c);
enum pipe_format format = c->key->tex[unit].format;
struct qreg unpacked[4];
if (util_format_is_depth_or_stencil(format)) {
- struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4,
+ struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
qir_uniform_ui(c, 8)));
struct qreg normalized = qir_FMUL(c, depthf,
qir_uniform_f(c, 1.0f/0xffffff));
@@ -418,7 +417,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
unpacked[i] = depth_output;
} else {
for (int i = 0; i < 4; i++)
- unpacked[i] = qir_R4_UNPACK(c, r4, i);
+ unpacked[i] = qir_UNPACK_8_F(c, tex, i);
}
const uint8_t *format_swiz = vc4_get_format_swizzle(format);
@@ -1305,9 +1304,10 @@ blend_pipeline(struct vc4_compile *c)
if (c->fs_key->blend.blend_enable ||
c->fs_key->blend.colormask != 0xf ||
c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
- struct qreg r4 = qir_TLB_COLOR_READ(c);
+ packed_dst_color = qir_TLB_COLOR_READ(c);
for (int i = 0; i < 4; i++)
- tlb_read_color[i] = qir_R4_UNPACK(c, r4, i);
+ tlb_read_color[i] = qir_UNPACK_8_F(c,
+ packed_dst_color, i);
for (int i = 0; i < 4; i++) {
dst_color[i] = get_swizzled_channel(c,
tlb_read_color,
@@ -1319,11 +1319,6 @@ blend_pipeline(struct vc4_compile *c)
linear_dst_color[i] = dst_color[i];
}
}
-
- /* Save the packed value for logic ops. Can't reuse r4
- * because other things might smash it (like sRGB)
- */
- packed_dst_color = qir_MOV(c, r4);
}
struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 1c96ef4795f..254140a72f5 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_TEX_B] = { "tex_b", 0, 2 },
[QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
[QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
- [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
- [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
- [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 },
- [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 },
[QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
[QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
[QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
@@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst)
}
}
-bool
-qir_reads_r4(struct qinst *inst)
-{
- switch (inst->op) {
- case QOP_R4_UNPACK_A:
- case QOP_R4_UNPACK_B:
- case QOP_R4_UNPACK_C:
- case QOP_R4_UNPACK_D:
- return true;
- default:
- return false;
- }
-}
-
static void
qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
{
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index e2d2574f1b1..7a74018d9af 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -158,10 +158,6 @@ enum qop {
* the destination
*/
QOP_TEX_RESULT,
- QOP_R4_UNPACK_A,
- QOP_R4_UNPACK_B,
- QOP_R4_UNPACK_C,
- QOP_R4_UNPACK_D
};
struct queued_qpu_inst {
@@ -442,7 +438,6 @@ bool qir_is_multi_instruction(struct qinst *inst);
bool qir_is_tex(struct qinst *inst);
bool qir_depends_on_flags(struct qinst *inst);
bool qir_writes_r4(struct qinst *inst);
-bool qir_reads_r4(struct qinst *inst);
bool qir_src_needs_a_file(struct qinst *inst);
struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
@@ -579,14 +574,6 @@ QIR_NODST_1(TLB_DISCARD_SETUP)
QIR_NODST_1(TLB_STENCIL_SETUP)
static inline struct qreg
-qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i)
-{
- struct qreg t = qir_get_temp(c);
- qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef));
- return t;
-}
-
-static inline struct qreg
qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
{
struct qreg t = qir_get_temp(c);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index e1b3f3ce99a..f324056258c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -320,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
abort();
}
- queue(c, qpu_a_MOV(dst, qpu_r4()));
+ if (dst.mux != QPU_MUX_R4)
+ queue(c, qpu_a_MOV(dst, qpu_r4()));
break;
@@ -403,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
*last_inst(c) = qpu_set_sig(*last_inst(c),
QPU_SIG_COLOR_LOAD);
+ if (dst.mux != QPU_MUX_R4)
+ queue(c, qpu_a_MOV(dst, qpu_r4()));
break;
case QOP_TLB_COLOR_WRITE:
@@ -452,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
queue(c, qpu_NOP());
*last_inst(c) = qpu_set_sig(*last_inst(c),
QPU_SIG_LOAD_TMU0);
-
- break;
-
- case QOP_R4_UNPACK_A:
- case QOP_R4_UNPACK_B:
- case QOP_R4_UNPACK_C:
- case QOP_R4_UNPACK_D:
- assert(src[0].mux == QPU_MUX_R4);
- queue(c, qpu_a_MOV(dst, src[0]));
- *last_inst(c) |= QPU_PM;
- *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
- (qinst->op -
- QOP_R4_UNPACK_A),
- QPU_UNPACK);
-
+ if (dst.mux != QPU_MUX_R4)
+ queue(c, qpu_a_MOV(dst, qpu_r4()));
break;
case QOP_UNPACK_8A_F:
@@ -475,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
case QOP_UNPACK_8D_F:
case QOP_UNPACK_16A_F:
case QOP_UNPACK_16B_F: {
- assert(src[0].mux == QPU_MUX_A);
-
- /* Since we're setting the pack bits, if the
- * destination is in A it would get re-packed.
- */
- queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
- qpu_rb(31) : dst),
- src[0], src[0]));
- *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
- QOP_UNPACK_8A_F],
- QPU_UNPACK);
+ if (src[0].mux == QPU_MUX_R4) {
+ queue(c, qpu_a_MOV(dst, src[0]));
+ *last_inst(c) |= QPU_PM;
+ *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
+ (qinst->op -
+ QOP_UNPACK_8A_F),
+ QPU_UNPACK);
+ } else {
+ assert(src[0].mux == QPU_MUX_A);
- if (dst.mux == QPU_MUX_A) {
- queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+ /* Since we're setting the pack bits, if the
+ * destination is in A it would get re-packed.
+ */
+ queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
+ qpu_rb(31) : dst),
+ src[0], src[0]));
+ *last_inst(c) |=
+ QPU_SET_FIELD(unpack_map[qinst->op -
+ QOP_UNPACK_8A_F],
+ QPU_UNPACK);
+
+ if (dst.mux == QPU_MUX_A) {
+ queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+ }
}
}
break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 73964b48dca..a29db1f3abe 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
/* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
* vc4_qpu_emit.c
@@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
/* R4 can't be written as a general purpose register. (it's
* TMU_NOSWAP as a write address).
*/
- if (vc4_regs[i].mux == QPU_MUX_R4)
+ if (vc4_regs[i].mux == QPU_MUX_R4) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
continue;
+ }
ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
}
- vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
- for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
+ for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+ }
ra_set_finalize(vc4->regs, NULL);
}
@@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b)
return a->priority - b->priority;
}
+#define CLASS_BIT_A (1 << 0)
+#define CLASS_BIT_B_OR_ACC (1 << 1)
+#define CLASS_BIT_R4 (1 << 2)
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
uint32_t temp_to_node[c->num_temps];
uint32_t def[c->num_temps];
uint32_t use[c->num_temps];
+ uint8_t class_bits[c->num_temps];
struct qpu_reg *temp_registers = calloc(c->num_temps,
sizeof(*temp_registers));
memset(def, 0, sizeof(def));
@@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
c->num_temps);
- for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(g, i, vc4->reg_class_any);
- }
-
/* Compute the live ranges so we can figure out interference.
*/
uint32_t ip = 0;
@@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
temp_to_node[map[i].temp] = i;
}
- /* Figure out our register classes and preallocated registers*/
+ /* Figure out our register classes and preallocated registers. We
+ * start with any temp being able to be in any file, then instructions
+ * incrementally remove bits that the temp definitely can't be in.
+ */
+ memset(class_bits,
+ CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+ sizeof(class_bits));
+
+ ip = 0;
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+ if (qir_writes_r4(inst)) {
+ /* This instruction writes r4 (and optionally moves
+ * its result to a temp), so nothing else can be
+ * stored in r4 across it.
+ */
+ for (int i = 0; i < c->num_temps; i++) {
+ if (def[i] < ip && use[i] > ip)
+ class_bits[i] &= ~CLASS_BIT_R4;
+ }
+ } else {
+ /* R4 can't be written as a general purpose
+ * register. (it's TMU_NOSWAP as a write address).
+ */
+ if (inst->dst.file == QFILE_TEMP)
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
+ }
+
switch (inst->op) {
case QOP_FRAG_Z:
ra_set_node_reg(g, temp_to_node[inst->dst.index],
@@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
break;
- case QOP_TEX_RESULT:
- case QOP_TLB_COLOR_READ:
- assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
- ra_set_node_reg(g, temp_to_node[inst->dst.index],
- ACC_INDEX + 4);
- break;
-
case QOP_PACK_SCALED:
/* The pack flags require an A-file dst register. */
- ra_set_node_class(g, temp_to_node[inst->dst.index],
- vc4->reg_class_a);
+ class_bits[inst->dst.index] &= CLASS_BIT_A;
break;
default:
@@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
}
if (qir_src_needs_a_file(inst)) {
- ra_set_node_class(g, temp_to_node[inst->src[0].index],
- vc4->reg_class_a);
+ class_bits[inst->src[0].index] &= CLASS_BIT_A;
+ }
+ ip++;
+ }
+
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ int node = temp_to_node[i];
+
+ switch (class_bits[i]) {
+ case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
+ case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+ ra_set_node_class(g, node, vc4->reg_class_any);
+ break;
+ case CLASS_BIT_A | CLASS_BIT_R4:
+ ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+ break;
+ case CLASS_BIT_A:
+ ra_set_node_class(g, node, vc4->reg_class_a);
+ break;
+ default:
+ fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
+ i, class_bits[i]);
+ abort();
+ break;
}
}