summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/vc4/vc4_register_allocate.c
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2015-07-31 11:46:56 -0700
committerEric Anholt <[email protected]>2015-08-04 17:19:01 -0700
commitcc8fb2904673588d31b660dbfaf692615b5202dd (patch)
tree9eb8039f76db45555999e01939ce7a932c5398a6 /src/gallium/drivers/vc4/vc4_register_allocate.c
parent9b403c0756ecf806a8ff768bd73a4cbf42986bdb (diff)
vc4: Make r4-writes implicitly move to a temp, and allocate temps to r4.
Previously, SFU values always moved to a temporary, and TLB color reads and texture reads always lived in r4. Instead, we can have these results just be normal temporaries, and the register allocator can leave the values in r4 when they don't interfere with anything else using r4. shader-db results: total instructions in shared programs: 100809 -> 100040 (-0.76%) instructions in affected programs: 42383 -> 41614 (-1.81%)
Diffstat (limited to 'src/gallium/drivers/vc4/vc4_register_allocate.c')
-rw-r--r--src/gallium/drivers/vc4/vc4_register_allocate.c83
1 files changed, 64 insertions, 19 deletions
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 73964b48dca..a29db1f3abe 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
/* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
* vc4_qpu_emit.c
@@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
/* R4 can't be written as a general purpose register. (it's
* TMU_NOSWAP as a write address).
*/
- if (vc4_regs[i].mux == QPU_MUX_R4)
+ if (vc4_regs[i].mux == QPU_MUX_R4) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
continue;
+ }
ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
}
- vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
- for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
+ for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+ }
ra_set_finalize(vc4->regs, NULL);
}
@@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b)
return a->priority - b->priority;
}
+#define CLASS_BIT_A (1 << 0)
+#define CLASS_BIT_B_OR_ACC (1 << 1)
+#define CLASS_BIT_R4 (1 << 2)
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
uint32_t temp_to_node[c->num_temps];
uint32_t def[c->num_temps];
uint32_t use[c->num_temps];
+ uint8_t class_bits[c->num_temps];
struct qpu_reg *temp_registers = calloc(c->num_temps,
sizeof(*temp_registers));
memset(def, 0, sizeof(def));
@@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
c->num_temps);
- for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(g, i, vc4->reg_class_any);
- }
-
/* Compute the live ranges so we can figure out interference.
*/
uint32_t ip = 0;
@@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
temp_to_node[map[i].temp] = i;
}
- /* Figure out our register classes and preallocated registers*/
+ /* Figure out our register classes and preallocated registers. We
+ * start with any temp being able to be in any file, then instructions
+ * incrementally remove bits that the temp definitely can't be in.
+ */
+ memset(class_bits,
+ CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+ sizeof(class_bits));
+
+ ip = 0;
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+ if (qir_writes_r4(inst)) {
+ /* This instruction writes r4 (and optionally moves
+ * its result to a temp), so nothing else can be
+ * stored in r4 across it.
+ */
+ for (int i = 0; i < c->num_temps; i++) {
+ if (def[i] < ip && use[i] > ip)
+ class_bits[i] &= ~CLASS_BIT_R4;
+ }
+ } else {
+ /* R4 can't be written as a general purpose
+ * register. (it's TMU_NOSWAP as a write address).
+ */
+ if (inst->dst.file == QFILE_TEMP)
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
+ }
+
switch (inst->op) {
case QOP_FRAG_Z:
ra_set_node_reg(g, temp_to_node[inst->dst.index],
@@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
break;
- case QOP_TEX_RESULT:
- case QOP_TLB_COLOR_READ:
- assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
- ra_set_node_reg(g, temp_to_node[inst->dst.index],
- ACC_INDEX + 4);
- break;
-
case QOP_PACK_SCALED:
/* The pack flags require an A-file dst register. */
- ra_set_node_class(g, temp_to_node[inst->dst.index],
- vc4->reg_class_a);
+ class_bits[inst->dst.index] &= CLASS_BIT_A;
break;
default:
@@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
}
if (qir_src_needs_a_file(inst)) {
- ra_set_node_class(g, temp_to_node[inst->src[0].index],
- vc4->reg_class_a);
+ class_bits[inst->src[0].index] &= CLASS_BIT_A;
+ }
+ ip++;
+ }
+
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ int node = temp_to_node[i];
+
+ switch (class_bits[i]) {
+ case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
+ case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+ ra_set_node_class(g, node, vc4->reg_class_any);
+ break;
+ case CLASS_BIT_A | CLASS_BIT_R4:
+ ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+ break;
+ case CLASS_BIT_A:
+ ra_set_node_class(g, node, vc4->reg_class_a);
+ break;
+ default:
+ fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
+ i, class_bits[i]);
+ abort();
+ break;
}
}