summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/vc4
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2015-01-10 15:59:56 +1300
committerEric Anholt <[email protected]>2015-01-11 08:57:24 +1300
commit355156d2f75ffb5fae90dd6583ae3cf465ba40b4 (patch)
tree59643c4204e3f666e51ee91c69a0daf48242c113 /src/gallium/drivers/vc4
parenta8e14c293b19a2d298f91f283d6b6839f36fb518 (diff)
vc4: Avoid the save/restore of r3 for raddr conflicts, just use ra31.
Turns out this was harmful in code quality: total instructions in shared programs: 39487 -> 38845 (-1.63%) instructions in affected programs: 22522 -> 21880 (-2.85%) This costs us yet another register, which is painful since it means more programs might fail to compile). However, the alternative was causing us trouble where we'd save/restore r3 while it contained a MIN-ed direct texture offset, causing the kernel to fail to validate our shaders (such as in GLB2.7).
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_emit.c45
-rw-r--r--src/gallium/drivers/vc4/vc4_register_allocate.c4
2 files changed, 11 insertions, 38 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index f3f50ec68f8..7531be5cf89 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -96,13 +96,12 @@ swap_file(struct qpu_reg *src)
* address.
*
* In that case, we need to move one to a temporary that can be used in the
- * instruction, instead.
+ * instruction, instead. We reserve ra31/rb31 for this purpose.
*/
-static bool
+static void
fixup_raddr_conflict(struct vc4_compile *c,
struct qpu_reg dst,
- struct qpu_reg *src0, struct qpu_reg *src1,
- bool r3_live)
+ struct qpu_reg *src0, struct qpu_reg *src1)
{
uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
@@ -111,31 +110,18 @@ fixup_raddr_conflict(struct vc4_compile *c,
mux0 != mux1 ||
(src0->addr == src1->addr &&
src0->mux == src1->mux)) {
- return false;
+ return;
}
if (swap_file(src0) || swap_file(src1))
- return false;
+ return;
if (mux0 == QPU_MUX_A) {
- /* If we're conflicting over the A regfile, then we can just
- * use the reserved rb31.
- */
queue(c, qpu_a_MOV(qpu_rb(31), *src1));
*src1 = qpu_rb(31);
- return false;
} else {
- /* Otherwise, we need a non-B regfile. So, we spill r3 out to
- * rb31, then store our desired value in r3, and tell the
- * caller to put rb31 back into r3 when we're done.
- */
- if (r3_live)
- queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
- queue(c, qpu_a_MOV(qpu_r3(), *src1));
-
- *src1 = qpu_r3();
-
- return r3_live && dst.mux != QPU_MUX_R3;
+ queue(c, qpu_a_MOV(qpu_ra(31), *src1));
+ *src1 = qpu_ra(31);
}
}
@@ -148,8 +134,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
uint32_t vpm_read_fifo_count = 0;
uint32_t vpm_read_offset = 0;
int last_vpm_read_index = -1;
- bool written_r3 = false;
- bool needs_restore;
/* Map from the QIR ops enum order to QPU unpack bits. */
static const uint32_t unpack_map[] = {
QPU_UNPACK_8A,
@@ -467,12 +451,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
break;
case QOP_TEX_DIRECT:
- needs_restore = fixup_raddr_conflict(c, dst,
- &src[0], &src[1],
- written_r3);
+ fixup_raddr_conflict(c, dst, &src[0], &src[1]);
queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
- if (needs_restore)
- queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
break;
case QOP_TEX_RESULT:
@@ -554,9 +534,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
if (qir_get_op_nsrc(qinst->op) == 1)
src[1] = src[0];
- needs_restore = fixup_raddr_conflict(c, dst,
- &src[0], &src[1],
- written_r3);
+ fixup_raddr_conflict(c, dst, &src[0], &src[1]);
if (translate[qinst->op].is_mul) {
queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -567,14 +545,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
dst,
src[0], src[1]));
}
- if (needs_restore)
- queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
break;
}
-
- if (dst.mux == QPU_MUX_R3)
- written_r3 = true;
}
qpu_schedule_instructions(c);
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 9eae7fca758..efd9d51eaa0 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
- /* Reserve rb31 for spilling fixup_raddr_conflict() in
+ /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
* vc4_qpu_emit.c
*/
- if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
+ if (vc4_regs[i].addr == 31)
continue;
/* R4 can't be written as a general purpose register. (it's