summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/vc4/kernel
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2014-12-10 14:56:46 -0800
committer: Eric Anholt <[email protected]> 2014-12-17 19:35:13 -0800
commit: e473fbe4690b5cbe3769042a4917f22559e2ba8d (patch)
tree: d2c2a467d69a4713651b40bf269db9691544baab /src/gallium/drivers/vc4/kernel
parent: ff266483fb61fd69775daf5c931ca7a56a26f4ac (diff)
vc4: Add support for turning constant uniforms into small immediates.
Small immediates have the downside of taking over the raddr B field, so you might have less chance to pack instructions together thanks to raddr B conflicts. However, it also reduces some register pressure since it lets you load 2 "uniform" values in one instruction (avoiding a previous load of the constant value to a register), and increases some pairing for the same reason.

total uniforms in shared programs: 16231 -> 13374 (-17.60%)
uniforms in affected programs: 10280 -> 7423 (-27.79%)
total instructions in shared programs: 40795 -> 41168 (0.91%)
instructions in affected programs: 25551 -> 25924 (1.46%)

In a previous version of this patch I had a reduction in instruction count by forcing the other args alongside a SMALL_IMM to be in the A file or accumulators, but that increases register pressure and had a bug in handling FRAG_Z. In this patch I just use raddr conflict resolution, which is more expensive. I think I'd rather tweak allocation to have some way to slightly prefer good choices for files in general, rather than risk failing to register allocate by forcing things into register classes.
Diffstat (limited to 'src/gallium/drivers/vc4/kernel')
-rw-r--r-- src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c | 17
1 files changed, 14 insertions, 3 deletions
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
index f5e152bab55..48bc683da5c 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -133,12 +133,18 @@ check_tmu_write(uint64_t inst,
int tmu = waddr > QPU_W_TMU0_B;
bool submit = is_tmu_submit(waddr);
bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
if (is_direct) {
uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
uint32_t clamp_offset = ~0;
+ if (sig == QPU_SIG_SMALL_IMM) {
+ DRM_ERROR("direct TMU read used small immediate\n");
+ return false;
+ }
+
/* Make sure that this texture load is an add of the base
* address of the UBO to a clamped offset within the UBO.
*/
@@ -180,7 +186,8 @@ check_tmu_write(uint64_t inst,
validation_state->tmu_setup[tmu].is_direct = true;
} else {
- if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) {
+ if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
+ raddr_b == QPU_R_UNIF)) {
DRM_ERROR("uniform read in the same instruction as "
"texture setup.\n");
return false;
@@ -298,6 +305,7 @@ track_live_clamps(uint64_t inst,
uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
bool is_b = inst & QPU_WS;
uint32_t live_reg_index;
@@ -305,7 +313,8 @@ track_live_clamps(uint64_t inst,
return;
if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
- !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+ !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+ sig != QPU_SIG_SMALL_IMM)) {
return;
}
@@ -344,9 +353,10 @@ check_instruction_reads(uint64_t inst,
{
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
if (raddr_a == QPU_R_UNIF ||
- raddr_b == QPU_R_UNIF) {
+ (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
/* This can't overflow the uint32_t, because we're reading 8
* bytes of instruction to increment by 4 here, so we'd
* already be OOM.
@@ -401,6 +411,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
case QPU_SIG_LOAD_TMU0:
case QPU_SIG_LOAD_TMU1:
case QPU_SIG_PROG_END:
+ case QPU_SIG_SMALL_IMM:
if (!check_instruction_writes(inst, validated_shader,
&validation_state)) {
DRM_ERROR("Bad write at ip %d\n", ip);