summaryrefslogtreecommitdiffstats
path: root/src/broadcom
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2019-02-14 21:11:20 -0800
committerEric Anholt <[email protected]>2019-02-18 18:09:07 -0800
commit1a775d43c9360257b267d097f60fc6ef751730f2 (patch)
treeb78c5c58b0416bb1044c4eaa9ea45f071abf853e /src/broadcom
parent5a84d46896dc16309325c0e60674347c12665eef (diff)
v3d: Delay emitting ldvpm on V3D 4.x until it's actually used.
For V3D 3.x, we emitted the ldvpms all at the top so that we didn't need to do VPM setup when the load_inputs are out of order. For V3D 4.x, we can reduce register pressure by delaying our loads until they're actually needed. This also avoids a bunch of silly MOVs in the pre-opt VIR dump. total instructions in shared programs: 6421415 -> 6419933 (-0.02%) total uniforms in shared programs: 2393139 -> 2393140 (<.01%) total threads in shared programs: 153864 -> 153906 (0.03%)
Diffstat (limited to 'src/broadcom')
-rw-r--r--src/broadcom/compiler/nir_to_vir.c49
1 files changed, 43 insertions, 6 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 27694f66a44..dc65e379b7f 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1537,6 +1537,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
&num_components, ~0);
}
+ /* The actual loads will happen directly in nir_intrinsic_load_input
+ * on newer versions.
+ */
+ if (c->devinfo->ver >= 40)
+ return;
+
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + 1) * 4);
@@ -1868,12 +1874,43 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_input:
- for (int i = 0; i < instr->num_components; i++) {
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
+ offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
+ c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU.
+ */
+ int index = 0;
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset =
+ vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 +
+ comp]));
+ }
}
break;