v3d: Delay emitting ldvpm on V3D 4.x until it's actually used.

For V3D 3.x, we emitted the ldvpms all at the top so that we didn't need to do VPM setup when the load_inputs are out of order. For V3D 4.x, we can reduce register pressure by delaying our loads until they're actually needed. This also avoids a bunch of silly MOVs in the pre-opt VIR dump.

total instructions in shared programs: 6421415 -> 6419933 (-0.02%)
total uniforms in shared programs: 2393139 -> 2393140 (<.01%)
total threads in shared programs: 153864 -> 153906 (0.03%)
author: Eric Anholt <[email protected]> 2019-02-14 21:11:20 -0800
committer: Eric Anholt <[email protected]> 2019-02-18 18:09:07 -0800
commit: 1a775d43c9360257b267d097f60fc6ef751730f2 (patch)
tree: b78c5c58b0416bb1044c4eaa9ea45f071abf853e /src/broadcom
parent: 5a84d46896dc16309325c0e60674347c12665eef (diff)
1 files changed, 43 insertions, 6 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 27694f66a44..dc65e379b7f 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1537,6 +1537,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
                                            &num_components, ~0);
         }
 
+        /* The actual loads will happen directly in nir_intrinsic_load_input
+         * on newer versions.
+         */
+        if (c->devinfo->ver >= 40)
+                return;
+
         for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                   (loc + 1) * 4);
@@ -1868,12 +1874,43 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_input:
-                for (int i = 0; i < instr->num_components; i++) {
-                        offset = (nir_intrinsic_base(instr) +
-                                  nir_src_as_uint(instr->src[0]));
-                        int comp = nir_intrinsic_component(instr) + i;
-                        ntq_store_dest(c, &instr->dest, i,
-                                       vir_MOV(c, c->inputs[offset * 4 + comp]));
+                offset = (nir_intrinsic_base(instr) +
+                          nir_src_as_uint(instr->src[0]));
+                if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
+                    c->devinfo->ver >= 40) {
+                        /* Emit the LDVPM directly now, rather than at the top
+                         * of the shader like we did for V3D 3.x (which needs
+                         * vpmsetup when not just taking the next offset).
+                         *
+                         * Note that delaying like this may introduce stalls,
+                         * as LDVPMV takes a minimum of 1 instruction but may
+                         * be slower if the VPM unit is busy with another QPU.
+                         */
+                        int index = 0;
+                        if (c->s->info.system_values_read &
+                            (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+                                index++;
+                        }
+                        if (c->s->info.system_values_read &
+                            (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+                                index++;
+                        }
+                        for (int i = 0; i < offset; i++)
+                                index += c->vattr_sizes[i];
+                        index += nir_intrinsic_component(instr);
+                        for (int i = 0; i < instr->num_components; i++) {
+                                struct qreg vpm_offset =
+                                        vir_uniform_ui(c, index++);
+                                ntq_store_dest(c, &instr->dest, i,
+                                               vir_LDVPMV_IN(c, vpm_offset));
+                        }
+                } else {
+                        for (int i = 0; i < instr->num_components; i++) {
+                                int comp = nir_intrinsic_component(instr) + i;
+                                ntq_store_dest(c, &instr->dest, i,
+                                               vir_MOV(c, c->inputs[offset * 4 +
+                                                                    comp]));
+                        }
                 }
                 break;
author	Eric Anholt <[email protected]>	2019-02-14 21:11:20 -0800
committer	Eric Anholt <[email protected]>	2019-02-18 18:09:07 -0800
commit	1a775d43c9360257b267d097f60fc6ef751730f2 (patch)
tree	b78c5c58b0416bb1044c4eaa9ea45f071abf853e /src/broadcom
parent	5a84d46896dc16309325c0e60674347c12665eef (diff)