summaryrefslogtreecommitdiffstats
path: root/src/broadcom/compiler/nir_to_vir.c
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2018-12-14 14:46:48 -0800
committer: Eric Anholt <[email protected]> 2018-12-14 17:48:01 -0800
commit29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch)
treed56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/nir_to_vir.c
parent248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff)
v3d: Drop in a bunch of notes about performance improvement opportunities.
These have all been floating in my head, and while I've thought about encoding them in issues on gitlab once they're enabled, they also make sense to just have in the area of the code you'll need to work in.
Diffstat (limited to 'src/broadcom/compiler/nir_to_vir.c')
-rw-r--r--src/broadcom/compiler/nir_to_vir.c35
1 files changed, 34 insertions, 1 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 446ac53e95f..484dc050368 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -850,6 +850,9 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
case nir_op_unpack_half_2x16_split_x:
+ /* XXX perf: It would be good to be able to merge this unpack
+ * with whatever uses our result.
+ */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
@@ -1489,6 +1492,10 @@ ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
static void
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
{
+ /* XXX perf: Experiment with using immediate loads to avoid having
+ * these end up in the uniform stream. Watch out for breaking the
+ * small immediates optimization in the process!
+ */
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
@@ -1535,6 +1542,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
for (int i = 0; i < instr->num_components; i++) {
int ubo = nir_src_as_uint(instr->src[0]);
+ /* XXX perf: On V3D 4.x with uniform offsets, we
+ * should probably try setting UBOs up in the A
+ * register file and doing a sequence of loads that
+ * way.
+ */
/* Adjust for where we stored the TGSI register base. */
vir_ADD_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
@@ -1669,6 +1681,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
/* Clears (activates) the execute flags for any channels whose jump target
* matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
*/
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
@@ -1704,6 +1722,10 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
/* Set A for executing (execute == 0) and jumping (if->condition ==
* 0) channels, and then update execute flags for those to point to
* the ELSE block.
+ *
+ * XXX perf: we could reuse ntq_emit_comparison() to generate our if
+ * condition, and the .uf field to ignore non-executing channels, to
+ * reduce the overhead of if statements.
*/
vir_PF(c, vir_OR(c,
c->execute,
@@ -1925,6 +1947,10 @@ nir_to_vir(struct v3d_compile *c)
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ /* XXX perf: We could set the "disable implicit point/line
+ * varyings" field in the shader record and not emit these, if
+ * they're not going to be used.
+ */
if (c->fs_key->is_points) {
c->point_x = emit_fragment_varying(c, NULL, 0);
c->point_y = emit_fragment_varying(c, NULL, 0);
@@ -2119,7 +2145,14 @@ v3d_nir_to_vir(struct v3d_compile *c)
vir_check_payload_w(c);
- /* XXX: vir_schedule_instructions(c); */
+ /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+ * We used that on that platform to pipeline TMU writes and reduce the
+ * number of thread switches, as well as try (mostly successfully) to
+ * reduce maximum register pressure to allow more threads. We should
+ * do something of that sort for V3D -- either instruction scheduling
* here, or delay the THRSW and LDTMUs from our texture
+ * instructions until the results are needed.
+ */
if (V3D_DEBUG & (V3D_DEBUG_VIR |
v3d_debug_flag_for_shader_stage(c->s->info.stage))) {