v3d: Drop in a bunch of notes about performance improvement opportunities.

These ideas have all been floating around in my head. I had thought about filing them as issues on GitLab once issues are enabled there, but it also makes sense to keep them right next to the code you would need to work on.
author: Eric Anholt <[email protected]> 2018-12-14 14:46:48 -0800
committer: Eric Anholt <[email protected]> 2018-12-14 17:48:01 -0800
commit: 29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch)
tree: d56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/qpu_schedule.c
parent: 248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff)
1 files changed, 13 insertions, 0 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
         if (!magic) {
                 add_write_dep(state, &state->last_rf[waddr], n);
         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+                 */
                 add_write_dep(state, &state->last_tmu_write, n);
                 switch (waddr) {
                 case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
                 return next_score;
         next_score++;
 
+        /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+         * instructions after the producer if possible, not just 1.
+         */
+
         /* Default score for things that aren't otherwise special. */
         baseline_score = next_score;
         next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
+                 *
+                 * XXX perf: To get good pipelining of a sequence of varying
+                 * loads, we need to figure out how to pair the ldvary signal
+                 * up to the instruction before the last r5 user in the
+                 * previous ldvary sequence.  Currently, it usually pairs with
+                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
author	Eric Anholt <[email protected]>	2018-12-14 14:46:48 -0800
committer	Eric Anholt <[email protected]>	2018-12-14 17:48:01 -0800
commit	29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch)
tree	d56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/qpu_schedule.c
parent	248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff)