summaryrefslogtreecommitdiffstats
path: root/src/broadcom/compiler/qpu_schedule.c
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2018-12-14 14:46:48 -0800
committerEric Anholt <[email protected]>2018-12-14 17:48:01 -0800
commit29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch)
treed56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/qpu_schedule.c
parent248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff)
v3d: Drop in a bunch of notes about performance improvement opportunities.
These have all been floating in my head, and while I've thought about encoding them in issues on gitlab once they're enabled, they also make sense to just have in the area of the code you'll need to work in.
Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r--src/broadcom/compiler/qpu_schedule.c13
1 files changed, 13 insertions, 0 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
if (!magic) {
add_write_dep(state, &state->last_rf[waddr], n);
} else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+ /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+ * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+ */
add_write_dep(state, &state->last_tmu_write, n);
switch (waddr) {
case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
+ /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+ * instructions after the producer if possible, not just 1.
+ */
+
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
* sooner. If the ldvary's r5 wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
* r5 in the same tick.
+ *
+ * XXX perf: To get good pipelining of a sequence of varying
+ * loads, we need to figure out how to pair the ldvary signal
+ * up to the instruction before the last r5 user in the
+ * previous ldvary sequence. Currently, it usually pairs with
+ * the last r5 user.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {