summary refs log tree commit diff stats
path: root/src/broadcom/compiler/qpu_schedule.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r-- src/broadcom/compiler/qpu_schedule.c 13
1 files changed, 13 insertions, 0 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
if (!magic) {
add_write_dep(state, &state->last_rf[waddr], n);
} else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+ /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+ * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+ */
add_write_dep(state, &state->last_tmu_write, n);
switch (waddr) {
case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
+ /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+ * instructions after the producer if possible, not just 1.
+ */
+
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
* sooner. If the ldvary's r5 wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
* r5 in the same tick.
+ *
+ * XXX perf: To get good pipelining of a sequence of varying
+ * loads, we need to figure out how to pair the ldvary signal
+ * with the instruction before the last r5 user in the
+ * previous ldvary sequence. Currently, it usually pairs with
+ * the last r5 user.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {