diff options
author | Eric Anholt <[email protected]> | 2018-12-14 14:46:48 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2018-12-14 17:48:01 -0800 |
commit | 29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch) | |
tree | d56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/qpu_schedule.c | |
parent | 248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff) |
v3d: Drop in a bunch of notes about performance improvement opportunities.

These have all been floating in my head, and while I've thought about
encoding them in issues on GitLab once they're enabled, they also make
sense to just have in the area of the code you'll need to work in.
Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c | 13 |
1 file changed, 13 insertions, 0 deletions
```diff
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
         if (!magic) {
                 add_write_dep(state, &state->last_rf[waddr], n);
         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+                 */
                 add_write_dep(state, &state->last_tmu_write, n);
                 switch (waddr) {
                 case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
                 return next_score;
         next_score++;
 
+        /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+         * instructions after the producer if possible, not just 1.
+         */
+
         /* Default score for things that aren't otherwise special. */
         baseline_score = next_score;
         next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                          * sooner. If the ldvary's r5 wasn't used, then ldunif might
                          * otherwise get scheduled so ldunif and ldvary try to update
                          * r5 in the same tick.
+                         *
+                         * XXX perf: To get good pipelining of a sequence of varying
+                         * loads, we need to figure out how to pair the ldvary signal
+                         * up to the instruction before the last r5 user in the
+                         * previous ldvary sequence. Currently, it usually pairs with
+                         * the last r5 user.
                          */
                         if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                             scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
```