author     Eric Anholt <[email protected]>    2015-12-18 11:30:30 -0800
committer  Eric Anholt <[email protected]>    2015-12-18 17:09:03 -0800
commit     5278c64de58b545dfe3272b005b331fd5b71da68 (patch)
tree       fa122bd2d25e5e0fda276487923df5734d9638e7 /src
parent     960f48809ffebca14af27ce9e87eabc04dfe9b84 (diff)
vc4: Fix latency handling for QPU texture scheduling.
There's only high latency between a complete texture fetch setup and collecting its result, not between each step of setting up the texture fetch request.
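To make the rule concrete, here is a minimal standalone sketch (not the driver code: the enum values, struct layout and function names below are invented for illustration; the real scheduler works on 64-bit QPU instruction encodings and the QPU_W_*/QPU_SIG_* definitions from vc4_qpu_defines.h). Only the write to the TMU "s" register, which completes the fetch request, is far away in time from the instruction that collects the result via the load-TMU signal; the earlier coordinate setup writes carry no special latency.

#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins for the relevant QPU write addresses and signals. */
enum waddr { W_ACC = 10, W_TMU0_T = 40, W_TMU0_S = 41 };
enum sig   { SIG_NONE = 0, SIG_LOAD_TMU0 = 1 };

struct inst {
        enum waddr waddr;       /* register this instruction writes */
        enum sig sig;           /* signal this instruction carries */
};

/* Latency of a dependency edge from a write to "waddr" to the instruction
 * "after": only a request-completing TMU0_S write whose consumer carries the
 * load-TMU signal gets the huge texture latency.
 */
static uint32_t
edge_latency(enum waddr waddr, const struct inst *after)
{
        if (waddr == W_TMU0_S && after->sig == SIG_LOAD_TMU0)
                return 100;
        return 1;
}

int
main(void)
{
        struct inst write_t = { .waddr = W_TMU0_T, .sig = SIG_NONE };
        struct inst write_s = { .waddr = W_TMU0_S, .sig = SIG_NONE };
        struct inst load_r4 = { .waddr = W_ACC,    .sig = SIG_LOAD_TMU0 };

        /* No big latency between the individual texture setup writes... */
        printf("t -> s: %u\n", (unsigned)edge_latency(write_t.waddr, &write_s));
        /* ...only between the completed request and collecting its result. */
        printf("s -> ldtmu0: %u\n", (unsigned)edge_latency(write_s.waddr, &load_r4));
        return 0;
}

This prints 1 for the edge between the two setup writes and 100 for the edge from the completed request to the result load, which is the distinction the new waddr_latency() below makes by also looking at the consuming instruction.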
Diffstat (limited to 'src')
-rw-r--r--    src/gallium/drivers/vc4/vc4_qpu_schedule.c    82
1 file changed, 50 insertions(+), 32 deletions(-)
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 76cad2e03fe..09164b74932 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,7 +50,7 @@ struct schedule_node {
         uint32_t child_array_size;
         uint32_t parent_count;

-        /* Longest cycles + n->latency of any parent of this node. */
+        /* Longest cycles + instruction_latency() of any parent of this node. */
         uint32_t unblocked_time;

         /**
@@ -624,6 +624,46 @@ dump_state(struct list_head *schedule_list)
         }
 }

+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Apply some huge latency between texture fetch requests and getting
+         * their results back.
+         */
+        if (waddr == QPU_W_TMU0_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+                        return 100;
+        }
+        if (waddr == QPU_W_TMU1_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+                        return 100;
+        }
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        uint64_t before_inst = before->inst->inst;
+        uint64_t after_inst = after->inst->inst;
+
+        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+                                  after_inst),
+                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+                                  after_inst));
+}
+
 /** Recursive computation of the delay member of a node. */
 static void
 compute_delay(struct schedule_node *n)
@@ -635,7 +675,8 @@ compute_delay(struct schedule_node *n)
                         if (!n->children[i].node->delay)
                                 compute_delay(n->children[i].node);
                         n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + n->latency);
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
                 }
         }
 }
@@ -664,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                  * immediately after (or paired with!) the thing reading the
                  * destination.
                  */
-                int latency_from_previous = war_only ? 0 : node->latency;
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
                 child->unblocked_time = MAX2(child->unblocked_time,
-                                             time + latency_from_previous);
+                                             time + latency);
                 child->parent_count--;
                 if (child->parent_count == 0)
                         list_add(&child->link, schedule_list);
@@ -799,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
         return time;
 }

-static uint32_t waddr_latency(uint32_t waddr)
-{
-        if (waddr < 32)
-                return 2;
-
-        /* Some huge number, really. */
-        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 100;
-
-        switch(waddr) {
-        case QPU_W_SFU_RECIP:
-        case QPU_W_SFU_RECIPSQRT:
-        case QPU_W_SFU_EXP:
-        case QPU_W_SFU_LOG:
-                return 3;
-        default:
-                return 1;
-        }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
-        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
-                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
@@ -852,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
                 n->inst = inst;
-                n->latency = instruction_latency(inst->inst);

                 if (reads_uniform(inst->inst)) {
                         n->uniform = next_uniform++;