summaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/drivers/vc4/vc4_program.c11
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.h5
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_emit.c18
3 files changed, 34 insertions, 0 deletions
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ad06d8558fe..d2281ce6bd3 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2285,6 +2285,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
switch (stage) {
case QSTAGE_FRAG:
+ /* FS threading requires that the thread execute
+ * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
+ * (with no other THRSW afterwards, obviously). If we didn't
+ * fetch a texture at a top level block, this wouldn't be
+ * true.
+ */
+ if (c->fs_threaded && !c->last_thrsw_at_top_level) {
+ c->failed = true;
+ return c;
+ }
+
emit_frag_end(c);
break;
case QSTAGE_VERT:
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 03ac1f50128..eebfdf047df 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -514,6 +514,9 @@ struct vc4_compile {
struct list_head qpu_inst_list;
+ /* Pre-QPU-scheduled instruction containing the last THRSW */
+ uint64_t *last_thrsw;
+
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
@@ -540,6 +543,8 @@ struct vc4_compile {
*/
bool fs_threaded;
+ bool last_thrsw_at_top_level;
+
bool failed;
};
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index cb936243c65..e2f04253855 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -504,6 +504,7 @@ vc4_generate_code_block(struct vc4_compile *c,
queue(block, qpu_NOP());
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_THREAD_SWITCH);
+ c->last_thrsw = last_inst(block);
break;
case QOP_BRANCH:
@@ -591,6 +592,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
qir_for_each_block(block, c)
vc4_generate_code_block(c, block, temp_registers);
+ /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+ *
+ * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+ * that ensures that a later thread doesn't try to lock the scoreboard
+ * and terminate before an earlier-spawned thread on the same QPU, by
+ * delaying switching back to the later shader until earlier has
+ * finished. Otherwise, if the earlier thread was hitting the same
+ * quad, the scoreboard would deadlock.
+ */
+ if (c->last_thrsw) {
+ assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+ QPU_SIG_THREAD_SWITCH);
+ *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+ QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+ QPU_SIG));
+ }
+
uint32_t cycles = qpu_schedule_instructions(c);
uint32_t inst_count_at_schedule_time = c->qpu_inst_count;