diff options
Diffstat (limited to 'src/gallium')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_program.c | 11 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qir.h | 5 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_emit.c | 18 |
3 files changed, 34 insertions, 0 deletions
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ad06d8558fe..d2281ce6bd3 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2285,6 +2285,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, switch (stage) { case QSTAGE_FRAG: + /* FS threading requires that the thread execute + * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating + * (with no other THRSW afterwards, obviously). If we didn't + * fetch a texture at a top level block, this wouldn't be + * true. + */ + if (c->fs_threaded && !c->last_thrsw_at_top_level) { + c->failed = true; + return c; + } + emit_frag_end(c); break; case QSTAGE_VERT: diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 03ac1f50128..eebfdf047df 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -514,6 +514,9 @@ struct vc4_compile { struct list_head qpu_inst_list; + /* Pre-QPU-scheduled instruction containing the last THRSW */ + uint64_t *last_thrsw; + uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; @@ -540,6 +543,8 @@ struct vc4_compile { */ bool fs_threaded; + bool last_thrsw_at_top_level; + bool failed; }; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index cb936243c65..e2f04253855 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -504,6 +504,7 @@ vc4_generate_code_block(struct vc4_compile *c, queue(block, qpu_NOP()); *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_THREAD_SWITCH); + c->last_thrsw = last_inst(block); break; case QOP_BRANCH: @@ -591,6 +592,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) qir_for_each_block(block, c) vc4_generate_code_block(c, block, temp_registers); + /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW. + * + * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi) + * that ensures that a later thread doesn't try to lock the scoreboard + * and terminate before an earlier-spawned thread on the same QPU, by + * delaying switching back to the later shader until earlier has + * finished. Otherwise, if the earlier thread was hitting the same + * quad, the scoreboard would deadlock. + */ + if (c->last_thrsw) { + assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) == + QPU_SIG_THREAD_SWITCH); + *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) | + QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH, + QPU_SIG)); + } + uint32_t cycles = qpu_schedule_instructions(c); uint32_t inst_count_at_schedule_time = c->qpu_inst_count; |