aboutsummaryrefslogtreecommitdiffstats
path: root/src/freedreno/ir3
diff options
context:
space:
mode:
authorRob Clark <[email protected]>2019-12-18 11:57:41 -0800
committerMarge Bot <[email protected]>2020-02-01 02:40:22 +0000
commit093c94456bc99308bd80bcc952d1f77ea71a831c (patch)
tree26e93e50d31fc5fd4fe1240b929a7f165bae5f53 /src/freedreno/ir3
parentc803c662f990621acefd2f002d9df0d42ad8a3a0 (diff)
freedreno/ir3: move nop padding to legalize
This way we can deal with it in one place, *after* all the blocks have been scheduled, which will simplify life for a post-RA sched pass. This has the benefit of already taking into account nop's that legalize has to insert for non-delay related reasons. Signed-off-by: Rob Clark <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>
Diffstat (limited to 'src/freedreno/ir3')
-rw-r--r--src/freedreno/ir3/ir3.h1
-rw-r--r--src/freedreno/ir3/ir3_a6xx.c13
-rw-r--r--src/freedreno/ir3/ir3_delay.c21
-rw-r--r--src/freedreno/ir3/ir3_legalize.c72
-rw-r--r--src/freedreno/ir3/ir3_sched.c52
5 files changed, 74 insertions, 85 deletions
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 03abaafa393..ac294934133 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1120,6 +1120,7 @@ unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
unsigned maxd, bool pred);
unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
bool soft, bool pred);
+void ir3_remove_nops(struct ir3 *ir);
/* depth calculation: */
struct ir3_shader_variant;
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index b75489b6b6a..fd18fc3aa3c 100644
--- a/src/freedreno/ir3/ir3_a6xx.c
+++ b/src/freedreno/ir3/ir3_a6xx.c
@@ -365,19 +365,6 @@ get_atomic_dest_mov(struct ir3_instruction *atomic)
list_delinit(&mov->node);
list_add(&mov->node, &atomic->node);
- /* And because this is after instruction scheduling, we don't really
- * have a good way to know if extra delay slots are needed. For
- * example, if the result is consumed by an stib (storeImage()) there
- * would be no extra delay slots in place already, but 5 are needed.
- * Just plan for the worst and hope nobody looks at the resulting
- * code that is generated :-(
- */
- struct ir3_instruction *nop = ir3_NOP(atomic->block);
- nop->repeat = 5;
-
- list_delinit(&nop->node);
- list_add(&nop->node, &mov->node);
-
return atomic->data = mov;
}
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 506e2969326..207c8cb91cc 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -335,3 +335,24 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
return delay;
}
+
+/**
+ * Remove nop instructions. The scheduler can insert placeholder nop's
+ * so that ir3_delay_calc() can account for nop's that won't be needed
+ * due to nop's triggered by a previous instruction. However, before
+ * legalize, we want to remove these. The legalize pass can insert
+ * some nop's if needed to hold (for example) sync flags. The final
+ * remaining nops are inserted by legalize after this.
+ */
+void
+ir3_remove_nops(struct ir3 *ir)
+{
+ foreach_block (block, &ir->block_list) {
+ foreach_instr_safe (instr, &block->instr_list) {
+ if (instr->opc == OPC_NOP) {
+ list_del(&instr->node);
+ }
+ }
+ }
+
+}
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index db21507181c..4b95b905e20 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -211,26 +211,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
ir3_NOP(block);
- if (is_nop(n) && !list_is_empty(&block->instr_list)) {
- struct ir3_instruction *last = list_last_entry(&block->instr_list,
- struct ir3_instruction, node);
- if (is_nop(last) && (last->repeat < 5)) {
- last->repeat++;
- last->flags |= n->flags;
- continue;
- }
-
- /* NOTE: I think the nopN encoding works for a5xx and
- * probably a4xx, but not a3xx. So far only tested on
- * a6xx.
- */
- if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
- ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
- last->nop++;
- continue;
- }
- }
-
if (ctx->compiler->samgq_workaround &&
ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
struct ir3_instruction *samgp;
@@ -573,6 +553,54 @@ mark_xvergence_points(struct ir3 *ir)
}
}
+/* Insert nop's required to make this a legal/valid shader program: */
+static void
+nop_sched(struct ir3 *ir)
+{
+ foreach_block (block, &ir->block_list) {
+ struct ir3_instruction *last = NULL;
+ struct list_head instr_list;
+
+ /* remove all the instructions from the list, we'll be adding
+ * them back in as we go
+ */
+ list_replace(&block->instr_list, &instr_list);
+ list_inithead(&block->instr_list);
+
+ foreach_instr_safe (instr, &instr_list) {
+ unsigned delay = ir3_delay_calc(block, instr, false, true);
+
+ /* NOTE: I think the nopN encoding works for a5xx and
+ * probably a4xx, but not a3xx. So far only tested on
+ * a6xx.
+ */
+
+ if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+ ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
+ /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+ unsigned transfer = MIN2(delay, 3 - last->nop);
+ last->nop += transfer;
+ delay -= transfer;
+ }
+
+ if ((delay > 0) && last && (last->opc == OPC_NOP)) {
+ /* the previous nop can encode at most 5 repeats: */
+ unsigned transfer = MIN2(delay, 5 - last->repeat);
+ last->repeat += transfer;
+ delay -= transfer;
+ }
+
+ if (delay > 0) {
+ debug_assert(delay <= 6);
+ ir3_NOP(block)->repeat = delay - 1;
+ }
+
+ list_addtail(&instr->node, &block->instr_list);
+ last = instr;
+ }
+ }
+}
+
void
ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
{
@@ -589,6 +617,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
block->data = rzalloc(ctx, struct ir3_legalize_block_data);
}
+ ir3_remove_nops(ir);
+
/* process each block: */
do {
progress = false;
@@ -599,6 +629,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
*max_bary = ctx->max_bary;
+ nop_sched(ir);
+
do {
ir3_count_instructions(ir);
} while(resolve_jumps(ir));
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index ec5ad6e872e..13ec6e023ac 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -717,7 +717,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
if (instr) {
unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
-
d("delay=%u", delay);
/* and if we run out of instructions that can be scheduled,
@@ -770,18 +769,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
if (block->successors[1]) {
/* if/else, conditional branches to "then" or "else": */
struct ir3_instruction *br;
- unsigned delay = 6;
debug_assert(ctx->pred);
debug_assert(block->condition);
- delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
-
- while (delay > 0) {
- ir3_NOP(block);
- delay--;
- }
-
/* create "else" branch first (since "then" block should
* frequently/always end up being a fall-thru):
*/
@@ -814,45 +805,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
*/
}
-/* After scheduling individual blocks, we still could have cases where
- * one (or more) paths into a block, a value produced by a previous
- * has too few delay slots to be legal. We can't deal with this in the
- * first pass, because loops (ie. we can't ensure all predecessor blocks
- * are already scheduled in the first pass). All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
- unsigned n = 0;
-
- ctx->block = block;
-
- foreach_instr_safe (instr, &block->instr_list) {
- unsigned delay = 0;
-
- set_foreach(block->predecessors, entry) {
- struct ir3_block *pred = (struct ir3_block *)entry->key;
- unsigned d = ir3_delay_calc(pred, instr, false, true);
- delay = MAX2(d, delay);
- }
-
- while (delay > n) {
- struct ir3_instruction *nop = ir3_NOP(block);
-
- /* move to before instr: */
- list_delinit(&nop->node);
- list_addtail(&nop->node, &instr->node);
-
- n++;
- }
-
- /* we can bail once we hit worst case delay: */
- if (++n > 6)
- break;
- }
-}
-
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
@@ -865,10 +817,6 @@ int ir3_sched(struct ir3 *ir)
sched_block(&ctx, block);
}
- foreach_block (block, &ir->block_list) {
- sched_intra_block(&ctx, block);
- }
-
if (ctx.error)
return -1;