-rw-r--r--   src/panfrost/midgard/compiler.h          |  5
-rw-r--r--   src/panfrost/midgard/midgard_compile.c   | 76
-rw-r--r--   src/panfrost/midgard/midgard_liveness.c  |  8
-rw-r--r--   src/panfrost/midgard/midgard_ra.c        | 21
-rw-r--r--   src/panfrost/midgard/midgard_schedule.c  |  5
5 files changed, 84 insertions, 31 deletions
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 1ab0d4dd0d1..18ff2572b48 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -110,6 +110,7 @@ typedef struct midgard_instruction {
 
         bool compact_branch;
         bool writeout;
+        bool last_writeout;
 
         /* Kind of a hack, but hint against aggressive DCE */
         bool dont_eliminate;
@@ -218,6 +219,7 @@ typedef struct midgard_bundle {
         bool has_embedded_constants;
         float constants[4];
         bool has_blend_constant;
+        bool last_writeout;
 } midgard_bundle;
 
 typedef struct compiler_context {
@@ -303,6 +305,9 @@ typedef struct compiler_context {
 
         /* Model-specific quirk set */
         uint32_t quirks;
+
+        /* Writeout instructions for each render target */
+        midgard_instruction *writeout_branch[4];
 } compiler_context;
 
 /* Per-block live_in/live_out */
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 306d63374b1..24765f3da2a 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -1331,11 +1331,6 @@ compute_builtin_arg(nir_op op)
         }
 }
 
-/* Emit store for a fragment shader, which is encoded via a fancy branch. TODO:
- * Handle MRT here */
-static void
-emit_fragment_epilogue(compiler_context *ctx, unsigned rt);
-
 static void
 emit_fragment_store(compiler_context *ctx, unsigned src, unsigned rt)
 {
@@ -1353,9 +1348,15 @@ emit_fragment_store(compiler_context *ctx, unsigned src, unsigned rt)
         /* Emit the branch */
         midgard_instruction *br = emit_mir_instruction(ctx, ins);
         schedule_barrier(ctx);
 
-        br->branch.target_block = ctx->block_count - 1;
-        emit_fragment_epilogue(ctx, rt);
+        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
+        assert(!ctx->writeout_branch[rt]);
+        ctx->writeout_branch[rt] = br;
+
+        /* Push our current location = current block count - 1 = where we'll
+         * jump to. Maybe a bit too clever for my own good */
+
+        br->branch.target_block = ctx->block_count - 1;
 }
 
 static void
@@ -2284,28 +2285,20 @@ midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
-static void
+static unsigned
 emit_fragment_epilogue(compiler_context *ctx, unsigned rt)
 {
-        /* Include a move to specify the render target */
-
-        if (rt > 0) {
-                midgard_instruction rt_move = v_mov(SSA_FIXED_REGISTER(1),
-                                SSA_FIXED_REGISTER(1));
-                rt_move.mask = 1 << COMPONENT_Z;
-                rt_move.unit = UNIT_SADD;
-                emit_mir_instruction(ctx, rt_move);
-        }
-
         /* Loop to ourselves */
 
         struct midgard_instruction ins = v_branch(false, false);
         ins.writeout = true;
         ins.branch.target_block = ctx->block_count - 1;
+        ins.constants[0] = rt * 0x100;
         emit_mir_instruction(ctx, ins);
 
         ctx->current_block->epilogue = true;
         schedule_barrier(ctx);
+        return ins.branch.target_block;
 }
 
 static midgard_block *
@@ -2557,6 +2550,36 @@ pan_format_from_glsl(const struct glsl_type *type)
                 MALI_NR_CHANNELS(4);
 }
 
+/* For each fragment writeout instruction, generate a writeout loop to
+ * associate with it */
+
+static void
+mir_add_writeout_loops(compiler_context *ctx)
+{
+        for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
+                midgard_instruction *br = ctx->writeout_branch[rt];
+                if (!br) continue;
+
+                unsigned popped = br->branch.target_block;
+                midgard_block_add_successor(mir_get_block(ctx, popped - 1), ctx->current_block);
+                br->branch.target_block = emit_fragment_epilogue(ctx, rt);
+
+                /* If we have more RTs, we'll need to restore back after our
+                 * loop terminates */
+
+                if ((rt + 1) < ARRAY_SIZE(ctx->writeout_branch) && ctx->writeout_branch[rt + 1]) {
+                        midgard_instruction uncond = v_branch(false, false);
+                        uncond.branch.target_block = popped;
+                        emit_mir_instruction(ctx, uncond);
+                        midgard_block_add_successor(ctx->current_block, mir_get_block(ctx, popped));
+                        schedule_barrier(ctx);
+                } else {
+                        /* We're last, so we can terminate here */
+                        br->last_writeout = true;
+                }
+        }
+}
+
 int
 midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb)
 {
@@ -2700,6 +2723,9 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                 assert(!ins->invert);
         }
 
+        if (ctx->stage == MESA_SHADER_FRAGMENT)
+                mir_add_writeout_loops(ctx);
+
         /* Schedule! */
         schedule_program(ctx);
         mir_ra(ctx);
@@ -2836,22 +2862,14 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
 
         /* Midgard prefetches instruction types, so during emission we
          * need to lookahead. Unless this is the last instruction, in
-         * which we return 1. Or if this is the second to last and the
-         * last is an ALU, then it's also 1... */
+         * which we return 1. */
 
         mir_foreach_block(ctx, block) {
                 mir_foreach_bundle_in_block(block, bundle) {
                         int lookahead = 1;
 
-                        if (current_bundle + 1 < bundle_count) {
-                                uint8_t next = source_order_bundles[current_bundle + 1]->tag;
-
-                                if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
-                                        lookahead = 1;
-                                } else {
-                                        lookahead = next;
-                                }
-                        }
+                        if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
+                                lookahead = source_order_bundles[current_bundle + 1]->tag;
 
                         emit_binary_bundle(ctx, bundle, compiled, lookahead);
                         ++current_bundle;
diff --git a/src/panfrost/midgard/midgard_liveness.c b/src/panfrost/midgard/midgard_liveness.c
index 8627e01fa74..b1b2f311ffa 100644
--- a/src/panfrost/midgard/midgard_liveness.c
+++ b/src/panfrost/midgard/midgard_liveness.c
@@ -153,14 +153,20 @@ mir_compute_liveness(compiler_context *ctx)
 
                 /* If we made progress, we need to process the predecessors */
 
-                if (progress || (blk == exit) || blk->epilogue) {
+                if (progress || !blk->visited) {
                         mir_foreach_predecessor(blk, pred)
                                 _mesa_set_add(work_list, pred);
                 }
+
+                blk->visited = true;
         } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
 
         /* Liveness is now valid */
         ctx->metadata |= MIDGARD_METADATA_LIVENESS;
+
+        mir_foreach_block(ctx, block) {
+                block->visited = false;
+        }
 }
 
 /* Once liveness data is no longer valid, call this */
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index 65e4d246282..92be82fe7b8 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -380,6 +380,27 @@ mir_compute_interference(
         /* First, we need liveness information to be computed per block */
         mir_compute_liveness(ctx);
 
+        /* We need to force r1.w live throughout a blend shader */
+
+        if (ctx->is_blend) {
+                unsigned r1w = ~0;
+
+                mir_foreach_block(ctx, block) {
+                        mir_foreach_instr_in_block_rev(block, ins) {
+                                if (ins->writeout)
+                                        r1w = ins->src[2];
+                        }
+
+                        if (r1w != ~0)
+                                break;
+                }
+
+                mir_foreach_instr_global(ctx, ins) {
+                        if (ins->dest < ctx->temp_count)
+                                lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF);
+                }
+        }
+
         /* Now that every block has live_in/live_out computed, we can determine
          * interference by walking each block linearly. Take live_out at the
          * end of each block and walk the block backwards. */
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index e0425fd0578..46e1f7a4a35 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -890,6 +890,9 @@ mir_schedule_alu(
         mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT);
 
         if (writeout) {
+                /* Propagate up */
+                bundle.last_writeout = branch->last_writeout;
+
                 midgard_instruction add = v_mov(~0, make_compiler_temp(ctx));
 
                 if (!ctx->is_blend) {
@@ -938,7 +941,7 @@ mir_schedule_alu(
 
         /* If we have a render target reference, schedule a move for it */
 
-        if (branch && branch->writeout && branch->constants[0]) {
+        if (branch && branch->writeout && (branch->constants[0] || ctx->is_blend)) {
                 midgard_instruction mov = v_mov(~0, make_compiler_temp(ctx));
                 sadd = mem_dup(&mov, sizeof(midgard_instruction));
                 sadd->unit = UNIT_SADD;
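
Note (illustrative, not part of the commit): the control flow that mir_add_writeout_loops() builds can be hard to picture from the hunks alone. The standalone C sketch below models the same decisions with plain integers standing in for MIR blocks; the toy_branch struct, the block numbers, and the printf tracing are invented for illustration, and only the retarget / branch-back / last_writeout choices follow the patch.

#include <stdio.h>

#define MAX_RT 4

struct toy_branch {
        int target_block;   /* block the writeout branch jumps to, -1 = no writeout */
        int last_writeout;  /* set when this epilogue may terminate the shader */
};

int
main(void)
{
        /* Pretend the fragment shader stored RT0 and RT1 */
        struct toy_branch br[MAX_RT] = {
                { .target_block = 5 },  { .target_block = 9 },
                { .target_block = -1 }, { .target_block = -1 },
        };
        int next_block = 12;    /* fresh epilogue blocks are appended here */

        for (int rt = 0; rt < MAX_RT; ++rt) {
                if (br[rt].target_block < 0)
                        continue;

                /* Remember where execution would have resumed, then point the
                 * writeout branch at a new self-looping epilogue block */
                int popped = br[rt].target_block;
                int epilogue = next_block++;
                br[rt].target_block = epilogue;
                printf("RT%d: writeout branch retargeted to epilogue block %d\n",
                       rt, epilogue);

                if (rt + 1 < MAX_RT && br[rt + 1].target_block >= 0) {
                        /* Another RT follows: branch back so its writeout runs too */
                        printf("RT%d: epilogue %d branches back to block %d\n",
                               rt, epilogue, popped);
                } else {
                        /* Final writeout: the shader may terminate here */
                        br[rt].last_writeout = 1;
                        printf("RT%d: epilogue %d is the last writeout\n",
                               rt, epilogue);
                }
        }

        return 0;
}

The design point mirrored here is that every render target that gets stored ends in its own self-looping writeout epilogue; each epilogue except the last branches back into the main program so the next render target's writeout still runs, and only the final epilogue is flagged last_writeout, which the emission loop above uses to stop prefetching past the end of the shader.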