author    Alyssa Rosenzweig <[email protected]>    2019-05-07 02:52:08 +0000
committer Alyssa Rosenzweig <[email protected]>    2019-05-12 22:21:49 +0000
commit    726f0263e14d219153088b018624710d20f3a124 (patch)
tree      6984d60f8ca12ca81754d4f872361005adfb0d72 /src
parent    a35269cf446bfad2261dc1e7945cd779fb42208d (diff)
panfrost/midgard: Handle csel correctly

We use an algebraic pass for the csel optimizations, and use proper
vectorized csel ops (i/fcsel_v) for mixed conditions, rather than
lowering. To avoid regressions along the way, we fix an issue with the
copy propagation pass (it should not attempt to propagate constants).
Similarly, we take care to break bundles when using csel to fix some
scheduler corner cases.

Signed-off-by: Alyssa Rosenzweig <[email protected]>
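
For context, the identity behind the new csel algebraic rules is easy to
check in plain C. This is a minimal standalone sketch assuming the 32-bit
0/~0 boolean representation the backend uses; it is not driver code:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t v = 0xCAFEBABEu;

        for (int b = 0; b <= 1; ++b) {
            uint32_t cond = b ? ~0u : 0u; /* boolean as 32-bit 0/~0 */

            /* (b ? v : 0) == (b & v) */
            assert((cond ? v : 0u) == (cond & v));

            /* (b ? 0 : v) == (~b & v) */
            assert((cond ? 0u : v) == (~cond & v));
        }

        return 0;
    }
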
Diffstat (limited to 'src')
 src/gallium/drivers/panfrost/ci/expected-failures.txt        |   3
 src/gallium/drivers/panfrost/midgard/helpers.h                |   3
 src/gallium/drivers/panfrost/midgard/midgard.h                |   5
 src/gallium/drivers/panfrost/midgard/midgard_compile.c        | 261
 src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py |   8
 5 files changed, 128 insertions(+), 152 deletions(-)
diff --git a/src/gallium/drivers/panfrost/ci/expected-failures.txt b/src/gallium/drivers/panfrost/ci/expected-failures.txt
index f72aa07deb9..0fe3342802d 100644
--- a/src/gallium/drivers/panfrost/ci/expected-failures.txt
+++ b/src/gallium/drivers/panfrost/ci/expected-failures.txt
@@ -1872,9 +1872,6 @@ dEQP-GLES2.functional.shaders.random.all_features.fragment.74
dEQP-GLES2.functional.shaders.random.all_features.fragment.77
dEQP-GLES2.functional.shaders.random.all_features.fragment.95
dEQP-GLES2.functional.shaders.random.all_features.vertex.17
-dEQP-GLES2.functional.shaders.random.conditionals.combined.88
-dEQP-GLES2.functional.shaders.random.conditionals.combined.92
-dEQP-GLES2.functional.shaders.random.conditionals.fragment.24
dEQP-GLES2.functional.shaders.random.exponential.fragment.46
dEQP-GLES2.functional.shaders.random.exponential.vertex.46
dEQP-GLES2.functional.shaders.random.texture.fragment.1
diff --git a/src/gallium/drivers/panfrost/midgard/helpers.h b/src/gallium/drivers/panfrost/midgard/helpers.h
index be2a45cd19d..dc2de15931b 100644
--- a/src/gallium/drivers/panfrost/midgard/helpers.h
+++ b/src/gallium/drivers/panfrost/midgard/helpers.h
@@ -196,7 +196,8 @@ static struct {
[midgard_alu_op_ule] = {"ule", UNITS_MOST},
[midgard_alu_op_icsel] = {"icsel", UNITS_ADD},
- [midgard_alu_op_fcsel_i] = {"fcsel_i", UNITS_ADD},
+ [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD},
+ [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD},
[midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL},
[midgard_alu_op_frcp] = {"frcp", UNIT_VLUT},
diff --git a/src/gallium/drivers/panfrost/midgard/midgard.h b/src/gallium/drivers/panfrost/midgard/midgard.h
index c2808937b43..91d1c075f96 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard.h
+++ b/src/gallium/drivers/panfrost/midgard/midgard.h
@@ -138,8 +138,9 @@ typedef enum {
midgard_alu_op_i2f = 0xB8,
midgard_alu_op_u2f = 0xBC,
- midgard_alu_op_icsel = 0xC1,
- midgard_alu_op_fcsel_i = 0xC4,
+ midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */
+ midgard_alu_op_icsel = 0xC1, /* condition code r31.w */
+ midgard_alu_op_fcsel_v = 0xC4,
midgard_alu_op_fcsel = 0xC5,
midgard_alu_op_fround = 0xC6,
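
Reading the condition-code notes above, the two opcode variants differ only
in where the condition comes from. The following C sketch illustrates the
assumed per-lane semantics; it is an illustration, not an authoritative ISA
description:

    #include <stdint.h>

    /* Assumed semantics only: icsel broadcasts the single condition in
     * r31.w, while icsel_v reads a per-lane condition from all of r31. */
    static void icsel(uint32_t out[4], const uint32_t r31[4],
                      const uint32_t s0[4], const uint32_t s1[4])
    {
        for (int c = 0; c < 4; ++c)
            out[c] = r31[3] ? s0[c] : s1[c]; /* condition code r31.w */
    }

    static void icsel_v(uint32_t out[4], const uint32_t r31[4],
                        const uint32_t s0[4], const uint32_t s1[4])
    {
        for (int c = 0; c < 4; ++c)
            out[c] = r31[c] ? s0[c] : s1[c]; /* condition code r31, per lane */
    }
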
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index 25316cab053..4a26ba769b2 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -138,6 +138,12 @@ typedef struct midgard_instruction {
/* I.e. (1 << alu_bit) */
int unit;
+ /* When emitting a bundle, should this instruction have a break forced
+ * before it? Used for r31 writes which are valid only within a single
+ * bundle and *need* to happen as early as possible... this is a hack,
+ * TODO remove when we have a scheduler */
+ bool precede_break;
+
bool has_constants;
float constants[4];
uint16_t inline_constant;
@@ -287,21 +293,6 @@ vector_alu_modifiers(nir_alu_src *src, bool is_int)
return alu_src;
}
-static bool
-mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
-{
- /* abs or neg */
- if (!is_int && src.mod) return true;
-
- /* swizzle */
- for (unsigned c = 0; c < 4; ++c) {
- if (!(mask & (1 << c))) continue;
- if (((src.swizzle >> (2*c)) & 3) != c) return true;
- }
-
- return false;
-}
-
/* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */
static midgard_instruction
@@ -715,58 +706,6 @@ midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
}
-/* Lower csel with mixed condition channels to multiple csel instructions. For
- * context, the csel ops on Midgard are vector in *outputs*, but not in
- * *conditions*. So, if the condition is e.g. yyyy, a single op can select a
- * vec4. But if the condition is e.g. xyzw, four ops are needed as the ISA
- * can't cope with the divergent channels. */
-
-static void
-midgard_nir_lower_mixed_csel_body(nir_builder *b, nir_alu_instr *alu)
-{
- if (alu->op != nir_op_bcsel)
- return;
-
- b->cursor = nir_before_instr(&alu->instr);
-
- /* Must be run before registering */
- assert(alu->dest.dest.is_ssa);
-
- /* Check for mixed condition */
-
- unsigned comp = alu->src[0].swizzle[0];
- unsigned nr_components = alu->dest.dest.ssa.num_components;
-
- bool mixed = false;
-
- for (unsigned c = 1; c < nr_components; ++c)
- mixed |= (alu->src[0].swizzle[c] != comp);
-
- if (!mixed)
- return;
-
- /* We're mixed, so lower */
-
- assert(nr_components <= 4);
- nir_ssa_def *results[4];
-
- nir_ssa_def *cond = nir_ssa_for_alu_src(b, alu, 0);
- nir_ssa_def *choice0 = nir_ssa_for_alu_src(b, alu, 1);
- nir_ssa_def *choice1 = nir_ssa_for_alu_src(b, alu, 2);
-
- for (unsigned c = 0; c < nr_components; ++c) {
- results[c] = nir_bcsel(b,
- nir_channel(b, cond, c),
- nir_channel(b, choice0, c),
- nir_channel(b, choice1, c));
- }
-
- /* Replace with our scalarized version */
-
- nir_ssa_def *result = nir_vec(b, results, nr_components);
- nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(result));
-}
-
static int
midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
{
@@ -851,36 +790,6 @@ midgard_nir_lower_fdot2(nir_shader *shader)
return progress;
}
-static bool
-midgard_nir_lower_mixed_csel(nir_shader *shader)
-{
- bool progress = false;
-
- nir_foreach_function(function, shader) {
- if (!function->impl) continue;
-
- nir_builder _b;
- nir_builder *b = &_b;
- nir_builder_init(b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_alu) continue;
-
- nir_alu_instr *alu = nir_instr_as_alu(instr);
- midgard_nir_lower_mixed_csel_body(b, alu);
-
- progress |= true;
- }
- }
-
- nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
-
- }
-
- return progress;
-}
-
static void
optimise_nir(nir_shader *nir)
{
@@ -892,7 +801,6 @@ optimise_nir(nir_shader *nir)
NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
- NIR_PASS(progress, nir, midgard_nir_lower_mixed_csel);
nir_lower_tex_options lower_tex_options = {
.lower_rect = true
@@ -957,6 +865,11 @@ optimise_nir(nir_shader *nir)
} while (progress);
NIR_PASS(progress, nir, nir_opt_algebraic_late);
+
+ /* We implement booleans as 32-bit 0/~0 */
+ NIR_PASS(progress, nir, nir_lower_bool_to_int32);
+
+ /* Now that booleans are lowered, we can run our late opts */
NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
/* Lower mods for float ops only. Integer ops don't support modifiers
@@ -967,9 +880,6 @@ optimise_nir(nir_shader *nir)
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
- /* We implement booleans as 32-bit 0/~0 */
- NIR_PASS(progress, nir, nir_lower_bool_to_int32);
-
/* Take us out of SSA */
NIR_PASS(progress, nir, nir_lower_locals_to_regs);
NIR_PASS(progress, nir, nir_convert_from_ssa, true);
@@ -1119,8 +1029,21 @@ nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
return nir_src_index(ctx, &src->src);
}
-/* Midgard puts conditionals in r31.w; move an arbitrary source (the output of
- * a conditional test) into that register */
+static bool
+nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
+{
+ unsigned comp = src->swizzle[0];
+
+ for (unsigned c = 1; c < nr_components; ++c) {
+ if (src->swizzle[c] != comp)
+ return true;
+ }
+
+ return false;
+}
+
+/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the
+ * output of a conditional test) into that register */
static void
emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component)
@@ -1138,8 +1061,13 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
midgard_instruction ins = {
.type = TAG_ALU_4,
- .unit = for_branch ? UNIT_SMUL : UNIT_SADD, /* TODO: DEDUCE THIS */
+
+ /* We need to set the conditional as close as possible */
+ .precede_break = true,
+ .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
+
.ssa_args = {
+
.src0 = condition,
.src1 = condition,
.dest = SSA_FIXED_REGISTER(31),
@@ -1157,6 +1085,46 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
emit_mir_instruction(ctx, ins);
}
+/* Or, for mixed conditions (with csel_v), here's a vector version using all of
+ * r31 instead */
+
+static void
+emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
+{
+ int condition = nir_src_index(ctx, &src->src);
+
+ /* Source to swizzle the desired component into w */
+
+ const midgard_vector_alu_src alu_src = {
+ .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle),
+ };
+
+ /* There is no boolean move instruction. Instead, we simulate a move by
+ * ANDing the condition with itself to get it into r31.w */
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .precede_break = true,
+ .ssa_args = {
+ .src0 = condition,
+ .src1 = condition,
+ .dest = SSA_FIXED_REGISTER(31),
+ },
+ .alu = {
+ .op = midgard_alu_op_iand,
+ .reg_mode = midgard_reg_mode_32,
+ .dest_override = midgard_dest_override_none,
+ .mask = expand_writemask((1 << nr_comp) - 1),
+ .src1 = vector_alu_srco_unsigned(alu_src),
+ .src2 = vector_alu_srco_unsigned(alu_src)
+ },
+ };
+
+ emit_mir_instruction(ctx, ins);
+}
+
+
+
/* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
* pinning to eliminate this move in all known cases */
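
The iand in emit_condition_mixed above acts as a move because x & x == x for
any bit pattern, so the 0/~0 condition vector lands in r31 unchanged. A
standalone check of that identity and of the (1 << nr_comp) - 1 writemask
arithmetic (plain C, nothing driver-specific assumed):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* x & x == x for any bits, so ANDing the condition with itself
         * lands it in r31 unmodified; no boolean move is needed. */
        uint32_t cond[4] = { 0u, ~0u, ~0u, 0u };
        for (int c = 0; c < 4; ++c)
            assert((cond[c] & cond[c]) == cond[c]);

        /* Writemask arithmetic from the emit above:
         * nr_comp == 3 -> (1 << 3) - 1 == 0b0111, i.e. xyz enabled. */
        unsigned nr_comp = 3;
        assert(((1u << nr_comp) - 1) == 0x7);

        return 0;
    }
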
@@ -1189,7 +1157,6 @@ emit_indirect_offset(compiler_context *ctx, nir_src *src)
case nir_op_##nir: \
op = midgard_alu_op_##_op; \
break;
-
static bool
nir_is_fzero_constant(nir_src src)
{
@@ -1332,53 +1299,34 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
break;
}
- /* For a few special csel cases not handled by NIR, we can opt to
- * bitwise. Otherwise, we emit the condition and do a real csel */
-
case nir_op_b32csel: {
- if (nir_is_fzero_constant(instr->src[2].src)) {
- /* (b ? v : 0) = (b & v) */
- op = midgard_alu_op_iand;
- nr_inputs = 2;
- } else if (nir_is_fzero_constant(instr->src[1].src)) {
- /* (b ? 0 : v) = (!b ? v : 0) = (~b & v) = (v & ~b) */
- op = midgard_alu_op_iandnot;
- nr_inputs = 2;
- instr->src[1] = instr->src[0];
- instr->src[0] = instr->src[2];
- } else {
- /* Midgard features both fcsel and icsel, depending on
- * the type of the arguments/output. However, as long
- * as we're careful we can _always_ use icsel and
- * _never_ need fcsel, since the latter does additional
- * floating-point-specific processing whereas the
- * former just moves bits on the wire. It's not obvious
- * why these are separate opcodes, save for the ability
- * to do things like sat/pos/abs/neg for free */
-
- op = midgard_alu_op_icsel;
+ /* Midgard features both fcsel and icsel, depending on
+ * the type of the arguments/output. However, as long
+ * as we're careful we can _always_ use icsel and
+ * _never_ need fcsel, since the latter does additional
+ * floating-point-specific processing whereas the
+ * former just moves bits on the wire. It's not obvious
+ * why these are separate opcodes, save for the ability
+ * to do things like sat/pos/abs/neg for free */
- /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
- nr_inputs = 2;
+ bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
+ op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel;
- /* Figure out which component the condition is in */
+ /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
+ nr_inputs = 2;
- unsigned comp = instr->src[0].swizzle[0];
+ /* Emit the condition into r31 */
- /* Make sure NIR isn't throwing a mixed condition at us */
-
- for (unsigned c = 1; c < nr_components; ++c)
- assert(instr->src[0].swizzle[c] == comp);
-
- /* Emit the condition into r31.w */
- emit_condition(ctx, &instr->src[0].src, false, comp);
+ if (mixed)
+ emit_condition_mixed(ctx, &instr->src[0], nr_components);
+ else
+ emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]);
- /* The condition is the first argument; move the other
- * arguments up one to be a binary instruction for
- * Midgard */
+ /* The condition is the first argument; move the other
+ * arguments up one to be a binary instruction for
+ * Midgard */
- memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
- }
+ memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
break;
}
@@ -2596,6 +2544,10 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
/* Ensure that the chain can continue */
if (ains->type != TAG_ALU_4) break;
+ /* If there's already something in the bundle and we
+ * have weird scheduler constraints, break now */
+ if (ains->precede_break && index) break;
+
/* According to the presentation "The ARM
* Mali-T880 Mobile GPU" from HotChips 27,
* there are two pipeline stages. Branching
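
In isolation, the precede_break check added above behaves like the following
toy model (hypothetical types and helper; the real logic lives in
schedule_bundle): a flagged instruction may open a bundle, but never join one
mid-way:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_ins { bool precede_break; };

    /* Count how many instructions fit in the current bundle: a flagged
     * instruction is only legal at index 0, mirroring the check above. */
    static int toy_bundle_len(const struct toy_ins *ins, int count)
    {
        int index = 0;
        for (; index < count; ++index) {
            if (ins[index].precede_break && index)
                break; /* force a bundle break before this instruction */
        }
        return index;
    }

    int main(void)
    {
        struct toy_ins prog[] = { {false}, {false}, {true}, {false} };
        printf("%d\n", toy_bundle_len(prog, 4)); /* prints 2 */
        return 0;
    }
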
@@ -3297,6 +3249,21 @@ midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
}
static bool
+mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
+{
+ /* abs or neg */
+ if (!is_int && src.mod) return true;
+
+ /* swizzle */
+ for (unsigned c = 0; c < 4; ++c) {
+ if (!(mask & (1 << c))) continue;
+ if (((src.swizzle >> (2*c)) & 3) != c) return true;
+ }
+
+ return false;
+}
+
+static bool
midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
{
bool progress = false;
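
As a sanity check on the swizzle decode in mir_nontrivial_mod (moved above),
note that the identity swizzle packs two bits per component and therefore
encodes as 0xE4. A minimal standalone sketch:

    #include <assert.h>

    int main(void)
    {
        /* Identity swizzle, two bits per component:
         * (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4 (w,z,y,x = 3,2,1,0) */
        unsigned swizzle = 0xE4;
        for (unsigned c = 0; c < 4; ++c)
            assert(((swizzle >> (2 * c)) & 3) == c);
        return 0;
    }
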
@@ -3315,6 +3282,10 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
if (to >= ctx->func->impl->ssa_alloc) continue;
if (from >= ctx->func->impl->ssa_alloc) continue;
+ /* Constant propagation is not handled here, either */
+ if (ins->ssa_args.inline_constant) continue;
+ if (ins->has_constants) continue;
+
/* Also, if the move has side effects, we're helpless */
midgard_vector_alu_src src =
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py b/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py
index 7c8cd06feed..df0caa26640 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py
+++ b/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py
@@ -28,6 +28,7 @@ import math
a = 'a'
b = 'b'
+c = 'c'
algebraic_late = [
# ineg must be lowered late, but only for integers; floats will try to
@@ -35,8 +36,13 @@ algebraic_late = [
# a more standard lower_negate approach
(('ineg', a), ('isub', 0, a)),
-]
+ # These two special cases save space/an op compared to the actual csel
+ # op, plus scheduler flexibility
+
+ (('b32csel', a, 'b@32', 0), ('iand', a, b)),
+ (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)),
+]
# Midgard scales fsin/fcos arguments by pi.
# Pass must be run only once, after the main loop