-rw-r--r--    src/gallium/drivers/panfrost/midgard/midgard_compile.c    90
-rw-r--r--    src/gallium/drivers/panfrost/pan_screen.c                  2
2 files changed, 76 insertions, 16 deletions
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index ce758f5555b..f0700d61f95 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -991,6 +991,34 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch)
         emit_mir_instruction(ctx, ins);
 }
 
+/* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
+ * pinning to eliminate this move in all known cases */
+
+static void
+emit_indirect_offset(compiler_context *ctx, nir_src *src)
+{
+        int offset = nir_src_index(ctx, src);
+
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .ssa_args = {
+                        .src0 = SSA_UNUSED_1,
+                        .src1 = offset,
+                        .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET),
+                },
+                .alu = {
+                        .op = midgard_alu_op_imov,
+                        .reg_mode = midgard_reg_mode_full,
+                        .dest_override = midgard_dest_override_none,
+                        .mask = (0x3 << 6), /* w */
+                        .src1 = vector_alu_srco_unsigned(zero_alu_src),
+                        .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx)
+                },
+        };
+
+        emit_mir_instruction(ctx, ins);
+}
+
 #define ALU_CASE(nir, _op) \
         case nir_op_##nir: \
                 op = midgard_alu_op_##_op; \
@@ -1260,23 +1288,22 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 #undef ALU_CASE
 
 static void
-emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset)
+emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset, nir_src *indirect_offset)
 {
         /* TODO: half-floats */
 
-        if (offset < ctx->uniform_cutoff) {
-                /* Fast path: For the first 16 uniform,
-                 * accesses are 0-cycle, since they're
-                 * just a register fetch in the usual
-                 * case. So, we alias the registers
-                 * while we're still in SSA-space */
+        if (!indirect_offset && offset < ctx->uniform_cutoff) {
+                /* Fast path: For the first 16 uniforms, direct accesses are
+                 * 0-cycle, since they're just a register fetch in the usual
+                 * case. So, we alias the registers while we're still in
+                 * SSA-space */
 
                 int reg_slot = 23 - offset;
                 alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot));
         } else {
-                /* Otherwise, read from the 'special'
-                 * UBO to access higher-indexed
-                 * uniforms, at a performance cost */
+                /* Otherwise, read from the 'special' UBO to access
+                 * higher-indexed uniforms, at a performance cost. More
+                 * generally, we're emitting a UBO read instruction. */
 
                 midgard_instruction ins = m_load_uniform_32(dest, offset);
 
@@ -1284,7 +1311,13 @@ emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset)
                 ins.load_store.varying_parameters = (offset & 7) << 7;
                 ins.load_store.address = offset >> 3;
 
-                ins.load_store.unknown = 0x1E00; /* xxx: what is this? */
+                if (indirect_offset) {
+                        emit_indirect_offset(ctx, indirect_offset);
+                        ins.load_store.unknown = 0x8700; /* xxx: what is this? */
+                } else {
+                        ins.load_store.unknown = 0x1E00; /* xxx: what is this? */
+                }
+
                 emit_mir_instruction(ctx, ins);
         }
 }
@@ -1302,7 +1335,8 @@ emit_sysval_read(compiler_context *ctx, nir_intrinsic_instr *instr)
         /* Sysvals are prefix uniforms */
         unsigned uniform = ((uintptr_t) val) - 1;
 
-        emit_uniform_read(ctx, dest, uniform);
+        /* Emit the read itself -- this is never indirect */
+        emit_uniform_read(ctx, dest, uniform, NULL);
 }
 
 static void
@@ -1328,14 +1362,18 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_input:
-                assert(nir_src_is_const(instr->src[0]) && "no indirect inputs");
+                offset = nir_intrinsic_base(instr);
 
-                offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
+                bool direct = nir_src_is_const(instr->src[0]);
+
+                if (direct) {
+                        offset += nir_src_as_uint(instr->src[0]);
+                }
+
                 reg = nir_dest_index(ctx, &instr->dest);
 
                 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
-                        emit_uniform_read(ctx, reg, ctx->sysval_count + offset);
+                        emit_uniform_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL);
                 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
                         /* XXX: Half-floats? */
                         /* TODO: swizzle, mask */
@@ -1352,7 +1390,16 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                         memcpy(&u, &p, sizeof(p));
                         ins.load_store.varying_parameters = u;
 
-                        ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
+                        if (direct) {
+                                /* We have the offset totally ready */
+                                ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
+                        } else {
+                                /* We have it partially ready, but we need to
+                                 * add in the dynamic index, moved to r27.w */
+                                emit_indirect_offset(ctx, &instr->src[0]);
+                                ins.load_store.unknown = 0x79e; /* xxx: what is this? */
+                        }
+
                         emit_mir_instruction(ctx, ins);
                 } else if (ctx->is_blend && instr->intrinsic == nir_intrinsic_load_uniform) {
                         /* Constant encoded as a pinned constant */
@@ -2978,8 +3025,19 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
                                 if (c->type != TAG_LOAD_STORE_4) continue;
 
+                                /* Stores cannot be reordered, since they have
+                                 * dependencies. For the same reason, indirect
+                                 * loads cannot be reordered as their index is
+                                 * loaded in r27.w */
+
                                 if (OP_IS_STORE(c->load_store.op)) continue;
 
+                                /* It appears the 0x800 bit is set whenever a
+                                 * load is direct, unset when it is indirect.
+                                 * Skip indirect loads. */
+
+                                if (!(c->load_store.unknown & 0x800)) continue;
+
                                 /* We found one! Move it up to pair and remove it from the old location */
 
                                 mir_insert_instruction_before(ins, *c);
diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c
index 6d3aca594f1..a296c254ef6 100644
--- a/src/gallium/drivers/panfrost/pan_screen.c
+++ b/src/gallium/drivers/panfrost/pan_screen.c
@@ -321,6 +321,8 @@ panfrost_get_shader_param(struct pipe_screen *screen,
 
         case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
         case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+                return 1;
+
         case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
                 return 0;
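
As a reading aid for the midgard_compile.c hunks above: direct reads of the first uniform_cutoff uniforms keep the register fast path (aliasing registers counting down from r23), while anything indirect always becomes a UBO-style load, preceded by an imov of the dynamic index into r27.w. The standalone sketch below just restates that decision and the offset packing outside the Mesa tree; the register numbers, the address/varying_parameters split and the unknown constants are copied from the diff, but the helper itself and its output format are invented for illustration.

/* Illustrative sketch only -- not Mesa code. Mirrors the behaviour of
 * emit_uniform_read()/emit_indirect_offset() from the diff above. */

#include <stdbool.h>
#include <stdio.h>

static void
describe_uniform_read(unsigned offset, bool direct, unsigned uniform_cutoff)
{
        if (direct && offset < uniform_cutoff) {
                /* Fast path: low uniforms are preloaded into registers
                 * counting down from r23, so the read is a pure alias */
                printf("uniform %2u: alias to r%u\n", offset, 23 - offset);
                return;
        }

        if (!direct) {
                /* Indirect reads first move the dynamic index into r27.w */
                printf("uniform %2u: imov dynamic index -> r27.w\n", offset);
        }

        /* Then a load from the 'special' UBO; the constant part of the
         * offset is split across address and varying_parameters */
        printf("uniform %2u: ld_uniform_32, address=%u, "
               "varying_parameters=0x%x, unknown=0x%x\n",
               offset, offset >> 3, (offset & 7) << 7,
               direct ? 0x1E00 : 0x8700);
}

int
main(void)
{
        describe_uniform_read(3, true, 16);   /* register-aliased fast path */
        describe_uniform_read(20, true, 16);  /* direct, but past the cutoff */
        describe_uniform_read(5, false, 16);  /* indirect: always a UBO read */
        return 0;
}

Note that 0x1E00 and 0x1e9e both have the 0x800 bit set while 0x8700 and 0x79e do not, which is exactly the property midgard_pair_load_store() relies on above to skip reordering indirect loads.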