author    Eric Anholt <[email protected]>    2015-07-30 11:16:13 -0700
committer Eric Anholt <[email protected]>    2015-07-30 15:47:12 -0700
commit    7830e465a5f446616ce49a7f8219256a5503a68b (patch)
tree      9b9b520aa904758c1e44662c4a7249f5d7f2da7d /src/gallium/drivers/vc4
parent    5a8c57b52287ba2bb8faa4447e7d1cc46ef1a3d4 (diff)
vc4: Lower uniform loads to scalar in NIR.
This also moves the vec4-to-byte-addressing math into NIR, so that algebraic has a chance at it.
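As a minimal sketch of the addressing math being moved into NIR (helper names are hypothetical, not part of the patch; reg is a TGSI vec4 uniform register index, comp a component in 0..3):

#include <stdint.h>

static uint32_t
direct_uniform_index(uint32_t reg, uint32_t comp)
{
        /* Non-indirect loads keep a dword index: 4 scalar slots per vec4. */
        return reg * 4 + comp;
}

static uint32_t
indirect_uniform_byte_offset(uint32_t reg, uint32_t comp)
{
        /* Indirect loads are byte-addressed: 16 bytes per vec4 register and
         * 4 bytes per 32-bit component.  The variable part of the index is
         * shifted left by 4 in NIR (reg << 4 == reg * 16), where algebraic
         * optimization can now fold it with surrounding offset math.
         */
        return reg * 16 + comp * 4;
}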
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r--  src/gallium/drivers/vc4/vc4_nir_lower_io.c  | 86
-rw-r--r--  src/gallium/drivers/vc4/vc4_program.c       | 26
2 files changed, 81 insertions, 31 deletions
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index fa06c893cfb..ffc120e8865 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -29,11 +29,30 @@
* Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
* something amenable to the VC4 architecture.
*
- * Currently, it split inputs and outputs into scalars, and drops any
- * non-position outputs in coordinate shaders.
+ * Currently, it splits inputs, outputs, and uniforms into scalars, drops any
+ * non-position outputs in coordinate shaders, and fixes up the addressing on
+ * indirect uniform loads.
*/
static void
+replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
+ nir_ssa_def **comps)
+{
+
+ /* Batch things back together into a vec4. This will get split by the
+ * later ALU scalarization pass.
+ */
+ nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]);
+
+ /* Replace the old intrinsic with a reference to our reconstructed
+ * vec4.
+ */
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec),
+ ralloc_parent(b->impl));
+ nir_instr_remove(&intr->instr);
+}
+
+static void
vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
nir_intrinsic_instr *intr)
{
@@ -102,18 +121,7 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
break;
}
- /* Batch things back together into a vec4. This will get split by the
- * later ALU scalarization pass.
- */
- nir_ssa_def *vec_instr = nir_vec4(b, dests[0], dests[1],
- dests[2], dests[3]);
-
- /* Replace the old intrinsic with a reference to our reconstructed
- * vec4.
- */
- nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec_instr),
- ralloc_parent(b->impl));
- nir_instr_remove(&intr->instr);
+ replace_intrinsic_with_vec4(b, intr, dests);
}
static void
@@ -159,6 +167,51 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
}
static void
+vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ /* All TGSI-to-NIR uniform loads are vec4. */
+ assert(intr->num_components == 4);
+
+ nir_builder_insert_before_instr(b, &intr->instr);
+
+ /* Generate scalar loads equivalent to the original VEC4. */
+ nir_ssa_def *dests[4];
+ for (unsigned i = 0; i < intr->num_components; i++) {
+ nir_intrinsic_instr *intr_comp =
+ nir_intrinsic_instr_create(c->s, intr->intrinsic);
+ intr_comp->num_components = 1;
+ nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+
+ if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) {
+ /* Convert the variable TGSI register index to a byte
+ * offset.
+ */
+ intr_comp->src[0] =
+ nir_src_for_ssa(nir_ishl(b,
+ intr->src[0].ssa,
+ nir_imm_int(b, 4)));
+
+ /* Convert the offset to be a byte index, too. */
+ intr_comp->const_index[0] = (intr->const_index[0] * 16 +
+ i * 4);
+ } else {
+ /* We want a dword index for non-indirect uniform
+ * loads.
+ */
+ intr_comp->const_index[0] = (intr->const_index[0] * 4 +
+ i);
+ }
+
+ dests[i] = &intr_comp->dest.ssa;
+
+ nir_builder_instr_insert(b, &intr_comp->instr);
+ }
+
+ replace_intrinsic_with_vec4(b, intr, dests);
+}
+
+static void
vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
struct nir_instr *instr)
{
@@ -175,6 +228,11 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
vc4_nir_lower_output(c, b, intr);
break;
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_uniform_indirect:
+ vc4_nir_lower_uniform(c, b, intr);
+ break;
+
default:
break;
}
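Before the vc4_program.c side, a sketch of what the new lowering produces (NIR dump syntax approximated, SSA numbers invented): a vec4 uniform load of register 0, such as

        vec4 ssa_1 = intrinsic load_uniform () () (0)

is rewritten into four single-component loads with dword indices reg * 4 + i, plus a vec4 that the later ALU scalarization pass splits back apart:

        vec1 ssa_2 = intrinsic load_uniform () () (0)
        vec1 ssa_3 = intrinsic load_uniform () () (1)
        vec1 ssa_4 = intrinsic load_uniform () () (2)
        vec1 ssa_5 = intrinsic load_uniform () () (3)
        vec4 ssa_6 = vec4 ssa_2, ssa_3, ssa_4, ssa_5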
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ddc997003b2..f2742986beb 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -42,6 +42,9 @@
#include "simpenrose/simpenrose.h"
#endif
+static struct qreg
+ntq_get_src(struct vc4_compile *c, nir_src src, int i);
+
static void
resize_qreg_array(struct vc4_compile *c,
struct qreg **regs,
@@ -64,10 +67,10 @@ resize_qreg_array(struct vc4_compile *c,
}
static struct qreg
-indirect_uniform_load(struct vc4_compile *c,
- struct qreg indirect_offset,
- unsigned offset)
+indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
{
+ struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
+ uint32_t offset = intr->const_index[0];
struct vc4_compiler_ubo_range *range = NULL;
unsigned i;
for (i = 0; i < c->num_uniform_ranges; i++) {
@@ -89,10 +92,6 @@ indirect_uniform_load(struct vc4_compile *c,
};
offset -= range->src_offset;
- /* Translate the user's TGSI register index from the TGSI register
- * base to a byte offset.
- */
- indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4));
/* Adjust for where we stored the TGSI register base. */
indirect_offset = qir_ADD(c, indirect_offset,
@@ -1793,19 +1792,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
- for (int i = 0; i < instr->num_components; i++) {
- dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
- instr->const_index[0] * 4 + i);
- }
+ assert(instr->num_components == 1);
+ *dest = qir_uniform(c, QUNIFORM_UNIFORM, instr->const_index[0]);
break;
case nir_intrinsic_load_uniform_indirect:
- for (int i = 0; i < instr->num_components; i++) {
- dest[i] = indirect_uniform_load(c,
- ntq_get_src(c, instr->src[0], 0),
- (instr->const_index[0] *
- 4 + i) * sizeof(float));
- }
+ *dest = indirect_uniform_load(c, instr);
break;
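A worked example tying the two files together (hypothetical shader, values derived from the math above): a read of component y of uniform register 2 is lowered in NIR to a single-component load_uniform with const_index[0] = 2 * 4 + 1 = 9, which the backend now emits directly as

        *dest = qir_uniform(c, QUNIFORM_UNIFORM, 9);

In the indirect case the constant part of the offset becomes 2 * 16 + 1 * 4 = 36 bytes, and the variable register index arrives from NIR already shifted left by 4, which is why indirect_uniform_load() no longer performs its own qir_SHL.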