summaryrefslogtreecommitdiffstats
path: root/src/broadcom/compiler
diff options
context:
space:
mode:
author Eric Anholt <[email protected]> 2019-01-03 12:13:18 -0800
committer Eric Anholt <[email protected]> 2019-01-04 15:41:23 -0800
commit f8a8de8b9a69fc6f4a8fc86a71f81c168cdd18b0 (patch)
tree cd56f3c421d9c18be0d943a47685fe063715b491 /src/broadcom/compiler
parent b0e008625706c34758defffa860d087d746261b5 (diff)
v3d: Do UBO loads a vector at a time.
In the process of adding support for SSBOs and CS shared vars, I ended up needing a helper function for doing TMU general ops. This helper can be that starting point, and saves us a bunch of round-trips to the TMU by loading a vector at a time.
Diffstat (limited to 'src/broadcom/compiler')
-rw-r--r-- src/broadcom/compiler/nir_to_vir.c 124
-rw-r--r-- src/broadcom/compiler/vir.c 10
2 files changed, 99 insertions, 35 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index f2099182dcb..91d4ab0020e 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -32,6 +32,40 @@
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
+#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) /* Bit 7 of the config value built in ntq_emit_tmu_general(): lookup granularity. */
+#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
+#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3) /* Bits 3..6: operation code for read-style accesses. */
+#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3) /* NOTE(review): same encoding as CACHE_CLEAN on the next line -- confirm against the hardware documentation. */
+#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3)
+#define GENERAL_TMU_READ_OP_READ (15 << 3) /* The only read op ntq_emit_tmu_general() uses. */
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0) /* Bits 0..2: lookup type. */
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0) /* VEC2..VEC4 are consecutive; ntq_emit_tmu_general() relies on that when it adds num_components - 2. */
+#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
+
+#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3) /* Operation codes for write-style accesses, in the same bits 3..6; not referenced by the code visible in this patch. */
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG (3 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN (4 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX (5 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN (6 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX (7 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_AND (8 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_OR (9 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3)
+#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3)
+
static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
@@ -73,6 +107,60 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
}
+/**
+ * Emits a read through the TMU general memory access interface: one address
+ * write, one vir_emit_thrsw(), then an LDTMU per destination component.
+ */
+static void
+ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
+ bool has_index = instr->intrinsic == nir_intrinsic_load_ubo; /* load_ubo uses src[0] as the buffer index, so its offset is src[1]. */
+ int offset_src = 0 + has_index;
+
+ /* QUNIFORM_UBO_ADDR takes the UBO index (src[0]) shifted up by 1, since
+ * 0 is gallium's constant buffer 0. NOTE(review): presumably src[0] must be constant -- confirm.
+ */
+ struct qreg offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ nir_src_as_uint(instr->src[0]) + 1);
+
+ uint32_t config = (0xffffff00 |
+ tmu_op |
+ GENERAL_TMU_LOOKUP_PER_PIXEL); /* 0xfffffff8 before the lookup type is OR'd in. */
+ if (instr->num_components == 1) {
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; /* Makes config all ones (== ~0). */
+ } else {
+ config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+ instr->num_components - 2); /* 2 -> VEC2, 3 -> VEC3, 4 -> VEC4. */
+ }
+
+ struct qreg dest; /* An all-ones config is written to TMUA with no extra uniform; any other config goes to TMUAU. */
+ if (config == ~0)
+ dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+ else
+ dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+ struct qinst *tmu; /* The address write: a MOV of the base when the offset is a constant 0, otherwise base + offset. */
+ if (nir_src_is_const(instr->src[offset_src]) &&
+ nir_src_as_uint(instr->src[offset_src]) == 0) {
+ tmu = vir_MOV_dest(c, dest, offset);
+ } else {
+ tmu = vir_ADD_dest(c, dest,
+ offset,
+ ntq_get_src(c, instr->src[offset_src], 0));
+ }
+
+ if (config != ~0) { /* Attach the config value as the address write's implicit uniform source. */
+ tmu->src[vir_get_implicit_uniform_src(tmu)] =
+ vir_uniform_ui(c, config);
+ }
+
+ vir_emit_thrsw(c);
+
+ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) /* NOTE(review): config above is sized from instr->num_components; presumably the two counts agree -- confirm. */
+ ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+}
+
static struct qreg
indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
{
@@ -1547,41 +1635,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_ubo:
- for (int i = 0; i < instr->num_components; i++) {
- int ubo = nir_src_as_uint(instr->src[0]);
-
- /* XXX perf: On V3D 4.x with uniform offsets, we
- * should probably try setting UBOs up in the A
- * register file and doing a sequence of loads that
- * way.
- */
- /* Adjust for where we stored the TGSI register base. */
- vir_ADD_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
- vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
- vir_ADD(c,
- ntq_get_src(c, instr->src[1], 0),
- vir_uniform_ui(c, i * 4)));
-
- vir_emit_thrsw(c);
-
- ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
- }
- break;
-
- if (nir_src_is_const(instr->src[0])) {
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset));
- } else {
- ntq_store_dest(c, &instr->dest, 0,
- indirect_uniform_load(c, instr));
- }
+ ntq_emit_tmu_general(c, instr);
break;
case nir_intrinsic_load_user_clip_plane:
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 2f32359f384..6eb346ce9fd 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -61,6 +61,16 @@ vir_has_implicit_uniform(struct qinst *inst)
switch (inst->dst.file) {
case QFILE_TLBU:
return true;
+ case QFILE_MAGIC:
+ switch (inst->dst.index) {
+ case V3D_QPU_WADDR_TLBU:
+ case V3D_QPU_WADDR_TMUAU:
+ case V3D_QPU_WADDR_SYNCU:
+ return true;
+ default:
+ break;
+ }
+ break;
default:
return inst->has_implicit_uniform;
}