author     Eric Anholt <[email protected]>    2017-12-10 17:11:25 -0800
committer  Eric Anholt <[email protected]>    2019-01-14 15:40:55 -0800
commit     5932c2f0b9b56e6eeee87baa7b0b493227850f69 (patch)
tree       c93d65bea9cebf3f236fc72ad6960ef7dca1b416 /src/broadcom/compiler/nir_to_vir.c
parent     6c8edcb89c1264c43f5e98444551edb8df2f91f9 (diff)
v3d: Add SSBO/atomic counters support.
So far I assume that all the buffers get written. If they weren't, you'd probably be using UBOs instead.
Diffstat (limited to 'src/broadcom/compiler/nir_to_vir.c')
-rw-r--r--  src/broadcom/compiler/nir_to_vir.c  135
1 file changed, 129 insertions(+), 6 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 2e7b1e8e8a2..b8e39f357f7 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -107,16 +107,89 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
}
+static uint32_t
+v3d_general_tmu_op(nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_uniform:
+ return GENERAL_TMU_READ_OP_READ;
+ case nir_intrinsic_store_ssbo:
+ return GENERAL_TMU_WRITE_OP_WRITE;
+ case nir_intrinsic_ssbo_atomic_add:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+ case nir_intrinsic_ssbo_atomic_imin:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+ case nir_intrinsic_ssbo_atomic_umin:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+ case nir_intrinsic_ssbo_atomic_imax:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+ case nir_intrinsic_ssbo_atomic_umax:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+ case nir_intrinsic_ssbo_atomic_and:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+ case nir_intrinsic_ssbo_atomic_or:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+ case nir_intrinsic_ssbo_atomic_xor:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+ default:
+ unreachable("unknown intrinsic op");
+ }
+}
+
/**
- * Implements indirect uniform loads through the TMU general memory access
- * interface.
+ * Implements indirect uniform loads and SSBO accesses through the TMU general
+ * memory access interface.
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
- bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
- int offset_src = 0 + has_index;
+ /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
+ * wants to have support for inc/dec?
+ */
+
+ uint32_t tmu_op = v3d_general_tmu_op(instr);
+ bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
+
+ int offset_src;
+ int tmu_writes = 1; /* address */
+ if (instr->intrinsic == nir_intrinsic_load_uniform) {
+ offset_src = 0;
+ } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_ubo) {
+ offset_src = 1;
+ } else if (is_store) {
+ offset_src = 2;
+ for (int i = 0; i < instr->num_components; i++) {
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[0], i));
+ tmu_writes++;
+ }
+ } else {
+ offset_src = 1;
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[2], 0));
+ tmu_writes++;
+ if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
+ vir_MOV_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ ntq_get_src(c, instr->src[3], 0));
+ tmu_writes++;
+ }
+ }
+
+ /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
+ * storing at the same time.
+ */
+ while (tmu_writes > 16 / c->threads)
+ c->threads /= 2;
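
As a worked example of the FIFO clamp just above: a 4-thread shader doing a 4-component SSBO store queues one address write plus four data writes, so tmu_writes is 5 and the shader has to drop to 2 threads (5 > 16/4, but 5 <= 16/2). A minimal standalone sketch of the same rule; the helper name is invented for illustration, and tmu_writes here is at most 5 (an address plus up to four data words):

/* Hypothetical helper mirroring the clamp above: halve the thread count
 * until one thread's TMU writes fit in its share of the 16-entry FIFO. */
static int
clamp_threads_for_tmu_writes(int threads, int tmu_writes)
{
        while (tmu_writes > 16 / threads)
                threads /= 2;
        return threads;
}

/* clamp_threads_for_tmu_writes(4, 5) == 2 */
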
struct qreg offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -149,12 +222,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
if (base != 0)
offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
- } else {
+ } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
nir_src_as_uint(instr->src[0]) + 1);
+ } else {
+ offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+ nir_src_as_uint(instr->src[is_store ?
+ 1 : 0]));
}
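
A small sketch of the source-index choice in the vir_uniform() call above, consistent with the operand layout the rest of this function relies on (store_ssbo keeps the value in src[0] and the block index in src[1]; load_ssbo and the ssbo_atomic_* intrinsics keep the block index in src[0]); the helper itself is hypothetical:

/* Hypothetical: pick the NIR source that holds the SSBO block index. */
static unsigned
ssbo_index_src(const nir_intrinsic_instr *instr)
{
        return instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
}
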
uint32_t config = (0xffffff00 |
@@ -167,6 +244,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
instr->num_components - 2);
}
+ if (c->execute.file != QFILE_NULL)
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+
struct qreg dest;
if (config == ~0)
dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
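
The config value ~0 acts as the sentinel for "default TMU configuration": in that case the address can go to plain TMUA with no config word, while any other value appears to need the TMUAU write address plus the config supplied as an extra uniform (the vir_uniform_ui(c, config) visible in the next hunk). A hypothetical helper naming that rule, assuming the V3D_QPU_WADDR_TMUAU enumerant from the QPU instruction definitions:

/* Hypothetical: choose the TMU address register depending on whether a
 * non-default lookup configuration has to accompany the address. */
static enum v3d_qpu_waddr
tmu_address_reg(uint32_t config)
{
        return config == ~0 ? V3D_QPU_WADDR_TMUA : V3D_QPU_WADDR_TMUAU;
}
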
@@ -188,10 +268,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_uniform_ui(c, config);
}
+ if (c->execute.file != QFILE_NULL)
+ vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
vir_emit_thrsw(c);
+ /* Read the result, or wait for the TMU op to complete. */
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+
+ if (nir_intrinsic_dest_components(instr) == 0)
+ vir_TMUWT(c);
}
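
Two patterns in this hunk are worth spelling out. First, inside non-uniform control flow (c->execute is live) the address write is predicated: PUSHZ on the execute mask (a zero value marks an active channel in this backend) followed by IFA on the TMU instruction, so inactive channels never issue the access. Second, after the thread switch the TMU is drained with one LDTMU per destination component, or fenced with TMUWT when the intrinsic returns nothing (store_ssbo). A hypothetical wrapper restating the drain rule with the helpers already used above:

/* Hypothetical: read back the TMU result, or just wait for the write
 * side effect to land when there is nothing to read. */
static void
ntq_drain_tmu(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        unsigned components = nir_intrinsic_dest_components(instr);

        for (unsigned i = 0; i < components; i++)
                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
        if (components == 0)
                vir_TMUWT(c);
}
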
static struct qreg *
@@ -1549,6 +1636,9 @@ ntq_setup_uniforms(struct v3d_compile *c)
false);
unsigned vec4_size = 4 * sizeof(float);
+ if (var->data.mode != nir_var_uniform)
+ continue;
+
declare_uniform_range(c, var->data.driver_location * vec4_size,
vec4_count * vec4_size);
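
The early continue added above keeps the new buffer-backed variables out of the default uniform range bookkeeping; only plain uniforms should feed declare_uniform_range(). A trivial hypothetical predicate naming the rule:

/* Hypothetical: buffer variables are reached through the TMU path and
 * must not be counted as ordinary uniform storage. */
static bool
var_counts_toward_uniforms(const nir_variable *var)
{
        return var->data.mode == nir_var_uniform;
}
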
@@ -1629,6 +1719,27 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_tmu_general(c, instr);
break;
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_store_ssbo:
+ ntq_emit_tmu_general(c, instr);
+ break;
+
+ case nir_intrinsic_get_buffer_size:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
+ nir_src_as_uint(instr->src[0])));
+ break;
+
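
For readers skimming the dispatch: the whole new case group funnels into ntq_emit_tmu_general() above, and get_buffer_size is resolved purely through a driver-patched uniform, so the buffer index has to be a compile-time constant here (nir_src_as_uint). A hypothetical predicate equivalent to the grouping:

/* Hypothetical: true for the SSBO intrinsics this commit routes through
 * the general TMU path. */
static bool
is_tmu_general_ssbo_intrinsic(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_ssbo_atomic_add:
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_ssbo_atomic_comp_swap:
                return true;
        default:
                return false;
        }
}
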
case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < instr->num_components; i++) {
ntq_store_dest(c, &instr->dest, i,
@@ -1732,6 +1843,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
}
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
+ case nir_intrinsic_memory_barrier_buffer:
+ /* We don't do any instruction scheduling of these NIR
+ * instructions between each other, so we just need to make
+ * sure that the TMU operations before the barrier are flushed
+ * before the ones after the barrier. That is currently
+ * handled by having a THRSW in each of them and a LDTMU
+ * series or a TMUWT after.
+ */
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);