v3d: refactor ntq_emit_tmu_general() slightly

When we implement write masks on store operations we might need to emit multiple write sequences for a given store intrinsic. To make that easier, let's split the emission of the tmud instructions to their own block after we are done with the code that only needs to run once no matter how many write sequences we need to emit. Reviewed-by: Eric Anholt <[email protected]>
author: Iago Toral Quiroga <[email protected]> 2019-08-06 12:18:17 +0200
committer: Iago Toral Quiroga <[email protected]> 2019-08-13 08:38:19 +0200
commit: 3d65d2a4883bcf0cdc2eb3a2eeafda1d3c784b9b (patch)
tree: bcfb4305166cb3417831147d7b4d4424bea13f42 /src/broadcom
parent: b594796f1b86952268372622ba844210fa54f6e5 (diff)
1 files changed, 36 insertions, 24 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 03dac60645d..3857f96a9bd 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -196,13 +196,20 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                      instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
                                     (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                                      tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+
         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                          instr->intrinsic == nir_intrinsic_store_scratch ||
                          instr->intrinsic == nir_intrinsic_store_shared);
+
+        bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
+                        instr->intrinsic == nir_intrinsic_load_ubo ||
+                        instr->intrinsic == nir_intrinsic_load_ssbo ||
+                        instr->intrinsic == nir_intrinsic_load_scratch ||
+                        instr->intrinsic == nir_intrinsic_load_shared);
+
         bool has_index = !is_shared_or_scratch;
 
         int offset_src;
-        int tmu_writes = 1; /* address */
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
                 offset_src = 0;
         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
@@ -213,25 +220,8 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                 offset_src = 0 + has_index;
         } else if (is_store) {
                 offset_src = 1 + has_index;
-                for (int i = 0; i < instr->num_components; i++) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[0], i));
-                        tmu_writes++;
-                }
         } else {
                 offset_src = 0 + has_index;
-                vir_MOV_dest(c,
-                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                             ntq_get_src(c, instr->src[1 + has_index], 0));
-                tmu_writes++;
-                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[2 + has_index],
-                                                 0));
-                        tmu_writes++;
-                }
         }
 
         bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
@@ -239,12 +229,6 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         if (!dynamic_src)
                 const_offset = nir_src_as_uint(instr->src[offset_src]);
 
-        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
-         * storing at the same time.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
-
         struct qreg offset;
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
                 const_offset += nir_intrinsic_base(instr);
@@ -277,6 +261,34 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                                                 1 : 0]));
         }
 
+        int tmu_writes = 1; /* address */
+        if (is_store) {
+                for (int i = 0; i < instr->num_components; i++) {
+                        vir_MOV_dest(c,
+                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                     ntq_get_src(c, instr->src[0], i));
+                        tmu_writes++;
+                }
+        } else if (!is_load && !atomic_add_replaced) {
+                vir_MOV_dest(c,
+                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                             ntq_get_src(c, instr->src[1 + has_index], 0));
+                tmu_writes++;
+                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                        vir_MOV_dest(c,
+                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                     ntq_get_src(c, instr->src[2 + has_index],
+                                                 0));
+                        tmu_writes++;
+                }
+        }
+
+        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
+         * storing at the same time.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
         /* The spec says that for atomics, the TYPE field is ignored, but that
          * doesn't seem to be the case for CMPXCHG.  Just use the number of
          * tmud writes we did to decide the type (or choose "32bit" for atomic
author	Iago Toral Quiroga <[email protected]>	2019-08-06 12:18:17 +0200
committer	Iago Toral Quiroga <[email protected]>	2019-08-13 08:38:19 +0200
commit	3d65d2a4883bcf0cdc2eb3a2eeafda1d3c784b9b (patch)
tree	bcfb4305166cb3417831147d7b4d4424bea13f42 /src/broadcom
parent	b594796f1b86952268372622ba844210fa54f6e5 (diff)