nir/lower_vec_to_movs: Coalesce movs on-the-fly when possible

The old pass blindly inserted a bunch of moves into the shader with no concern for whether or not it was really needed. This adds code to try and coalesce into the destination of the instruction providing the value. Shader-db results for vec4 shaders on Haswell: total instructions in shared programs: 1754420 -> 1747753 (-0.38%) instructions in affected programs: 231230 -> 224563 (-2.88%) helped: 1017 HURT: 2 This approach is heavily based on a different patch by Eduardo Lima Mitev <[email protected]>. Eduardo's patch did this in a separate pass as opposed to integrating it into nir_lower_vec_to_movs. Reviewed-by: Eduardo Lima Mitev <[email protected]>
author: Jason Ekstrand <[email protected]> 2015-09-09 14:40:06 -0700
committer: Jason Ekstrand <[email protected]> 2015-09-15 12:38:07 -0700
commit: 2458ea95c5676807a064f24ec720f12506975402 (patch)
tree: 96ddba5a578d276ebe526ae3c605f3f141c46ad0
parent: 2b2f1f16a04dfba4e02a7f331befb5bd94d4a1be (diff)
1 files changed, 85 insertions, 0 deletions
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 29dd0ca7235..9ff86ea7543 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -79,6 +79,88 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    return mov->dest.write_mask;
 }
 
+/* Attempts to coalesce the "move" from the given source of the vec to the
+ * destination of the instruction generating the value. If, for whatever
+ * reason, we cannot coalesce the mmove, it does nothing and returns 0.  We
+ * can then call insert_mov as normal.
+ */
+static unsigned
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
+{
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);
+
+   /* We will only even try if the source is SSA */
+   if (!vec->src[start_idx].src.is_ssa)
+      return 0;
+
+   assert(vec->src[start_idx].src.ssa);
+
+   /* If we are going to do a reswizzle, then the vecN operation must be the
+    * only use of the source value.  We also can't have any source modifiers.
+    */
+   nir_foreach_use(vec->src[start_idx].src.ssa, src) {
+      if (src->parent_instr != &vec->instr)
+         return 0;
+
+      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
+      if (alu_src->abs || alu_src->negate)
+         return 0;
+   }
+
+   if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
+      return 0;
+
+   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return 0;
+
+   nir_alu_instr *src_alu =
+      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
+
+   /* We only care about being able to re-swizzle the instruction if it is
+    * something that we can reswizzle.  It must be per-component.
+    */
+   if (nir_op_infos[src_alu->op].output_size != 0)
+      return 0;
+
+   /* If we are going to reswizzle the instruction, we can't have any
+    * non-per-component sources either.
+    */
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+         return 0;
+
+   /* Stash off all of the ALU instruction's swizzles. */
+   uint8_t swizzles[4][4];
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      for (unsigned i = 0; i < 4; i++)
+         swizzles[j][i] = src_alu->src[j].swizzle[i];
+
+   unsigned write_mask = 0;
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(vec->dest.write_mask & (1 << i)))
+         continue;
+
+      if (!vec->src[i].src.is_ssa ||
+          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
+         continue;
+
+      /* At this point, the give vec source matchese up with the ALU
+       * instruction so we can re-swizzle that component to match.
+       */
+      write_mask |= 1 << i;
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+
+      /* Clear the no longer needed vec source */
+      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
+   }
+
+   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
+   src_alu->dest.write_mask = write_mask;
+
+   return write_mask;
+}
+
 static bool
 lower_vec_to_movs_block(nir_block *block, void *void_impl)
 {
@@ -133,6 +215,9 @@ lower_vec_to_movs_block(nir_block *block, void *void_impl)
             continue;
 
          if (!(finished_write_mask & (1 << i)))
+            finished_write_mask |= try_coalesce(vec, i, shader);
+
+         if (!(finished_write_mask & (1 << i)))
             finished_write_mask |= insert_mov(vec, i, shader);
       }
author	Jason Ekstrand <[email protected]>	2015-09-09 14:40:06 -0700
committer	Jason Ekstrand <[email protected]>	2015-09-15 12:38:07 -0700
commit	2458ea95c5676807a064f24ec720f12506975402 (patch)
tree	96ddba5a578d276ebe526ae3c605f3f141c46ad0
parent	2b2f1f16a04dfba4e02a7f331befb5bd94d4a1be (diff)