nir/lower_vec_to_movs: Coalesce into destinations of fdot instructions

Now that we have a replicating fdot instruction, we can actually coalesce into the destinations of vec4 instructions. We couldn't really do this before because, if the destination had to end up in .z, we couldn't reswizzle the instruction. With a replicated destination, the result ends up in all channels so we can just set the writemask and we're done.

Shader-db results for vec4 programs on Haswell:

   total instructions in shared programs: 1747753 -> 1746280 (-0.08%)
   instructions in affected programs:     143274 -> 141801 (-1.03%)
   helped:                                667
   HURT:                                  0

It turns out that dot-products matter...

Reviewed-by: Eduardo Lima Mitev <[email protected]>
author: Jason Ekstrand <[email protected]> 2015-09-09 17:18:55 -0700
committer: Jason Ekstrand <[email protected]> 2015-09-15 12:38:48 -0700
commit: 29348631fe7bf732a38856ea842cfc7aa2263468 (patch)
tree: 6a682b7036ca7c1ff9bf5447f283b9335f9b8a5b /src/glsl
parent: a88ce0c1c4c1f77209b71d5a6858f952642f385a (diff)
1 files changed, 36 insertions, 13 deletions
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 9ff86ea7543..2cb0457f9ba 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -79,6 +79,14 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    return mov->dest.write_mask;
 }
 
+static bool
+has_replicated_dest(nir_alu_instr *alu)
+{
+   return alu->op == nir_op_fdot_replicated2 ||
+          alu->op == nir_op_fdot_replicated3 ||
+          alu->op == nir_op_fdot_replicated4;
+}
+
 /* Attempts to coalesce the "move" from the given source of the vec to the
  * destination of the instruction generating the value. If, for whatever
  * reason, we cannot coalesce the mmove, it does nothing and returns 0.  We
@@ -116,19 +124,28 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    nir_alu_instr *src_alu =
       nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
 
-   /* We only care about being able to re-swizzle the instruction if it is
-    * something that we can reswizzle.  It must be per-component.
-    */
-   if (nir_op_infos[src_alu->op].output_size != 0)
-      return 0;
-
-   /* If we are going to reswizzle the instruction, we can't have any
-    * non-per-component sources either.
-    */
-   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+   if (has_replicated_dest(src_alu)) {
+      /* The fdot instruction is special: It replicates its result to all
+       * components.  This means that we can always rewrite its destination
+       * and we don't need to swizzle anything.
+       */
+   } else {
+      /* We only care about being able to re-swizzle the instruction if it is
+       * something that we can reswizzle.  It must be per-component.  The one
+       * exception to this is the fdot_replicatedN instructions (handled
+       * above), which implicitly splat their result out to all channels.
+       */
+      if (nir_op_infos[src_alu->op].output_size != 0)
          return 0;
 
+      /* If we are going to reswizzle the instruction, we can't have any
+       * non-per-component sources either.
+       */
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+            return 0;
+   }
+
    /* Stash off all of the ALU instruction's swizzles. */
    uint8_t swizzles[4][4];
    for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
@@ -148,8 +165,14 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
        * instruction so we can re-swizzle that component to match.
        */
       write_mask |= 1 << i;
-      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      if (has_replicated_dest(src_alu)) {
+         /* Since the destination is a single replicated value, we don't need
+          * to do any reswizzling
+          */
+      } else {
+         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      }
 
       /* Clear the no longer needed vec source */
       nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
author	Jason Ekstrand <[email protected]>	2015-09-09 17:18:55 -0700
committer	Jason Ekstrand <[email protected]>	2015-09-15 12:38:48 -0700
commit	29348631fe7bf732a38856ea842cfc7aa2263468 (patch)
tree	6a682b7036ca7c1ff9bf5447f283b9335f9b8a5b /src/glsl
parent	a88ce0c1c4c1f77209b71d5a6858f952642f385a (diff)