intel/compiler: Add a flag to avoid compacting push constants

In vec4, we can just not run the pass. In fs, things are a bit more deeply intertwined. Reviewed-by: Lionel Landwerlin <[email protected]>
author: Jason Ekstrand <[email protected]> 2019-10-31 15:57:52 -0500
committer: Jason Ekstrand <[email protected]> 2019-11-18 18:35:14 +0000
commit: d1c4e64a69e49c64148529024ecb700d18d3c1c8 (patch)
tree: 728c7cfc80cfd4ab9c47f8419f7ff09014a1bec5 /src/intel
parent: aecde235198f6c8dccb0d26b6397f1efb5e22bfe (diff)
4 files changed, 168 insertions, 145 deletions
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index dd2de2094e7..6e6610a26af 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -119,6 +119,12 @@ struct brw_compiler {
     * whether nir_opt_large_constants will be run.
     */
    bool supports_shader_constants;
+
+   /**
+    * Whether or not the driver wants uniform params to be compacted by the
+    * back-end compiler.
+    */
+   bool compact_params;
 };
 
 /**
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 37a90d0d489..0d3d6137058 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2307,159 +2307,190 @@ fs_visitor::assign_constant_locations()
       return;
    }
 
-   struct uniform_slot_info slots[uniforms];
-   memset(slots, 0, sizeof(slots));
+   if (compiler->compact_params) {
+      struct uniform_slot_info slots[uniforms];
+      memset(slots, 0, sizeof(slots));
 
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      for (int i = 0 ; i < inst->sources; i++) {
-         if (inst->src[i].file != UNIFORM)
-            continue;
+      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+         for (int i = 0 ; i < inst->sources; i++) {
+            if (inst->src[i].file != UNIFORM)
+               continue;
 
-         /* NIR tightly packs things so the uniform number might not be
-          * aligned (if we have a double right after a float, for instance).
-          * This is fine because the process of re-arranging them will ensure
-          * that things are properly aligned.  The offset into that uniform,
-          * however, must be aligned.
-          *
-          * In Vulkan, we have explicit offsets but everything is crammed
-          * into a single "variable" so inst->src[i].nr will always be 0.
-          * Everything will be properly aligned relative to that one base.
-          */
-         assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
+            /* NIR tightly packs things so the uniform number might not be
+             * aligned (if we have a double right after a float, for
+             * instance).  This is fine because the process of re-arranging
+             * them will ensure that things are properly aligned.  The offset
+             * into that uniform, however, must be aligned.
+             *
+             * In Vulkan, we have explicit offsets but everything is crammed
+             * into a single "variable" so inst->src[i].nr will always be 0.
+             * Everything will be properly aligned relative to that one base.
+             */
+            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
 
-         unsigned u = inst->src[i].nr +
-                      inst->src[i].offset / UNIFORM_SLOT_SIZE;
+            unsigned u = inst->src[i].nr +
+                         inst->src[i].offset / UNIFORM_SLOT_SIZE;
 
-         if (u >= uniforms)
-            continue;
+            if (u >= uniforms)
+               continue;
 
-         unsigned slots_read;
-         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
-            slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
-         } else {
-            unsigned bytes_read = inst->components_read(i) *
-                                  type_sz(inst->src[i].type);
-            slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
-         }
+            unsigned slots_read;
+            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
+            } else {
+               unsigned bytes_read = inst->components_read(i) *
+                                     type_sz(inst->src[i].type);
+               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
+            }
 
-         assert(u + slots_read <= uniforms);
-         mark_uniform_slots_read(&slots[u], slots_read,
-                                 type_sz(inst->src[i].type));
+            assert(u + slots_read <= uniforms);
+            mark_uniform_slots_read(&slots[u], slots_read,
+                                    type_sz(inst->src[i].type));
+         }
       }
-   }
 
-   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
+      int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
 
-   /* Only allow 16 registers (128 uniform components) as push constants.
-    *
-    * Just demote the end of the list.  We could probably do better
-    * here, demoting things that are rarely used in the program first.
-    *
-    * If changing this value, note the limitation about total_regs in
-    * brw_curbe.c.
-    */
-   unsigned int max_push_components = 16 * 8;
-   if (subgroup_id_index >= 0)
-      max_push_components--; /* Save a slot for the thread ID */
+      /* Only allow 16 registers (128 uniform components) as push constants.
+       *
+       * Just demote the end of the list.  We could probably do better
+       * here, demoting things that are rarely used in the program first.
+       *
+       * If changing this value, note the limitation about total_regs in
+       * brw_curbe.c.
+       */
+      unsigned int max_push_components = 16 * 8;
+      if (subgroup_id_index >= 0)
+         max_push_components--; /* Save a slot for the thread ID */
 
-   /* We push small arrays, but no bigger than 16 floats.  This is big enough
-    * for a vec4 but hopefully not large enough to push out other stuff.  We
-    * should probably use a better heuristic at some point.
-    */
-   const unsigned int max_chunk_size = 16;
+      /* We push small arrays, but no bigger than 16 floats.  This is big
+       * enough for a vec4 but hopefully not large enough to push out other
+       * stuff.  We should probably use a better heuristic at some point.
+       */
+      const unsigned int max_chunk_size = 16;
 
-   unsigned int num_push_constants = 0;
-   unsigned int num_pull_constants = 0;
+      unsigned int num_push_constants = 0;
+      unsigned int num_pull_constants = 0;
 
-   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
 
-   /* Default to -1 meaning no location */
-   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
-   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+      /* Default to -1 meaning no location */
+      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
-   int chunk_start = -1;
-   struct cplx_align align;
-   for (unsigned u = 0; u < uniforms; u++) {
-      if (!slots[u].is_live) {
-         assert(chunk_start == -1);
-         continue;
-      }
+      int chunk_start = -1;
+      struct cplx_align align;
+      for (unsigned u = 0; u < uniforms; u++) {
+         if (!slots[u].is_live) {
+            assert(chunk_start == -1);
+            continue;
+         }
 
-      /* Skip subgroup_id_index to put it in the last push register. */
-      if (subgroup_id_index == (int)u)
-         continue;
+         /* Skip subgroup_id_index to put it in the last push register. */
+         if (subgroup_id_index == (int)u)
+            continue;
 
-      if (chunk_start == -1) {
-         chunk_start = u;
-         align = slots[u].align;
-      } else {
-         /* Offset into the chunk */
-         unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
+         if (chunk_start == -1) {
+            chunk_start = u;
+            align = slots[u].align;
+         } else {
+            /* Offset into the chunk */
+            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
 
-         /* Shift the slot alignment down by the chunk offset so it is
-          * comparable with the base chunk alignment.
-          */
-         struct cplx_align slot_align = slots[u].align;
-         slot_align.offset =
-            (slot_align.offset - chunk_offset) & (align.mul - 1);
+            /* Shift the slot alignment down by the chunk offset so it is
+             * comparable with the base chunk alignment.
+             */
+            struct cplx_align slot_align = slots[u].align;
+            slot_align.offset =
+               (slot_align.offset - chunk_offset) & (align.mul - 1);
 
-         align = cplx_align_combine(align, slot_align);
-      }
+            align = cplx_align_combine(align, slot_align);
+         }
 
-      /* Sanity check the alignment */
-      cplx_align_assert_sane(align);
+         /* Sanity check the alignment */
+         cplx_align_assert_sane(align);
 
-      if (slots[u].contiguous)
-         continue;
+         if (slots[u].contiguous)
+            continue;
 
-      /* Adjust the alignment to be in terms of slots, not bytes */
-      assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      align.mul /= UNIFORM_SLOT_SIZE;
-      align.offset /= UNIFORM_SLOT_SIZE;
-
-      unsigned push_start_align = cplx_align_apply(align, num_push_constants);
-      unsigned chunk_size = u - chunk_start + 1;
-      if ((!compiler->supports_pull_constants && u < UBO_START) ||
-          (chunk_size < max_chunk_size &&
-           push_start_align + chunk_size <= max_push_components)) {
-         /* Align up the number of push constants */
-         num_push_constants = push_start_align;
-         for (unsigned i = 0; i < chunk_size; i++)
-            push_constant_loc[chunk_start + i] = num_push_constants++;
-      } else {
-         /* We need to pull this one */
-         num_pull_constants = cplx_align_apply(align, num_pull_constants);
-         for (unsigned i = 0; i < chunk_size; i++)
-            pull_constant_loc[chunk_start + i] = num_pull_constants++;
+         /* Adjust the alignment to be in terms of slots, not bytes */
+         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         align.mul /= UNIFORM_SLOT_SIZE;
+         align.offset /= UNIFORM_SLOT_SIZE;
+
+         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
+         unsigned chunk_size = u - chunk_start + 1;
+         if ((!compiler->supports_pull_constants && u < UBO_START) ||
+             (chunk_size < max_chunk_size &&
+              push_start_align + chunk_size <= max_push_components)) {
+            /* Align up the number of push constants */
+            num_push_constants = push_start_align;
+            for (unsigned i = 0; i < chunk_size; i++)
+               push_constant_loc[chunk_start + i] = num_push_constants++;
+         } else {
+            /* We need to pull this one */
+            num_pull_constants = cplx_align_apply(align, num_pull_constants);
+            for (unsigned i = 0; i < chunk_size; i++)
+               pull_constant_loc[chunk_start + i] = num_pull_constants++;
+         }
+
+         /* Reset the chunk and start again */
+         chunk_start = -1;
       }
 
-      /* Reset the chunk and start again */
-      chunk_start = -1;
-   }
+      /* Add the CS local thread ID uniform at the end of the push constants */
+      if (subgroup_id_index >= 0)
+         push_constant_loc[subgroup_id_index] = num_push_constants++;
 
-   /* Add the CS local thread ID uniform at the end of the push constants */
-   if (subgroup_id_index >= 0)
-      push_constant_loc[subgroup_id_index] = num_push_constants++;
+      /* As the uniforms are going to be reordered, stash the old array and
+       * create two new arrays for push/pull params.
+       */
+      uint32_t *param = stage_prog_data->param;
+      stage_prog_data->nr_params = num_push_constants;
+      if (num_push_constants) {
+         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
+                                                num_push_constants);
+      } else {
+         stage_prog_data->param = NULL;
+      }
+      assert(stage_prog_data->nr_pull_params == 0);
+      assert(stage_prog_data->pull_param == NULL);
+      if (num_pull_constants > 0) {
+         stage_prog_data->nr_pull_params = num_pull_constants;
+         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
+                                                     num_pull_constants);
+      }
 
-   /* As the uniforms are going to be reordered, stash the old array and
-    * create two new arrays for push/pull params.
-    */
-   uint32_t *param = stage_prog_data->param;
-   stage_prog_data->nr_params = num_push_constants;
-   if (num_push_constants) {
-      stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
-                                             num_push_constants);
+      /* Up until now, the param[] array has been indexed by reg + offset
+       * of UNIFORM registers.  Move pull constants into pull_param[] and
+       * condense param[] to only contain the uniforms we chose to push.
+       *
+       * NOTE: Because we are condensing the params[] array, we know that
+       * push_constant_loc[i] <= i and we can do it in one smooth loop without
+       * having to make a copy.
+       */
+      for (unsigned int i = 0; i < uniforms; i++) {
+         uint32_t value = param[i];
+         if (pull_constant_loc[i] != -1) {
+            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+         } else if (push_constant_loc[i] != -1) {
+            stage_prog_data->param[push_constant_loc[i]] = value;
+         }
+      }
+      ralloc_free(param);
    } else {
-      stage_prog_data->param = NULL;
-   }
-   assert(stage_prog_data->nr_pull_params == 0);
-   assert(stage_prog_data->pull_param == NULL);
-   if (num_pull_constants > 0) {
-      stage_prog_data->nr_pull_params = num_pull_constants;
-      stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
-                                                  num_pull_constants);
+      /* If we don't want to compact anything, just set up dummy push/pull
+       * arrays.  All the rest of the compiler cares about are these arrays.
+       */
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+      for (unsigned u = 0; u < uniforms; u++)
+         push_constant_loc[u] = u;
+
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
    }
 
    /* Now that we know how many regular uniforms we'll push, reduce the
@@ -2475,24 +2506,6 @@ fs_visitor::assign_constant_locations()
       push_length += range->length;
    }
    assert(push_length <= 64);
-
-   /* Up until now, the param[] array has been indexed by reg + offset
-    * of UNIFORM registers.  Move pull constants into pull_param[] and
-    * condense param[] to only contain the uniforms we chose to push.
-    *
-    * NOTE: Because we are condensing the params[] array, we know that
-    * push_constant_loc[i] <= i and we can do it in one smooth loop without
-    * having to make a copy.
-    */
-   for (unsigned int i = 0; i < uniforms; i++) {
-      uint32_t value = param[i];
-      if (pull_constant_loc[i] != -1) {
-         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
-      } else if (push_constant_loc[i] != -1) {
-         stage_prog_data->param[push_constant_loc[i]] = value;
-      }
-   }
-   ralloc_free(param);
 }
 
 bool
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 9fa946ccdb4..fb4cf365cb4 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -633,6 +633,9 @@ set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
 void
 vec4_visitor::pack_uniform_registers()
 {
+   if (!compiler->compact_params)
+      return;
+
    uint8_t chans_used[this->uniforms];
    int new_loc[this->uniforms];
    int new_chan[this->uniforms];
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 28a652867d9..0fc507c12a8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -560,6 +560,7 @@ anv_physical_device_init(struct anv_physical_device *device,
    device->compiler->constant_buffer_0_is_relative =
       device->info.gen < 8 || !device->has_context_isolation;
    device->compiler->supports_shader_constants = true;
+   device->compiler->compact_params = true;
 
    /* Broadwell PRM says:
     *
author	Jason Ekstrand <[email protected]>	2019-10-31 15:57:52 -0500
committer	Jason Ekstrand <[email protected]>	2019-11-18 18:35:14 +0000
commit	d1c4e64a69e49c64148529024ecb700d18d3c1c8 (patch)
tree	728c7cfc80cfd4ab9c47f8419f7ff09014a1bec5 /src/intel
parent	aecde235198f6c8dccb0d26b6397f1efb5e22bfe (diff)