Diffstat (limited to 'src/intel')
-rw-r--r--  src/intel/compiler/brw_compiler.h                13
-rw-r--r--  src/intel/compiler/brw_fs.cpp                    43
-rw-r--r--  src/intel/vulkan/anv_nir_compute_push_layout.c  190
-rw-r--r--  src/intel/vulkan/anv_private.h                    4
-rw-r--r--  src/intel/vulkan/genX_cmd_buffer.c               25
5 files changed, 131 insertions(+), 144 deletions(-)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 62c7e85e55f..ab39af22684 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -657,6 +657,19 @@ struct brw_stage_prog_data {
GLuint nr_params; /**< number of float params/constants */
GLuint nr_pull_params;
+ /* zero_push_reg is a bitfield which indicates what push registers (if any)
+ * should be zeroed by SW at the start of the shader. The corresponding
+ * push_reg_mask_param specifies the param index (in 32-bit units) where
+ * the actual runtime 64-bit mask will be pushed. The shader will zero
+ * push reg i if
+ *
+ * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
+ *
+ * If this field is set, brw_compiler::compact_params must be false.
+ */
+ uint64_t zero_push_reg;
+ unsigned push_reg_mask_param;
+
unsigned curb_read_length;
unsigned total_scratch;
unsigned total_shared;
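
The condition in the comment above reads naturally as scalar C. A minimal
sketch of which push registers end up being zeroed (illustration only, not
part of the change; the helper name regs_to_zero is made up):

   #include <stdint.h>

   /* Register i is cleared when the shader actually reads it (reg_used),
    * the compiler marked it as zeroable (zero_push_reg), and the runtime
    * mask pushed at push_reg_mask_param says it is not backed by enough
    * bound buffer.
    */
   static uint64_t
   regs_to_zero(uint64_t reg_used, uint64_t zero_push_reg,
                uint64_t push_reg_mask)
   {
      return reg_used & zero_push_reg & ~push_reg_mask;
   }
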
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4e13dcca54a..b578d82a252 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1604,6 +1604,8 @@ fs_visitor::assign_curb_setup()
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
+ uint64_t used = 0;
+
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_block_and_inst(block, fs_inst, inst, cfg) {
for (unsigned int i = 0; i < inst->sources; i++) {
@@ -1625,6 +1627,9 @@ fs_visitor::assign_curb_setup()
constant_nr = 0;
}
+ assert(constant_nr / 8 < 64);
+ used |= BITFIELD64_BIT(constant_nr / 8);
+
struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
constant_nr / 8,
constant_nr % 8);
@@ -1639,6 +1644,44 @@ fs_visitor::assign_curb_setup()
}
}
+ uint64_t want_zero = used & stage_prog_data->zero_push_reg;
+ if (want_zero) {
+ assert(!compiler->compact_params);
+ fs_builder ubld = bld.exec_all().group(8, 0).at(
+ cfg->first_block(), cfg->first_block()->start());
+
+ /* push_reg_mask_param is in 32-bit units */
+ unsigned mask_param = stage_prog_data->push_reg_mask_param;
+ struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
+ mask_param % 8);
+
+ fs_reg b32;
+ for (unsigned i = 0; i < 64; i++) {
+ if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
+ fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
+ ubld.SHL(horiz_offset(shifted, 8),
+ byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
+ brw_imm_v(0x01234567));
+ ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
+
+ fs_builder ubld16 = ubld.group(16, 0);
+ b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
+ ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
+ }
+
+ if (want_zero & BITFIELD64_BIT(i)) {
+ assert(i < prog_data->curb_read_length);
+ struct brw_reg push_reg =
+ retype(brw_vec8_grf(payload.num_regs + i, 0),
+ BRW_REGISTER_TYPE_D);
+
+ ubld.AND(push_reg, push_reg, component(b32, i % 16));
+ }
+ }
+
+ invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+ }
+
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
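
The SHL/SHL/ASR sequence above expands one 16-bit word of the runtime mask
into sixteen per-channel dword masks (0 or ~0) which the final AND applies
to each push register. A scalar model of that expansion, purely for
illustration (the two-step shift in the real code is presumably because the
packed vector immediate 0x01234567 only supplies small per-channel shift
counts, so a uniform extra shift by 8 finishes the job):

   #include <stdint.h>

   /* Bit c of the mask word is moved into the sign bit of a 16-bit value
    * (the two SHLs combined shift by 15 - c), then an arithmetic shift
    * right by 15 smears that bit across the whole channel.
    */
   static void
   expand_mask_word(uint16_t mask_word, uint32_t b32[16])
   {
      for (unsigned c = 0; c < 16; c++) {
         int16_t shifted = (int16_t)(mask_word << (15 - c));
         b32[c] = (uint32_t)(int32_t)(shifted >> 15); /* 0 or 0xffffffff */
      }
   }

Push register i is then ANDed with channel i % 16 of the expanded mask, so
registers whose mask bit is clear read back as zero.
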
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index 960fb1ca4a3..9fcc2f74e22 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -80,13 +80,15 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (push_ubo_ranges && robust_buffer_access) {
/* We can't on-the-fly adjust our push ranges because doing so would
* mess up the layout in the shader. When robustBufferAccess is
- * enabled, we have to manually bounds check our pushed UBO accesses.
+ * enabled, we push a mask into the shader indicating which pushed
+ * registers are valid and we zero out the invalid ones at the top of
+ * the shader.
*/
- const uint32_t ubo_size_start =
- offsetof(struct anv_push_constants, push_ubo_sizes);
- const uint32_t ubo_size_end = ubo_size_start + (4 * sizeof(uint32_t));
- push_start = MIN2(push_start, ubo_size_start);
- push_end = MAX2(push_end, ubo_size_end);
+ const uint32_t push_reg_mask_start =
+ offsetof(struct anv_push_constants, push_reg_mask);
+ const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t);
+ push_start = MIN2(push_start, push_reg_mask_start);
+ push_end = MAX2(push_end, push_reg_mask_end);
}
if (nir->info.stage == MESA_SHADER_COMPUTE) {
@@ -121,8 +123,32 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
.length = DIV_ROUND_UP(push_end - push_start, 32),
};
- /* Mapping from brw_ubo_range to anv_push_range */
- int push_range_idx_map[4] = { -1, -1, -1, -1 };
+ if (has_push_intrinsic) {
+ nir_foreach_function(function, nir) {
+ if (!function->impl)
+ continue;
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_push_constant:
+ intrin->intrinsic = nir_intrinsic_load_uniform;
+ nir_intrinsic_set_base(intrin,
+ nir_intrinsic_base(intrin) -
+ push_start);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ }
+ }
if (push_ubo_ranges) {
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
@@ -144,6 +170,16 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (push_constant_range.length > 0)
map->push_ranges[n++] = push_constant_range;
+ if (robust_buffer_access) {
+ const uint32_t push_reg_mask_offset =
+ offsetof(struct anv_push_constants, push_reg_mask);
+ assert(push_reg_mask_offset >= push_start);
+ prog_data->push_reg_mask_param =
+ (push_reg_mask_offset - push_start) / 4;
+ }
+
+ unsigned range_start_reg = push_constant_range.length;
+
for (int i = 0; i < 4; i++) {
struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
if (ubo_range->length == 0)
@@ -157,7 +193,6 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
const struct anv_pipeline_binding *binding =
&map->surface_to_descriptor[ubo_range->block];
- push_range_idx_map[i] = n;
map->push_ranges[n++] = (struct anv_push_range) {
.set = binding->set,
.index = binding->index,
@@ -165,6 +200,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
.start = ubo_range->start,
.length = ubo_range->length,
};
+
+ /* We only bother to shader-zero pushed client UBOs */
+ if (binding->set < MAX_SETS && robust_buffer_access) {
+ prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
+ ubo_range->length);
+ }
+
+ range_start_reg += ubo_range->length;
}
} else {
/* For Ivy Bridge, the push constants packets have a different
@@ -178,135 +221,6 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
map->push_ranges[0] = push_constant_range;
}
- if (has_push_intrinsic || (push_ubo_ranges && robust_buffer_access)) {
- nir_foreach_function(function, nir) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- switch (intrin->intrinsic) {
- case nir_intrinsic_load_ubo: {
- if (!robust_buffer_access)
- break;
-
- if (!nir_src_is_const(intrin->src[0]) ||
- !nir_src_is_const(intrin->src[1]))
- break;
-
- uint32_t index = nir_src_as_uint(intrin->src[0]);
- uint64_t offset = nir_src_as_uint(intrin->src[1]);
- uint32_t size = intrin->num_components *
- (intrin->dest.ssa.bit_size / 8);
-
- int ubo_range_idx = -1;
- for (unsigned i = 0; i < 4; i++) {
- const struct brw_ubo_range *range =
- &prog_data->ubo_ranges[i];
- if (range->block == index &&
- offset + size > range->start * 32 &&
- offset < (range->start + range->length) * 32) {
- ubo_range_idx = i;
- break;
- }
- }
-
- if (ubo_range_idx < 0)
- break;
-
- b.cursor = nir_after_instr(&intrin->instr);
-
- assert(push_range_idx_map[ubo_range_idx] >= 0);
- const uint32_t ubo_size_offset =
- offsetof(struct anv_push_constants, push_ubo_sizes) +
- push_range_idx_map[ubo_range_idx] * sizeof(uint32_t);
-
- nir_intrinsic_instr *load_size =
- nir_intrinsic_instr_create(b.shader,
- nir_intrinsic_load_uniform);
- load_size->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
- nir_intrinsic_set_base(load_size,
- ubo_size_offset - push_start);
- nir_intrinsic_set_range(load_size, 4);
- nir_intrinsic_set_type(load_size, nir_type_uint32);
- load_size->num_components = 1;
- nir_ssa_dest_init(&load_size->instr, &load_size->dest,
- 1, 32, NULL);
- nir_builder_instr_insert(&b, &load_size->instr);
-
- /* Do the size checks per-component. Thanks to scalar block
- * layout, we could end up with a single vector straddling a
- * 32B boundary.
- *
- * We intentionally push a size starting from the UBO
- * binding in the descriptor set rather than starting from
- * the start of the pushed range. This prevents us from
- * accidentally flagging things as out-of-bounds due to
- * roll-over if a vector access crosses the push range
- * boundary.
- *
- * We align up to 32B so that we can get better CSE.
- *
- * We check
- *
- * offset + size - 1 < push_ubo_sizes[i]
- *
- * rather than
- *
- * offset + size <= push_ubo_sizes[i]
- *
- * because it properly returns OOB for the case where
- * offset + size == 0.
- */
- nir_const_value last_byte_const[NIR_MAX_VEC_COMPONENTS];
- for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) {
- assert(intrin->dest.ssa.bit_size % 8 == 0);
- const unsigned comp_size_B = intrin->dest.ssa.bit_size / 8;
- const uint32_t comp_last_byte =
- align_u32(offset + (c + 1) * comp_size_B,
- ANV_UBO_BOUNDS_CHECK_ALIGNMENT) - 1;
- last_byte_const[c] =
- nir_const_value_for_uint(comp_last_byte, 32);
- }
- nir_ssa_def *last_byte =
- nir_build_imm(&b, intrin->dest.ssa.num_components, 32,
- last_byte_const);
- nir_ssa_def *in_bounds =
- nir_ult(&b, last_byte, &load_size->dest.ssa);
-
- nir_ssa_def *zero =
- nir_imm_zero(&b, intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size);
- nir_ssa_def *value =
- nir_bcsel(&b, in_bounds, &intrin->dest.ssa, zero);
- nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa,
- nir_src_for_ssa(value),
- value->parent_instr);
- break;
- }
-
- case nir_intrinsic_load_push_constant:
- intrin->intrinsic = nir_intrinsic_load_uniform;
- nir_intrinsic_set_base(intrin,
- nir_intrinsic_base(intrin) -
- push_start);
- break;
-
- default:
- break;
- }
- }
- }
- }
- }
-
/* Now that we're done computing the push constant portion of the
* bind map, hash it. This lets us quickly determine if the actual
* mapping has changed and not just a no-op pipeline change.
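
To make the layout bookkeeping above concrete, here is a hypothetical worked
example (not driver code; the numbers and the simplified BITFIELD64_RANGE
are illustrative): a stage that pushes 2 registers of anv_push_constants
followed by client UBO ranges of 3 and 4 registers.

   #include <inttypes.h>
   #include <stdio.h>

   /* Simplified stand-in for Mesa's BITFIELD64_RANGE (no count == 64 case). */
   #define BITFIELD64_RANGE(start, count) ((((1ull << (count)) - 1)) << (start))

   int
   main(void)
   {
      uint64_t zero_push_reg = 0;
      unsigned range_start_reg = 2;             /* push-constant range length */
      const unsigned ubo_lengths[] = { 3, 4 };  /* two pushed client UBO ranges */

      for (unsigned i = 0; i < 2; i++) {
         /* Only client UBO registers are candidates for shader-zeroing. */
         zero_push_reg |= BITFIELD64_RANGE(range_start_reg, ubo_lengths[i]);
         range_start_reg += ubo_lengths[i];
      }

      /* Registers 2..4 and 5..8 are zeroable: prints 0x1fc. */
      printf("zero_push_reg = 0x%" PRIx64 "\n", zero_push_reg);
      return 0;
   }

The matching push_reg_mask_param is just the dword offset of push_reg_mask
within the pushed range, which is why it is computed as
(push_reg_mask_offset - push_start) / 4 above.
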
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index af07a1d203a..20ed281d062 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2477,8 +2477,10 @@ struct anv_push_constants {
/** Dynamic offsets for dynamic UBOs and SSBOs */
uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
+ uint64_t push_reg_mask;
+
/** Pad out to a multiple of 32 bytes */
- uint32_t push_ubo_sizes[4];
+ uint32_t pad[2];
struct {
/** Base workgroup ID
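
The replacement above is size-neutral: the four uint32_t sizes it removes
and the new uint64_t mask plus two uint32_t of padding both occupy 16 bytes.
A standalone sketch of that equivalence (illustrative only; it assumes the
mask lands at an 8-byte-aligned offset in the real struct):

   #include <assert.h>
   #include <stdint.h>

   struct old_tail { uint32_t push_ubo_sizes[4]; };              /* 16 bytes */
   struct new_tail { uint64_t push_reg_mask; uint32_t pad[2]; }; /* 16 bytes */

   static_assert(sizeof(struct old_tail) == sizeof(struct new_tail),
                 "swapping the fields must not change the layout");
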
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 13ad1ced3bc..4375b99f873 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -3152,16 +3152,31 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
&cmd_buffer->state.push_constants[stage];
if (cmd_buffer->device->robust_buffer_access) {
+ push->push_reg_mask = 0;
+ /* Start of the current range in the shader, relative to the start
+ * of push constants in the shader.
+ */
+ unsigned range_start_reg = 0;
for (unsigned i = 0; i < 4; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0) {
- push->push_ubo_sizes[i] = 0;
- } else {
- push->push_ubo_sizes[i] =
- get_push_range_bound_size(cmd_buffer, stage, range);
+ if (range->length == 0)
+ continue;
+
+ unsigned bound_size =
+ get_push_range_bound_size(cmd_buffer, stage, range);
+ if (bound_size >= range->start * 32) {
+ unsigned bound_regs =
+ MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
+ range->length);
+ assert(range_start_reg + bound_regs <= 64);
+ push->push_reg_mask |= BITFIELD64_RANGE(range_start_reg,
+ bound_regs);
}
+
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
+
+ range_start_reg += range->length;
}
}
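
A worked example of the bound_regs computation above, with hypothetical
numbers (not driver code): a pushed UBO range that starts 2 registers (64 B)
into the buffer and is 4 registers long, with only 100 bytes of the buffer
actually bound.

   #include <stdio.h>

   #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
   #define MIN2(a, b)          ((a) < (b) ? (a) : (b))

   int
   main(void)
   {
      const unsigned range_start  = 2;    /* in 32-byte registers */
      const unsigned range_length = 4;    /* in 32-byte registers */
      const unsigned bound_size   = 100;  /* bytes bound to the descriptor */

      if (bound_size >= range_start * 32) {
         unsigned bound_regs =
            MIN2(DIV_ROUND_UP(bound_size, 32) - range_start, range_length);
         /* DIV_ROUND_UP(100, 32) = 4, minus range_start 2: the first 2
          * registers of the range are at least partially backed; the
          * remaining 2 stay clear in push_reg_mask and get zeroed by the
          * shader at run time.
          */
         printf("bound_regs = %u\n", bound_regs);  /* prints 2 */
      }
      return 0;
   }
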