path: root/src/intel/compiler
author    Plamena Manolova <[email protected]>    2018-11-12 06:29:51 -0800
committer Caio Marcelo de Oliveira Filho <[email protected]>    2020-04-09 19:23:12 -0700
commit    c77dc51203a45c8ae82d5a88d3e8fe99c32fc5bc (patch)
tree      9ad38a786b4a7c236e2f3f4ede1d428b1a347529 /src/intel/compiler
parent    c54fc0d07b1a92e065000c1301971b93439595e2 (diff)
intel/compiler: Add support for variable workgroup size
Add new builtin parameters that are used to keep track of the group size.
This will be used to implement ARB_compute_variable_group_size.

The compiler will use the maximum group size supported to pick a suitable
SIMD variant. A later improvement will be to keep all SIMD variants (like
FS does) so the driver can select the best one at dispatch time.

When variable workgroup size is used, the small workgroup optimization is
disabled, as we can't prove at compile time that the barriers won't be
needed.

Extracted from the original i965 patch, with additional changes by Caio
Marcelo de Oliveira Filho.

Reviewed-by: Caio Marcelo de Oliveira Filho <[email protected]>
Reviewed-by: Paulo Zanoni <[email protected]>
Reviewed-by: Jordan Justen <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4504>
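For reference, a minimal standalone sketch of the two decisions this patch
changes: the HW thread count is computed from the maximum supported group
size, and the barrier-elision optimization only fires when the group size
is known at compile time. This is not Mesa API; DIV_ROUND_UP is redefined
locally so the snippet is self-contained.

#include <stdbool.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Thread count for one workgroup: with a variable group size the compiler
 * must assume the maximum, e.g. DIV_ROUND_UP(1024, 32) = 32 SIMD32 threads.
 */
static unsigned
cs_threads(bool variable_size, unsigned max_variable_size,
           const unsigned local_size[3], unsigned simd_size)
{
   const unsigned group_size = variable_size
      ? max_variable_size
      : local_size[0] * local_size[1] * local_size[2];
   return DIV_ROUND_UP(group_size, simd_size);
}

/* A barrier can degenerate to a scheduling fence only when the whole group
 * provably executes within a single SIMD thread; with a variable group
 * size that can't be proven at compile time, so a real barrier is kept.
 */
static bool
barrier_is_noop(bool variable_size, unsigned workgroup_size,
                unsigned dispatch_width)
{
   return !variable_size && workgroup_size <= dispatch_width;
}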
Diffstat (limited to 'src/intel/compiler')
-rw-r--r--  src/intel/compiler/brw_compiler.h                 |  5
-rw-r--r--  src/intel/compiler/brw_fs.cpp                     | 34
-rw-r--r--  src/intel/compiler/brw_fs.h                       |  1
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp                 | 34
-rw-r--r--  src/intel/compiler/brw_nir_lower_cs_intrinsics.c  | 55
5 files changed, 100 insertions, 29 deletions
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 08999e95071..2e34b16dd44 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -615,6 +615,9 @@ enum brw_param_builtin {
BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
BRW_PARAM_BUILTIN_SUBGROUP_ID,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
};
#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
@@ -901,11 +904,13 @@ struct brw_cs_prog_data {
struct brw_stage_prog_data base;
unsigned local_size[3];
+ unsigned max_variable_local_size;
unsigned simd_size;
unsigned threads;
unsigned slm_size;
bool uses_barrier;
bool uses_num_work_groups;
+ bool uses_variable_group_size;
struct {
struct brw_push_const_block cross_thread;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 96fdb6b0992..323fdb56ff5 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1190,6 +1190,8 @@ fs_visitor::import_uniforms(fs_visitor *v)
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->subgroup_id = v->subgroup_id;
+ for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+ this->group_size[i] = v->group_size[i];
}
void
@@ -8866,9 +8868,16 @@ static void
cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
{
cs_prog_data->simd_size = size;
- unsigned group_size = cs_prog_data->local_size[0] *
- cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
- cs_prog_data->threads = (group_size + size - 1) / size;
+
+ unsigned group_size;
+ if (cs_prog_data->uses_variable_group_size) {
+ group_size = cs_prog_data->max_variable_local_size;
+ } else {
+ group_size = cs_prog_data->local_size[0] *
+ cs_prog_data->local_size[1] *
+ cs_prog_data->local_size[2];
+ }
+ cs_prog_data->threads = DIV_ROUND_UP(group_size, size);
}
static nir_shader *
@@ -8903,13 +8912,20 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
char **error_str)
{
prog_data->base.total_shared = src_shader->info.cs.shared_size;
- prog_data->local_size[0] = src_shader->info.cs.local_size[0];
- prog_data->local_size[1] = src_shader->info.cs.local_size[1];
- prog_data->local_size[2] = src_shader->info.cs.local_size[2];
prog_data->slm_size = src_shader->num_shared;
- unsigned local_workgroup_size =
- src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
- src_shader->info.cs.local_size[2];
+
+ unsigned local_workgroup_size;
+ if (prog_data->uses_variable_group_size) {
+ prog_data->max_variable_local_size =
+ src_shader->info.cs.max_variable_local_size;
+ local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+ } else {
+ prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+ prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+ prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+ local_workgroup_size = src_shader->info.cs.local_size[0] *
+ src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+ }
/* Limit max_threads to 64 for the GPGPU_WALKER command */
const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index c09c4eb8759..f2612968f25 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -370,6 +370,7 @@ public:
int *push_constant_loc;
fs_reg subgroup_id;
+ fs_reg group_size[3];
fs_reg scratch_base;
fs_reg frag_depth;
fs_reg frag_stencil;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f1d17a322e9..a038db72daa 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -101,11 +101,23 @@ fs_visitor::nir_setup_uniforms()
uniforms = nir->num_uniforms / 4;
if (stage == MESA_SHADER_COMPUTE) {
- /* Add a uniform for the thread local id. It must be the last uniform
- * on the list.
- */
+ /* Add uniforms for builtins after regular NIR uniforms. */
assert(uniforms == prog_data->nr_params);
- uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+ uint32_t *param;
+ if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+ param = brw_stage_prog_data_add_params(prog_data, 3);
+ for (unsigned i = 0; i < 3; i++) {
+ param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+ group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+ }
+ }
+
+ /* Subgroup ID must be the last uniform on the list. This will make
+ * it easier later to split between cross thread and per thread
+ * uniforms.
+ */
+ param = brw_stage_prog_data_add_params(prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
}
@@ -3814,7 +3826,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
* invocations are already executed lock-step. Instead of an actual
* barrier just emit a scheduling fence, that will generate no code.
*/
- if (workgroup_size() <= dispatch_width) {
+ if (!cs_prog_data->uses_variable_group_size &&
+ workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
@@ -3949,6 +3962,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
break;
}
+ case nir_intrinsic_load_local_group_size: {
+ for (unsigned i = 0; i < 3; i++) {
+ bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+ group_size[i]);
+ }
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
@@ -4337,7 +4358,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
*
* TODO: Check if applies for many HW threads sharing same Data Port.
*/
- if (slm_fence && workgroup_size() <= dispatch_width)
+ if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+ slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
/* Prior to Gen11, there's only L3 fence, so emit that instead. */
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 434ad005281..2393011312c 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -72,8 +72,16 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_ssa_def *channel = nir_load_subgroup_invocation(b);
nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
- nir_ssa_def *size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
- nir_ssa_def *size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+ nir_ssa_def *size_x;
+ nir_ssa_def *size_y;
+ if (state->nir->info.cs.local_size_variable) {
+ nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+ size_x = nir_channel(b, size_xyz, 0);
+ size_y = nir_channel(b, size_xyz, 1);
+ } else {
+ size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
+ size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+ }
/* The local invocation index and ID must respect the following
*
@@ -152,12 +160,26 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
break;
case nir_intrinsic_load_num_subgroups: {
- unsigned local_workgroup_size =
- nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
- nir->info.cs.local_size[2];
- unsigned num_subgroups =
- DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
- sysval = nir_imm_int(b, num_subgroups);
+ if (state->nir->info.cs.local_size_variable) {
+ nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+ nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
+ nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
+ nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
+ nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
+
+ /* Calculate the equivalent of DIV_ROUND_UP. */
+ sysval = nir_idiv(b,
+ nir_iadd_imm(b,
+ nir_iadd_imm(b, size, state->dispatch_width), -1),
+ nir_imm_int(b, state->dispatch_width));
+ } else {
+ unsigned local_workgroup_size =
+ nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+ nir->info.cs.local_size[2];
+ unsigned num_subgroups =
+ DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+ sysval = nir_imm_int(b, num_subgroups);
+ }
break;
}
@@ -198,16 +220,21 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir,
.dispatch_width = dispatch_width,
};
- assert(!nir->info.cs.local_size_variable);
- state.local_workgroup_size = nir->info.cs.local_size[0] *
- nir->info.cs.local_size[1] *
- nir->info.cs.local_size[2];
+ if (!nir->info.cs.local_size_variable) {
+ state.local_workgroup_size = nir->info.cs.local_size[0] *
+ nir->info.cs.local_size[1] *
+ nir->info.cs.local_size[2];
+ } else {
+ state.local_workgroup_size = nir->info.cs.max_variable_local_size;
+ }
/* Constraints from NV_compute_shader_derivatives. */
- if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+ if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+ !nir->info.cs.local_size_variable) {
assert(nir->info.cs.local_size[0] % 2 == 0);
assert(nir->info.cs.local_size[1] % 2 == 0);
- } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
+ } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR &&
+ !nir->info.cs.local_size_variable) {
assert(state.local_workgroup_size % 4 == 0);
}
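As a usage note, the three new builtin parameters are what let the driver
defer the actual group size to dispatch time. A hypothetical driver-side
sketch of filling those push-constant slots follows; the helper name and
the param_values layout are illustrative only, not part of this patch, and
brw_compiler.h is assumed to be included for brw_stage_prog_data and the
enum values.

#include <stdint.h>

/* Hypothetical sketch: scan the prog_data param list and fill the slots
 * for the new builtins with the group size chosen at dispatch time. Only
 * the three cases added by this patch are handled here; a real driver
 * fills the other builtins as well.
 */
static void
fill_variable_group_size_params(const struct brw_stage_prog_data *prog_data,
                                const uint32_t group_size[3],
                                uint32_t *param_values)
{
   for (unsigned i = 0; i < prog_data->nr_params; i++) {
      switch (prog_data->param[i]) {
      case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X:
         param_values[i] = group_size[0];
         break;
      case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y:
         param_values[i] = group_size[1];
         break;
      case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z:
         param_values[i] = group_size[2];
         break;
      default:
         break; /* other builtins are handled elsewhere */
      }
   }
}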