intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8

Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <[email protected]>
author: Kenneth Graunke <[email protected]> 2019-05-03 14:57:54 -0700
committer: Kenneth Graunke <[email protected]> 2019-05-14 13:16:30 -0700
commit: 646924cfa1bb51c9091a8cf0079f00759cbea372 (patch)
tree: e69a694db5d38d01cc4ba05a659a3729cddffc88 /src/intel/compiler
parent: 076159b40b96096ba01413abc011a26c9acf7176 (diff)
6 files changed, 162 insertions, 31 deletions
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 626ff4bb16a..6a41cd20270 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -99,6 +99,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
 
    compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
 
+   compiler->use_tcs_8_patch =
+      devinfo->gen >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH);
+
    if (devinfo->gen >= 10) {
       /* We don't support vec4 mode on Cannonlake. */
       for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 332789d7809..8c6ae35636b 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -93,6 +93,7 @@ struct brw_compiler {
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
 
    bool scalar_stage[MESA_SHADER_STAGES];
+   bool use_tcs_8_patch;
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 
    /**
@@ -1002,6 +1003,9 @@ enum shader_dispatch_mode {
    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
    DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
    DISPATCH_MODE_SIMD8 = 3,
+
+   DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
+   DISPATCH_MODE_TCS_8_PATCH = 2,
 };
 
 /**
@@ -1074,6 +1078,9 @@ struct brw_tcs_prog_data
 {
    struct brw_vue_prog_data base;
 
+   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
+   bool include_primitive_id;
+
    /** Number vertices in output patch */
    int instances;
 };
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9d05800d1fe..9b4e030b54f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1805,7 +1805,7 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
-fs_visitor::assign_tcs_single_patch_urb_setup()
+fs_visitor::assign_tcs_urb_setup()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
@@ -7396,12 +7396,28 @@ void
 fs_visitor::set_tcs_invocation_id()
 {
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
 
    const unsigned instance_id_mask =
       devinfo->gen >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
    const unsigned instance_id_shift =
       devinfo->gen >= 11 ? 16 : 17;
 
+   /* Get instance number from g0.2 bits 22:16 or 23:17 */
+   fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+           brw_imm_ud(instance_id_mask));
+
+   invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
+      /* gl_InvocationID is just the thread number */
+      bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
+      return;
+   }
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
+
    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
@@ -7410,26 +7426,36 @@ fs_visitor::set_tcs_invocation_id()
    if (tcs_prog_data->instances == 1) {
       invocation_id = channels_ud;
    } else {
-      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
-      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
-      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
-              brw_imm_ud(instance_id_mask));
       bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
-
       bld.ADD(invocation_id, instance_times_8, channels_ud);
    }
 }
 
 bool
-fs_visitor::run_tcs_single_patch()
+fs_visitor::run_tcs()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
-   /* r1-r4 contain the ICP handles. */
-   payload.num_regs = 5;
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      /* r1-r4 contain the ICP handles. */
+      payload.num_regs = 5;
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      assert(tcs_key->input_vertices > 0);
+      /* r1 contains output handles, r2 may contain primitive ID, then the
+       * ICP handles occupy the next 1-32 registers.
+       */
+      payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
+                         tcs_key->input_vertices;
+   }
 
    if (shader_time_index >= 0)
       emit_shader_time_begin();
@@ -7438,6 +7464,7 @@ fs_visitor::run_tcs_single_patch()
    set_tcs_invocation_id();
 
    const bool fix_dispatch_mask =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
       (nir->info.tess.tcs_vertices_out % 8) != 0;
 
    /* Fix the disptach mask */
@@ -7455,7 +7482,7 @@ fs_visitor::run_tcs_single_patch()
 
    /* Emit EOT write; set TR DS Cache bit */
    fs_reg srcs[3] = {
-      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(get_tcs_output_urb_handle()),
       fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
       fs_reg(brw_imm_ud(0)),
    };
@@ -7478,7 +7505,7 @@ fs_visitor::run_tcs_single_patch()
    optimize();
 
    assign_curb_setup();
-   assign_tcs_single_patch_urb_setup();
+   assign_tcs_urb_setup();
 
    fixup_3src_null_dest();
    allocate_registers(8, true);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 8ae73401cdf..7db486688af 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -92,7 +92,7 @@ public:
 
    bool run_fs(bool allow_spilling, bool do_rep_send);
    bool run_vs();
-   bool run_tcs_single_patch();
+   bool run_tcs();
    bool run_tes();
    bool run_gs();
    bool run_cs(unsigned min_dispatch_width);
@@ -110,7 +110,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
-   void assign_tcs_single_patch_urb_setup();
+   void assign_tcs_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling, bool spill_all);
@@ -251,6 +251,9 @@ public:
    fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    fs_reg get_tcs_single_patch_icp_handle(const brw::fs_builder &bld,
                                           nir_intrinsic_instr *instr);
+   fs_reg get_tcs_eight_patch_icp_handle(const brw::fs_builder &bld,
+                                         nir_intrinsic_instr *instr);
+   struct brw_reg get_tcs_output_urb_handle();
 
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 73e2f38145e..a2c8f3f557f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2605,6 +2605,73 @@ fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
    return icp_handle;
 }
 
+fs_reg
+fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
+                                           nir_intrinsic_instr *instr)
+{
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   const nir_src &vertex_src = instr->src[0];
+
+   unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
+
+   if (nir_src_is_const(vertex_src)) {
+      return fs_reg(retype(brw_vec8_grf(first_icp_handle +
+                                        nir_src_as_uint(vertex_src), 0),
+                           BRW_REGISTER_TYPE_UD));
+   }
+
+   /* The vertex index is non-constant.  We need to use indirect
+    * addressing to fetch the proper URB handle.
+    *
+    * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+    * indicating that channel <n> should read the handle from
+    * DWord <n>.  We convert that to bytes by multiplying by 4.
+    *
+    * Next, we convert the vertex index to bytes by multiplying
+    * by 32 (shifting by 5), and add the two together.  This is
+    * the final indirect byte offset.
+    */
+   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+   fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+   /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+   bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+   /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+   bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+   /* Convert vertex_index to bytes (multiply by 32) */
+   bld.SHL(vertex_offset_bytes,
+           retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(5u));
+   bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+   /* Use first_icp_handle as the base offset.  There is one register
+    * of URB handles per vertex, so inform the register allocator that
+    * we might read up to nir->info.gs.vertices_in registers.
+    */
+   bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+            retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+            icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
+
+   return icp_handle;
+}
+
+struct brw_reg
+fs_visitor::get_tcs_output_urb_handle()
+{
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+   }
+}
+
 void
 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
@@ -2612,6 +2679,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
    assert(stage == MESA_SHADER_TESS_CTRL);
    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+
+   bool eight_patch =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
 
    fs_reg dst;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -2619,7 +2690,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
 
    switch (instr->intrinsic) {
    case nir_intrinsic_load_primitive_id:
-      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
+                                      : brw_vec1_grf(0, 1)));
       break;
    case nir_intrinsic_load_invocation_id:
       bld.MOV(retype(dst, invocation_id.type), invocation_id);
@@ -2675,7 +2747,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       fs_inst *inst;
 
-      fs_reg icp_handle = get_tcs_single_patch_icp_handle(bld, instr);
+      fs_reg icp_handle =
+         eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
+                     : get_tcs_single_patch_icp_handle(bld, instr);
 
       /* We can only read two double components with each URB read, so
        * we send two read messages in that case, each one loading up to
@@ -2776,12 +2850,15 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned first_component = nir_intrinsic_component(instr);
 
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
+         /* This MOV replicates the output handle to all enabled channels
+          * is SINGLE_PATCH mode.
+          */
          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.MOV(patch_handle,
-                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         bld.MOV(patch_handle, output_handles);
 
          {
             if (first_component != 0) {
@@ -2805,10 +2882,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
-            indirect_offset
-         };
+         const fs_reg srcs[] = { output_handles, indirect_offset };
          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
          if (first_component != 0) {
@@ -2842,8 +2916,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned mask = instr->const_index[1];
       unsigned header_regs = 0;
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_reg srcs[7];
-      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+      srcs[header_regs++] = output_handles;
 
       if (indirect_offset.file != BAD_FILE) {
          srcs[header_regs++] = indirect_offset;
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index f0ef8c0dd96..c37f34cbe81 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -406,10 +406,26 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
+   bool has_primitive_id =
+      nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
+
+   if (compiler->use_tcs_8_patch &&
+       nir->info.tess.tcs_vertices_out <= 16 &&
+       2 + has_primitive_id + key->input_vertices <= 31) {
+      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode.  First,
+       * the "Instance" field limits the number of output vertices to [1, 16].
+       * Secondly, the "Dispatch GRF Start Register for URB Data" field is
+       * limited to [0, 31] - which imposes a limit on the input vertices.
+       */
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
+      prog_data->instances = nir->info.tess.tcs_vertices_out;
+      prog_data->include_primitive_id = has_primitive_id;
+   } else {
+      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_SINGLE_PATCH;
+      prog_data->instances =
+         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+   }
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -462,14 +478,13 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
                    &prog_data->base.base, NULL, nir, 8,
                    shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
+      if (!v.run_tcs()) {
          if (error_str)
             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
          return NULL;
       }
 
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants, false,
author	Kenneth Graunke <[email protected]>	2019-05-03 14:57:54 -0700
committer	Kenneth Graunke <[email protected]>	2019-05-14 13:16:30 -0700
commit	646924cfa1bb51c9091a8cf0079f00759cbea372 (patch)
tree	e69a694db5d38d01cc4ba05a659a3729cddffc88 /src/intel/compiler
parent	076159b40b96096ba01413abc011a26c9acf7176 (diff)