13 files changed, 193 insertions, 34 deletions
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 30ec3f1ff86..d5c5a32bbc4 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -1526,6 +1526,7 @@ iris_create_tcs_state(struct pipe_context *ctx,
 {
    struct iris_context *ice = (void *) ctx;
    struct iris_screen *screen = (void *) ctx->screen;
+   const struct brw_compiler *compiler = screen->compiler;
    struct iris_uncompiled_shader *ish = iris_create_shader_state(ctx, state);
    struct shader_info *info = &ish->nir->info;
 
@@ -1544,6 +1545,14 @@ iris_create_tcs_state(struct pipe_context *ctx,
          .patch_outputs_written = info->patch_outputs_written,
       };
 
+      /* 8_PATCH mode needs the key to contain the input patch dimensionality.
+       * We don't have that information, so we randomly guess that the input
+       * and output patches are the same size.  This is a bad guess, but we
+       * can't do much better.
+       */
+      if (compiler->use_tcs_8_patch)
+         key.input_vertices = info->tess.tcs_vertices_out;
+
       iris_compile_tcs(ice, ish, &key);
    }
 
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index a9af1cd0dc4..f48bacf77e1 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -3651,6 +3651,11 @@ iris_store_tcs_state(struct iris_context *ice,
       hs.InstanceCount = tcs_prog_data->instances - 1;
       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
       hs.IncludeVertexHandles = true;
+
+#if GEN_GEN >= 9
+      hs.DispatchMode = vue_prog_data->dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
    }
 }
 
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 626ff4bb16a..6a41cd20270 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -99,6 +99,9 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
 
    compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
 
+   compiler->use_tcs_8_patch =
+      devinfo->gen >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH);
+
    if (devinfo->gen >= 10) {
       /* We don't support vec4 mode on Cannonlake. */
       for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 332789d7809..8c6ae35636b 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -93,6 +93,7 @@ struct brw_compiler {
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
 
    bool scalar_stage[MESA_SHADER_STAGES];
+   bool use_tcs_8_patch;
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 
    /**
@@ -1002,6 +1003,9 @@ enum shader_dispatch_mode {
    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
    DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
    DISPATCH_MODE_SIMD8 = 3,
+
+   DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
+   DISPATCH_MODE_TCS_8_PATCH = 2,
 };
 
 /**
@@ -1074,6 +1078,9 @@ struct brw_tcs_prog_data
 {
    struct brw_vue_prog_data base;
 
+   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
+   bool include_primitive_id;
+
    /** Number vertices in output patch */
    int instances;
 };
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9d05800d1fe..9b4e030b54f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1805,7 +1805,7 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
-fs_visitor::assign_tcs_single_patch_urb_setup()
+fs_visitor::assign_tcs_urb_setup()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
@@ -7396,12 +7396,28 @@ void
 fs_visitor::set_tcs_invocation_id()
 {
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
 
    const unsigned instance_id_mask =
       devinfo->gen >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
    const unsigned instance_id_shift =
       devinfo->gen >= 11 ? 16 : 17;
 
+   /* Get instance number from g0.2 bits 22:16 or 23:17 */
+   fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+           brw_imm_ud(instance_id_mask));
+
+   invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
+      /* gl_InvocationID is just the thread number */
+      bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
+      return;
+   }
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
+
    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
@@ -7410,26 +7426,36 @@ fs_visitor::set_tcs_invocation_id()
    if (tcs_prog_data->instances == 1) {
       invocation_id = channels_ud;
    } else {
-      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
-      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
-      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
-              brw_imm_ud(instance_id_mask));
       bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
-
       bld.ADD(invocation_id, instance_times_8, channels_ud);
    }
 }
 
 bool
-fs_visitor::run_tcs_single_patch()
+fs_visitor::run_tcs()
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
 
-   /* r1-r4 contain the ICP handles. */
-   payload.num_regs = 5;
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      /* r1-r4 contain the ICP handles. */
+      payload.num_regs = 5;
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      assert(tcs_key->input_vertices > 0);
+      /* r1 contains output handles, r2 may contain primitive ID, then the
+       * ICP handles occupy the next 1-32 registers.
+       */
+      payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
+                         tcs_key->input_vertices;
+   }
 
    if (shader_time_index >= 0)
       emit_shader_time_begin();
@@ -7438,6 +7464,7 @@ fs_visitor::run_tcs_single_patch()
    set_tcs_invocation_id();
 
    const bool fix_dispatch_mask =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
       (nir->info.tess.tcs_vertices_out % 8) != 0;
 
    /* Fix the disptach mask */
@@ -7455,7 +7482,7 @@ fs_visitor::run_tcs_single_patch()
 
    /* Emit EOT write; set TR DS Cache bit */
    fs_reg srcs[3] = {
-      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(get_tcs_output_urb_handle()),
       fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
       fs_reg(brw_imm_ud(0)),
    };
@@ -7478,7 +7505,7 @@ fs_visitor::run_tcs_single_patch()
    optimize();
 
    assign_curb_setup();
-   assign_tcs_single_patch_urb_setup();
+   assign_tcs_urb_setup();
 
    fixup_3src_null_dest();
    allocate_registers(8, true);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 8ae73401cdf..7db486688af 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -92,7 +92,7 @@ public:
 
    bool run_fs(bool allow_spilling, bool do_rep_send);
    bool run_vs();
-   bool run_tcs_single_patch();
+   bool run_tcs();
    bool run_tes();
    bool run_gs();
    bool run_cs(unsigned min_dispatch_width);
@@ -110,7 +110,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
-   void assign_tcs_single_patch_urb_setup();
+   void assign_tcs_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling, bool spill_all);
@@ -251,6 +251,9 @@ public:
    fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    fs_reg get_tcs_single_patch_icp_handle(const brw::fs_builder &bld,
                                           nir_intrinsic_instr *instr);
+   fs_reg get_tcs_eight_patch_icp_handle(const brw::fs_builder &bld,
+                                         nir_intrinsic_instr *instr);
+   struct brw_reg get_tcs_output_urb_handle();
 
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 73e2f38145e..a2c8f3f557f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2605,6 +2605,73 @@ fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
    return icp_handle;
 }
 
+fs_reg
+fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
+                                           nir_intrinsic_instr *instr)
+{
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   const nir_src &vertex_src = instr->src[0];
+
+   unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
+
+   if (nir_src_is_const(vertex_src)) {
+      return fs_reg(retype(brw_vec8_grf(first_icp_handle +
+                                        nir_src_as_uint(vertex_src), 0),
+                           BRW_REGISTER_TYPE_UD));
+   }
+
+   /* The vertex index is non-constant.  We need to use indirect
+    * addressing to fetch the proper URB handle.
+    *
+    * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+    * indicating that channel <n> should read the handle from
+    * DWord <n>.  We convert that to bytes by multiplying by 4.
+    *
+    * Next, we convert the vertex index to bytes by multiplying
+    * by 32 (shifting by 5), and add the two together.  This is
+    * the final indirect byte offset.
+    */
+   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+   fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+   /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+   bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+   /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+   bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+   /* Convert vertex_index to bytes (multiply by 32) */
+   bld.SHL(vertex_offset_bytes,
+           retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(5u));
+   bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+   /* Use first_icp_handle as the base offset.  There is one register
+    * of URB handles per vertex, so inform the register allocator that
+    * we might read up to nir->info.gs.vertices_in registers.
+    */
+   bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+            retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+            icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
+
+   return icp_handle;
+}
+
+struct brw_reg
+fs_visitor::get_tcs_output_urb_handle()
+{
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
+      return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   } else {
+      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
+      return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+   }
+}
+
 void
 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
@@ -2612,6 +2679,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
    assert(stage == MESA_SHADER_TESS_CTRL);
    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+
+   bool eight_patch =
+      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
 
    fs_reg dst;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -2619,7 +2690,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
 
    switch (instr->intrinsic) {
    case nir_intrinsic_load_primitive_id:
-      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
+                                      : brw_vec1_grf(0, 1)));
       break;
    case nir_intrinsic_load_invocation_id:
       bld.MOV(retype(dst, invocation_id.type), invocation_id);
@@ -2675,7 +2747,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       fs_inst *inst;
 
-      fs_reg icp_handle = get_tcs_single_patch_icp_handle(bld, instr);
+      fs_reg icp_handle =
+         eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
+                     : get_tcs_single_patch_icp_handle(bld, instr);
 
       /* We can only read two double components with each URB read, so
        * we send two read messages in that case, each one loading up to
@@ -2776,12 +2850,15 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned first_component = nir_intrinsic_component(instr);
 
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
+         /* This MOV replicates the output handle to all enabled channels
+          * is SINGLE_PATCH mode.
+          */
          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.MOV(patch_handle,
-                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         bld.MOV(patch_handle, output_handles);
 
          {
             if (first_component != 0) {
@@ -2805,10 +2882,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
-            indirect_offset
-         };
+         const fs_reg srcs[] = { output_handles, indirect_offset };
          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
          if (first_component != 0) {
@@ -2842,8 +2916,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned imm_offset = instr->const_index[0];
       unsigned mask = instr->const_index[1];
       unsigned header_regs = 0;
+      struct brw_reg output_handles = get_tcs_output_urb_handle();
+
       fs_reg srcs[7];
-      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+      srcs[header_regs++] = output_handles;
 
       if (indirect_offset.file != BAD_FILE) {
          srcs[header_regs++] = indirect_offset;
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index f0ef8c0dd96..c37f34cbe81 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -406,10 +406,26 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
+   bool has_primitive_id =
+      nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
+
+   if (compiler->use_tcs_8_patch &&
+       nir->info.tess.tcs_vertices_out <= 16 &&
+       2 + has_primitive_id + key->input_vertices <= 31) {
+      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode.  First,
+       * the "Instance" field limits the number of output vertices to [1, 16].
+       * Secondly, the "Dispatch GRF Start Register for URB Data" field is
+       * limited to [0, 31] - which imposes a limit on the input vertices.
+       */
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
+      prog_data->instances = nir->info.tess.tcs_vertices_out;
+      prog_data->include_primitive_id = has_primitive_id;
+   } else {
+      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_SINGLE_PATCH;
+      prog_data->instances =
+         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+   }
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -462,14 +478,13 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
                    &prog_data->base.base, NULL, nir, 8,
                    shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
+      if (!v.run_tcs()) {
          if (error_str)
             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
          return NULL;
       }
 
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants, false,
diff --git a/src/intel/dev/gen_debug.c b/src/intel/dev/gen_debug.c
index b0d0d1a574a..5fa3d29fa51 100644
--- a/src/intel/dev/gen_debug.c
+++ b/src/intel/dev/gen_debug.c
@@ -86,6 +86,7 @@ static const struct debug_control debug_control[] = {
    { "color",       DEBUG_COLOR },
    { "reemit",      DEBUG_REEMIT },
    { "soft64",      DEBUG_SOFT64 },
+   { "tcs8",        DEBUG_TCS_EIGHT_PATCH },
    { NULL,    0 }
 };
 
diff --git a/src/intel/dev/gen_debug.h b/src/intel/dev/gen_debug.h
index e4dabc67f8d..a6592354a64 100644
--- a/src/intel/dev/gen_debug.h
+++ b/src/intel/dev/gen_debug.h
@@ -84,6 +84,7 @@ extern uint64_t INTEL_DEBUG;
 #define DEBUG_COLOR               (1ull << 40)
 #define DEBUG_REEMIT              (1ull << 41)
 #define DEBUG_SOFT64              (1ull << 42)
+#define DEBUG_TCS_EIGHT_PATCH     (1ull << 43)
 
 /* These flags are not compatible with the disk shader cache */
 #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
@@ -91,7 +92,8 @@ extern uint64_t INTEL_DEBUG;
 /* These flags may affect program generation */
 #define DEBUG_DISK_CACHE_MASK \
    (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 |  DEBUG_SPILL_FS | \
-   DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64)
+   DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \
+   DEBUG_TCS_EIGHT_PATCH)
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 0b58dce05b0..6b64f7ea8c7 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1434,6 +1434,11 @@ emit_3dstate_hs_te_ds(struct anv_pipeline *pipeline,
       hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
       hs.ScratchSpaceBasePointer =
          get_scratch_address(pipeline, MESA_SHADER_TESS_CTRL, tcs_bin);
+
+#if GEN_GEN >= 9
+      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
    }
 
    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 1c4d7988638..1050850bb1c 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -160,6 +160,7 @@ brw_tcs_populate_key(struct brw_context *brw,
                      struct brw_tcs_prog_key *key)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const struct brw_compiler *compiler = brw->screen->compiler;
    struct brw_program *tcp =
       (struct brw_program *) brw->programs[MESA_SHADER_TESS_CTRL];
    struct brw_program *tep =
@@ -177,7 +178,7 @@ brw_tcs_populate_key(struct brw_context *brw,
       per_patch_slots |= prog->info.patch_outputs_written;
    }
 
-   if (devinfo->gen < 8 || !tcp)
+   if (devinfo->gen < 8 || !tcp || compiler->use_tcs_8_patch)
       key->input_vertices = brw->ctx.TessCtrlProgram.patch_vertices;
    key->outputs_written = per_vertex_slots;
    key->patch_outputs_written = per_patch_slots;
@@ -251,7 +252,7 @@ brw_tcs_populate_default_key(const struct brw_compiler *compiler,
    brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
 
    /* Guess that the input and output patches have the same dimensionality. */
-   if (devinfo->gen < 8)
+   if (devinfo->gen < 8 || compiler->use_tcs_8_patch)
       key->input_vertices = prog->info.tess.tcs_vertices_out;
 
    if (tes) {
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index ecffa2e8e86..961306b04fd 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -4111,6 +4111,11 @@ genX(upload_hs_state)(struct brw_context *brw)
          hs.IncludeVertexHandles = true;
 
          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+
+#if GEN_GEN >= 9
+         hs.DispatchMode = vue_prog_data->dispatch_mode;
+         hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
       }
    }
 }