4 files changed, 90 insertions, 38 deletions
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 73a5a9a0697..1b50b2cc1f1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -106,6 +106,7 @@ struct radv_shader_context {
 	uint64_t tcs_outputs_read;
 	uint32_t tcs_vertices_per_patch;
 	uint32_t tcs_num_inputs;
+	uint32_t tcs_num_patches;
 };
 
 enum radeon_llvm_calling_convention {
@@ -136,6 +137,46 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
 	}
 }
 
+/* Number of patches per LS-HS threadgroup, bounded by wave occupancy, LDS size and the off-chip buffer. */
+static unsigned get_tcs_num_patches(struct radv_shader_context *ctx)
+{
+	unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
+	unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
+	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
+	uint32_t input_patch_size = num_tcs_input_cp * input_vertex_size;
+	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
+	uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
+	uint32_t output_vertex_size = num_tcs_outputs * 16;
+	uint32_t pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+	uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+	uint32_t lds_patch_size = input_patch_size + output_patch_size;
+	unsigned hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
+	unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+
+	/* Ensure that we only need one wave per SIMD so we don't need to check
+	 * resource usage. Also ensures that the number of tcs in and out
+	 * vertices per threadgroup are at most 256.
+	 */
+	unsigned num_patches = one_wave * 4;
+	/* Make sure that the data fits in LDS. This assumes the shaders only
+	 * use LDS for the inputs and outputs. A shader without inputs and
+	 * outputs has a zero patch size: skip the clamp, don't divide by 0.
+	 */
+	if (lds_patch_size)
+		num_patches = MIN2(num_patches, hardware_lds_size / lds_patch_size);
+	/* Make sure the output data fits in the offchip buffer (same zero-size guard). */
+	if (output_patch_size)
+		num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
+	/* Not necessary for correctness, but improves performance. The
+	 * specific value is taken from the proprietary driver.
+	 */
+	num_patches = MIN2(num_patches, 40);
+	/* SI bug workaround - limit LS-HS threadgroups to only one wave. */
+	if (ctx->options->chip_class == SI)
+		num_patches = MIN2(num_patches, one_wave);
+	return num_patches;
+}
+
 /* Tessellation shaders pass outputs to the next shader using LDS.
  *
  * LS outputs = TCS inputs
@@ -195,17 +236,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
 	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 	uint32_t output_patch0_offset = input_patch_size;
-	LLVMValueRef num_patches = ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
+	unsigned num_patches = ctx->tcs_num_patches;
 
+	output_patch0_offset *= num_patches;
 	output_patch0_offset /= 4;
-	return LLVMBuildMul(ctx->ac.builder,
-			    num_patches,
-			    LLVMConstInt(ctx->ac.i32, output_patch0_offset, false), "");
+	return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 {
+	assert (ctx->stage == MESA_SHADER_TESS_CTRL);
 	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 	uint32_t output_patch0_offset = input_patch_size;
@@ -213,15 +254,12 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 	uint32_t output_vertex_size = num_tcs_outputs * 16;
 	uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
-	LLVMValueRef num_patches = ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
+	unsigned num_patches = ctx->tcs_num_patches;
 
+	output_patch0_offset *= num_patches;
+	output_patch0_offset += pervertex_output_patch_size;
 	output_patch0_offset /= 4;
-	LLVMValueRef value = LLVMBuildMul(ctx->ac.builder,
-					  num_patches,
-					  LLVMConstInt(ctx->ac.i32, output_patch0_offset, false), "");
-	return LLVMBuildAdd(ctx->ac.builder,
-			    value,
-			    LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size / 4, false), "");
+	return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
@@ -493,7 +531,6 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
 			if (previous_stage == MESA_SHADER_VERTEX)
 				user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
 		}
-		user_sgpr_info->sgpr_count += 1;
 		break;
 	case MESA_SHADER_TESS_EVAL:
 		user_sgpr_info->sgpr_count += 1;
@@ -789,8 +826,6 @@ static void create_function(struct radv_shader_context *ctx,
 							has_previous_stage,
 							previous_stage, &args);
 
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->tcs_offchip_layout);
 			if (needs_view_index)
 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
 					&ctx->abi.view_index);
@@ -808,8 +843,6 @@ static void create_function(struct radv_shader_context *ctx,
 						   &user_sgpr_info, &args,
 						   &desc_sets);
 
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->tcs_offchip_layout);
 			if (needs_view_index)
 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
 					&ctx->abi.view_index);
@@ -1018,7 +1051,6 @@ static void create_function(struct radv_shader_context *ctx,
 	case MESA_SHADER_TESS_CTRL:
 		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
 					   previous_stage, &user_sgpr_idx);
-		set_loc_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 1);
 		if (ctx->abi.view_index)
 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
 		break;
@@ -1115,30 +1147,58 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
  *
  * Note that every attribute has 4 components.
  */
+/* Offset added to the buffer address for accesses that have no vertex index (per-patch data). */
+static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx)
+{
+	uint32_t vertex_bytes, patch_bytes;
+
+	if (ctx->stage != MESA_SHADER_TESS_CTRL)
+		return ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 16);
+
+	vertex_bytes = util_last_bit64(ctx->shader_info->info.tcs.outputs_written) * 16;
+	patch_bytes = ctx->tcs_vertices_per_patch * vertex_bytes;
+	return LLVMConstInt(ctx->ac.i32, patch_bytes * ctx->tcs_num_patches, false);
+}
+
+/* Returns the param stride: the patch count, scaled by vertices per patch for vertex-indexed access. */
+static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx,
+				      LLVMValueRef vertex_index)
+{
+	LLVMValueRef patch_count, verts_per_patch;
+
+	if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+		/* In the TCS both factors are compile-time constants. */
+		uint32_t stride = ctx->tcs_num_patches;
+		if (vertex_index)
+			stride *= ctx->tcs_vertices_per_patch;
+		return LLVMConstInt(ctx->ac.i32, stride, false);
+	}
+
+	patch_count = ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
+	if (!vertex_index)
+		return patch_count;
+	verts_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false);
+	return LLVMBuildMul(ctx->ac.builder, verts_per_patch, patch_count, "");
+}
+
 static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
                                                LLVMValueRef vertex_index,
                                                LLVMValueRef param_index)
 {
-	LLVMValueRef base_addr, vertices_per_patch, num_patches;
+	LLVMValueRef base_addr;
 	LLVMValueRef param_stride, constant16;
 	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
-
-	vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false);
-	num_patches = ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
-
+	LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false);
 	constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
+	param_stride = calc_param_stride(ctx, vertex_index);
 	if (vertex_index) {
 		base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
 		                         vertices_per_patch, "");
 
 		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 		                         vertex_index, "");
-
-		param_stride = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
-					    num_patches, "");
 	} else {
 		base_addr = rel_patch_id;
-		param_stride = num_patches;
 	}
 
 	base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
@@ -1148,8 +1208,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
 	base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 
 	if (!vertex_index) {
-		LLVMValueRef patch_data_offset =
-		           ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 16);
+		LLVMValueRef patch_data_offset = get_non_vertex_index_offset(ctx);
 
 		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 		                         patch_data_offset, "");
@@ -3043,6 +3102,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 				ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs;
 			else
 				ctx.tcs_num_inputs = util_last_bit64(shader_info->info.vs.ls_outputs_written);
+			ctx.tcs_num_patches = get_tcs_num_patches(&ctx);
 		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
 			ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode;
 			ctx.abi.load_tess_varyings = load_tes_input;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 449381c3460..641dc5558b8 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2604,15 +2604,6 @@ radv_pipeline_generate_tess_shaders(struct radeon_winsys_cs *cs,
 
 	struct radv_userdata_info *loc;
 
-	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, AC_UD_TCS_OFFCHIP_LAYOUT);
-	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_TESS_CTRL];
-		assert(loc->num_sgprs == 1);
-		assert(!loc->indirect);
-		radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, 1);
-		radeon_emit(cs, tess->offchip_layout);
-	}
-
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, AC_UD_TES_OFFCHIP_LAYOUT);
 	if (loc->sgpr_idx != -1) {
 		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_TESS_EVAL];
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 89875a56a02..704461e02cb 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -463,6 +463,7 @@ shader_variant_create(struct radv_device *device,
 	options->dump_preoptir = options->dump_shader &&
 				 device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
 	options->record_llvm_ir = device->keep_shader_info;
+	options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
 
 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 1e006622edc..3726adb2593 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -103,6 +103,7 @@ struct radv_nir_compiler_options {
 	bool record_llvm_ir;
 	enum radeon_family family;
 	enum chip_class chip_class;
+	uint32_t tess_offchip_block_dw_size;
 };
 
 enum radv_ud_index {
@@ -120,7 +121,6 @@ enum radv_ud_index {
 	AC_UD_CS_MAX_UD,
 	AC_UD_GS_VS_RING_STRIDE_ENTRIES = AC_UD_VS_MAX_UD,
 	AC_UD_GS_MAX_UD,
-	AC_UD_TCS_OFFCHIP_LAYOUT = AC_UD_VS_MAX_UD,
 	AC_UD_TCS_MAX_UD,
 	AC_UD_TES_OFFCHIP_LAYOUT = AC_UD_SHADER_START,
 	AC_UD_TES_MAX_UD,