radv/pipeline: start calculating tess stage.

This calculates the pipeline state for tessellation. It also moves the GS ring size calculation further down, below the point where the tessellation shaders will be compiled, because it needs information from those shaders.

Reviewed-by: Bas Nieuwenhuizen <[email protected]>
Signed-off-by: Dave Airlie <[email protected]>
author: Dave Airlie <[email protected]> 2017-03-30 08:18:13 +0100
committer: Dave Airlie <[email protected]> 2017-04-01 07:16:19 +1000
commit: 4c60c68bd16486dfb57ba177487bc599ad3ef9f5 (patch)
tree: b5f49b6b6d233ce127b5d27836b7cd8912f587b5 /src
parent: 823b55a8a90ad1cb9a3f9652cf46789f5e0b79f5 (diff)
2 files changed, 208 insertions, 9 deletions
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index dbae47b9736..fdf6f87fac7 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1465,7 +1465,7 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 }
 
 static union ac_shader_variant_key
-radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es)
+radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es, bool as_ls)
 {
 	union ac_shader_variant_key key;
 	const VkPipelineVertexInputStateCreateInfo *input_state =
@@ -1474,6 +1474,7 @@ radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es)
 	memset(&key, 0, sizeof(key));
 	key.vs.instance_rate_inputs = 0;
 	key.vs.as_es = as_es;
+	key.vs.as_ls = as_ls;
 
 	for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
 		unsigned binding;
@@ -1495,8 +1496,10 @@ calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 	unsigned alignment = 256 * num_se;
 	/* The maximum size is 63.999 MB per SE. */
 	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
-	struct ac_es_output_info *es_info = &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.es_info;
 	struct ac_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct ac_es_output_info *es_info = radv_pipeline_has_tess(pipeline) ?
+		&pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.es_info :
+		&pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.es_info;
 
 	/* Calculate the minimum size. */
 	unsigned min_esgs_ring_size = align(es_info->esgs_itemsize * gs_vertex_reuse *
@@ -1515,6 +1518,164 @@ calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 	pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
 }
 
+static void si_multiwave_lds_size_workaround(struct radv_device *device,
+					     unsigned *lds_size)
+{
+	/* SPI barrier management bug (workgroups of more than one wavefront):
+	 *   On the chips below, raise *lds_size to at least 8 so that at least
+	 *   4k of LDS is in use (8 == 4k presumably assumes 512-byte units).
+	 */
+	if (device->physical_device->rad_info.family == CHIP_BONAIRE ||
+	    device->physical_device->rad_info.family == CHIP_KABINI ||
+	    device->physical_device->rad_info.family == CHIP_MULLINS)
+		*lds_size = MAX2(*lds_size, 8);
+}
+
+static void
+calculate_tess_state(struct radv_pipeline *pipeline,
+		     const VkGraphicsPipelineCreateInfo *pCreateInfo)
+{
+	unsigned num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
+	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
+	unsigned num_tcs_patch_outputs;
+	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
+	unsigned input_patch_size, output_patch_size, output_patch0_offset;
+	unsigned lds_size, hardware_lds_size;
+	unsigned perpatch_output_offset;
+	unsigned num_patches;
+	struct radv_tessellation_state *tess = &pipeline->graphics.tess;
+
+	/* Fills in pipeline->graphics.tess. The first part calculates how shader
+	 * inputs and outputs among VS, TCS, and TES are laid out in LDS. */
+	num_tcs_inputs = util_last_bit64(pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outputs_written);
+
+	num_tcs_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.outputs_written); // per-vertex outputs written by the TCS
+	num_tcs_output_cp = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // output control points per patch
+	num_tcs_patch_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.patch_outputs_written);
+
+	/* Sizes of the LDS layout (apparently in bytes): every input/output
+	 * slot counted above takes 16. A patch holds the per-vertex data of all
+	 * its control points; an output patch also holds the per-patch outputs.
+	 */
+	input_vertex_size = num_tcs_inputs * 16;
+	output_vertex_size = num_tcs_outputs * 16;
+
+	input_patch_size = num_tcs_input_cp * input_vertex_size;
+
+	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+	/* Ensure that we only need one wave per SIMD so we don't need to check
+	 * resource usage. Also ensures that the number of tcs in and out
+	 * vertices per threadgroup are at most 256.
+	 */
+	num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+
+	/* Make sure that the data fits in LDS. This assumes the shaders only
+	 * use LDS for the inputs and outputs.
+	 */
+	hardware_lds_size = pipeline->device->physical_device->rad_info.chip_class >= CIK ? 65536 : 32768;
+	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
+
+	/* Make sure the output data fits in the offchip buffer */
+	num_patches = MIN2(num_patches,
+			    (pipeline->device->tess_offchip_block_dw_size * 4) /
+			    output_patch_size);
+
+	/* Not necessary for correctness, but improves performance. The
+	 * specific value is taken from the proprietary driver.
+	 */
+	num_patches = MIN2(num_patches, 40);
+
+	/* SI bug workaround - limit LS-HS threadgroups to only one wave. */
+	if (pipeline->device->physical_device->rad_info.chip_class == SI) {
+		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+		num_patches = MIN2(num_patches, one_wave);
+	}
+
+	output_patch0_offset = input_patch_size * num_patches;
+	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+
+	lds_size = output_patch0_offset + output_patch_size * num_patches;
+
+	if (pipeline->device->physical_device->rad_info.chip_class >= CIK) {
+		assert(lds_size <= 65536);
+		lds_size = align(lds_size, 512) / 512; /* from here on counted in 512-unit granules */
+	} else {
+		assert(lds_size <= 32768);
+		lds_size = align(lds_size, 256) / 256; /* from here on counted in 256-unit granules */
+	}
+	si_multiwave_lds_size_workaround(pipeline->device, &lds_size);
+
+	tess->lds_size = lds_size;
+
+	tess->tcs_in_layout = (input_patch_size / 4) |
+		((input_vertex_size / 4) << 13);
+	tess->tcs_out_layout = (output_patch_size / 4) |
+		((output_vertex_size / 4) << 13);
+	tess->tcs_out_offsets = (output_patch0_offset / 16) |
+		((perpatch_output_offset / 16) << 16);
+	tess->offchip_layout = (pervertex_output_patch_size * num_patches << 16) |
+		(num_tcs_output_cp << 9) | num_patches;
+
+	tess->ls_hs_config = S_028B58_NUM_PATCHES(num_patches) |
+		S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
+		S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
+	tess->num_patches = num_patches;
+	tess->num_tcs_input_cp = num_tcs_input_cp;
+
+	struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+	unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
+
+	switch (tes->info.tes.primitive_mode) {
+	case GL_TRIANGLES:
+		type = V_028B6C_TESS_TRIANGLE;
+		break;
+	case GL_QUADS:
+		type = V_028B6C_TESS_QUAD;
+		break;
+	case GL_ISOLINES:
+		type = V_028B6C_TESS_ISOLINE;
+		break;
+	} /* no default case: any other primitive mode leaves type at 0 */
+
+	switch (tes->info.tes.spacing) {
+	case TESS_SPACING_EQUAL:
+		partitioning = V_028B6C_PART_INTEGER;
+		break;
+	case TESS_SPACING_FRACTIONAL_ODD:
+		partitioning = V_028B6C_PART_FRAC_ODD;
+		break;
+	case TESS_SPACING_FRACTIONAL_EVEN:
+		partitioning = V_028B6C_PART_FRAC_EVEN;
+		break;
+	default:
+		break;
+	}
+
+	if (tes->info.tes.point_mode)
+		topology = V_028B6C_OUTPUT_POINT;
+	else if (tes->info.tes.primitive_mode == GL_ISOLINES)
+		topology = V_028B6C_OUTPUT_LINE;
+	else if (tes->info.tes.ccw) /* NOTE(review): ccw selects TRIANGLE_CW and !ccw selects TRIANGLE_CCW - confirm the inversion is intended */
+		topology = V_028B6C_OUTPUT_TRIANGLE_CW;
+	else
+		topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
+
+	if (pipeline->device->has_distributed_tess) {
+		if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
+		    pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
+			distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
+		else
+			distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
+	} else
+		distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
+
+	tess->tf_param = S_028B6C_TYPE(type) |
+		S_028B6C_PARTITIONING(partitioning) |
+		S_028B6C_TOPOLOGY(topology) |
+		S_028B6C_DISTRIBUTION_MODE(distribution_mode);
+}
+
 static const struct radv_prim_vertex_count prim_size_table[] = {
 	[V_008958_DI_PT_NONE] = {0, 0},
 	[V_008958_DI_PT_POINTLIST] = {1, 1},
@@ -1558,7 +1719,7 @@ static uint32_t si_vgt_gs_mode(struct radv_shader_variant *gs)
 static void calculate_pa_cl_vs_out_cntl(struct radv_pipeline *pipeline)
 {
 	struct radv_shader_variant *vs;
-	vs = radv_pipeline_has_gs(pipeline) ? pipeline->gs_copy_shader : pipeline->shaders[MESA_SHADER_VERTEX];
+	vs = radv_pipeline_has_gs(pipeline) ? pipeline->gs_copy_shader : (radv_pipeline_has_tess(pipeline) ? pipeline->shaders[MESA_SHADER_TESS_EVAL] :  pipeline->shaders[MESA_SHADER_VERTEX]);
 
 	struct ac_vs_output_info *outinfo = &vs->info.vs.outinfo;
 
@@ -1588,7 +1749,7 @@ static void calculate_ps_inputs(struct radv_pipeline *pipeline)
 	struct ac_vs_output_info *outinfo;
 
 	ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
-	vs = radv_pipeline_has_gs(pipeline) ? pipeline->gs_copy_shader : pipeline->shaders[MESA_SHADER_VERTEX];
+	vs = radv_pipeline_has_gs(pipeline) ? pipeline->gs_copy_shader : (radv_pipeline_has_tess(pipeline) ? pipeline->shaders[MESA_SHADER_TESS_EVAL] :  pipeline->shaders[MESA_SHADER_VERTEX]);
 
 	outinfo = &vs->info.vs.outinfo;
 
@@ -1681,8 +1842,13 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
 
 	if (modules[MESA_SHADER_VERTEX]) {
-		bool as_es = modules[MESA_SHADER_GEOMETRY] != NULL;
-		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, as_es);
+		bool as_es = false;
+		bool as_ls = false;
+		if (modules[MESA_SHADER_TESS_CTRL])
+			as_ls = true;
+		else if (modules[MESA_SHADER_GEOMETRY])
+			as_es = true;
+		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, as_es, as_ls);
 
 		pipeline->shaders[MESA_SHADER_VERTEX] =
 			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_VERTEX],
@@ -1695,7 +1861,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	}
 
 	if (modules[MESA_SHADER_GEOMETRY]) {
-		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, false);
+		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, false, false);
 
 		pipeline->shaders[MESA_SHADER_GEOMETRY] =
 			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_GEOMETRY],
@@ -1705,7 +1871,6 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 					       pipeline->layout, &key);
 
 		pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_GEOMETRY);
-		calculate_gs_ring_sizes(pipeline);
 
 		pipeline->graphics.vgt_gs_mode = si_vgt_gs_mode(pipeline->shaders[MESA_SHADER_GEOMETRY]);
 	} else
@@ -1805,12 +1970,33 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	calculate_ps_inputs(pipeline);
 
 	uint32_t stages = 0;
-	if (radv_pipeline_has_gs(pipeline))
+	if (radv_pipeline_has_tess(pipeline)) {
+		stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
+			S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
+
+		if (radv_pipeline_has_gs(pipeline))
+			stages |=  S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+				S_028B54_GS_EN(1) |
+				S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		else
+			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+	} else if (radv_pipeline_has_gs(pipeline))
 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 			S_028B54_GS_EN(1) |
 			S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
 	pipeline->graphics.vgt_shader_stages_en = stages;
 
+	if (radv_pipeline_has_gs(pipeline))
+		calculate_gs_ring_sizes(pipeline);
+
+	if (radv_pipeline_has_tess(pipeline)) {
+		if (pipeline->graphics.prim == V_008958_DI_PT_PATCH) {
+			pipeline->graphics.prim_vertex_count.min = pCreateInfo->pTessellationState->patchControlPoints;
+			pipeline->graphics.prim_vertex_count.incr = 1;
+		}
+		calculate_tess_state(pipeline, pCreateInfo);
+	}
+
 	const VkPipelineVertexInputStateCreateInfo *vi_info =
 		pCreateInfo->pVertexInputState;
 	for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index d6982d826aa..3f92d59ac97 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -936,6 +936,18 @@ struct radv_prim_vertex_count {
 	uint8_t incr;
 };
 
+struct radv_tessellation_state {
+	uint32_t ls_hs_config; /* S_028B58_* fields: NUM_PATCHES, HS_NUM_INPUT_CP, HS_NUM_OUTPUT_CP */
+	uint32_t tcs_in_layout; /* (input patch size / 4) | (input vertex size / 4) << 13 */
+	uint32_t tcs_out_layout; /* (output patch size / 4) | (output vertex size / 4) << 13 */
+	uint32_t tcs_out_offsets; /* (output patch0 offset / 16) | (per-patch output offset / 16) << 16 */
+	uint32_t offchip_layout; /* (per-vertex output patch size * num_patches) << 16 | (output CPs << 9) | num_patches */
+	unsigned num_patches; /* patch count chosen by calculate_tess_state() */
+	unsigned lds_size; /* LDS size in 512-unit (CIK and newer) or 256-unit granules, after the multiwave workaround */
+	unsigned num_tcs_input_cp; /* patchControlPoints taken from pTessellationState */
+	uint32_t tf_param; /* S_028B6C_* fields: TYPE, PARTITIONING, TOPOLOGY, DISTRIBUTION_MODE */
+};
+
 struct radv_pipeline {
 	struct radv_device *                          device;
 	uint32_t                                     dynamic_state_mask;
@@ -962,6 +974,7 @@ struct radv_pipeline {
 			struct radv_depth_stencil_state ds;
 			struct radv_raster_state raster;
 			struct radv_multisample_state ms;
+			struct radv_tessellation_state tess;
 			uint32_t db_shader_control;
 			uint32_t shader_z_format;
 			unsigned prim;
author	Dave Airlie <[email protected]>	2017-03-30 08:18:13 +0100
committer	Dave Airlie <[email protected]>	2017-04-01 07:16:19 +1000
commit	4c60c68bd16486dfb57ba177487bc599ad3ef9f5 (patch)
tree	b5f49b6b6d233ce127b5d27836b7cd8912f587b5 /src
parent	823b55a8a90ad1cb9a3f9652cf46789f5e0b79f5 (diff)