radeonsi: Process multiple patches per threadgroup.

Using more than 1 wave per threadgroup does increase performance generally.
Not using too many patches per threadgroup also increases performance. Both
catalyst and amdgpu-pro seem to use 40 patches as their maximum, but I haven't
really seen any performance increase from limiting the number of patches to 40
instead of 64.

Note that the trick where we overlap the input and output LDS does not work
anymore as the insertion of the tess factors changes the patch stride.

v2: - Add comment about LDS assumptions.
    - Add constant for buffer size.
    - Fix code style.
v3: - Correct limits for not splitting patches between waves.
    - Set max num_patches to 40 as in the proprietary driver.

Signed-off-by: Bas Nieuwenhuizen <[email protected]>
Reviewed-by: Marek Olšák <[email protected]>
author: Bas Nieuwenhuizen <[email protected]> 2016-05-02 15:00:21 +0200
committer: Bas Nieuwenhuizen <[email protected]> 2016-05-26 22:07:04 +0200
commit: f91c85b29b8261ab3f44d292ed2130d4f577d976 (patch)
tree: ae9cea7c227e4df9673c25204acf2fa04fe22d37 /src
parent: fd0a7a382f2accea67396584826f5f8e40239ef3 (diff)
1 files changed, 35 insertions, 15 deletions
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 6fe2619d601..c8b87a9f1a0 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,20 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	unsigned input_patch_size, output_patch_size, output_patch0_offset;
 	unsigned perpatch_output_offset, lds_size, ls_rsrc2;
 	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
-	unsigned offchip_layout;
-
-	*num_patches = 1; /* TODO: calculate this */
-
-	if (sctx->last_ls == ls->current &&
-	    sctx->last_tcs == tcs &&
-	    sctx->last_tes_sh_base == tes_sh_base &&
-	    sctx->last_num_tcs_input_cp == num_tcs_input_cp)
-		return;
-
-	sctx->last_ls = ls->current;
-	sctx->last_tcs = tcs;
-	sctx->last_tes_sh_base = tes_sh_base;
-	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+	unsigned offchip_layout, hardware_lds_size;
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
@@ -146,7 +133,29 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 
-	output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0;
+	/* Ensure that we only need one wave per SIMD so we don't need to check
+	 * resource usage. Also ensures that the number of tcs in and out
+	 * vertices per threadgroup are at most 256.
+	 */
+	*num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+
+	/* Make sure that the data fits in LDS. This assumes the shaders only
+	 * use LDS for the inputs and outputs.
+	 */
+	hardware_lds_size = sctx->b.chip_class >= CIK ? 65536 : 32768;
+	*num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
+	                                                       output_patch_size));
+
+	/* Make sure the output data fits in the offchip buffer */
+	*num_patches = MIN2(*num_patches, SI_TESS_OFFCHIP_BLOCK_SIZE /
+	                                  output_patch_size);
+
+	/* Not necessary for correctness, but improves performance. The
+	 * specific value is taken from the proprietary driver.
+	 */
+	*num_patches = MIN2(*num_patches, 40);
+
+	output_patch0_offset = input_patch_size * *num_patches;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
@@ -160,6 +169,17 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 		ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256);
 	}
 
+	if (sctx->last_ls == ls->current &&
+	    sctx->last_tcs == tcs &&
+	    sctx->last_tes_sh_base == tes_sh_base &&
+	    sctx->last_num_tcs_input_cp == num_tcs_input_cp)
+		return;
+
+	sctx->last_ls = ls->current;
+	sctx->last_tcs = tcs;
+	sctx->last_tes_sh_base = tes_sh_base;
+	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+
 	/* Due to a hw bug, RSRC2_LS must be written twice with another
 	 * LS register written in between. */
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
author	Bas Nieuwenhuizen <[email protected]>	2016-05-02 15:00:21 +0200
committer	Bas Nieuwenhuizen <[email protected]>	2016-05-26 22:07:04 +0200
commit	f91c85b29b8261ab3f44d292ed2130d4f577d976 (patch)
tree	ae9cea7c227e4df9673c25204acf2fa04fe22d37 /src
parent	fd0a7a382f2accea67396584826f5f8e40239ef3 (diff)