diff options
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 50 |
1 files changed, 35 insertions, 15 deletions
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 6fe2619d601..c8b87a9f1a0 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -108,20 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned input_patch_size, output_patch_size, output_patch0_offset; unsigned perpatch_output_offset, lds_size, ls_rsrc2; unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; - unsigned offchip_layout; - - *num_patches = 1; /* TODO: calculate this */ - - if (sctx->last_ls == ls->current && - sctx->last_tcs == tcs && - sctx->last_tes_sh_base == tes_sh_base && - sctx->last_num_tcs_input_cp == num_tcs_input_cp) - return; - - sctx->last_ls = ls->current; - sctx->last_tcs = tcs; - sctx->last_tes_sh_base = tes_sh_base; - sctx->last_num_tcs_input_cp = num_tcs_input_cp; + unsigned offchip_layout, hardware_lds_size; /* This calculates how shader inputs and outputs among VS, TCS, and TES * are laid out in LDS. */ @@ -146,7 +133,29 @@ static void si_emit_derived_tess_state(struct si_context *sctx, pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0; + /* Ensure that we only need one wave per SIMD so we don't need to check + * resource usage. Also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256. + */ + *num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; + + /* Make sure that the data fits in LDS. This assumes the shaders only + * use LDS for the inputs and outputs. + */ + hardware_lds_size = sctx->b.chip_class >= CIK ? 65536 : 32768; + *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + + output_patch_size)); + + /* Make sure the output data fits in the offchip buffer */ + *num_patches = MIN2(*num_patches, SI_TESS_OFFCHIP_BLOCK_SIZE / + output_patch_size); + + /* Not necessary for correctness, but improves performance. The + * specific value is taken from the proprietary driver. + */ + *num_patches = MIN2(*num_patches, 40); + + output_patch0_offset = input_patch_size * *num_patches; perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; lds_size = output_patch0_offset + output_patch_size * *num_patches; @@ -160,6 +169,17 @@ static void si_emit_derived_tess_state(struct si_context *sctx, ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256); } + if (sctx->last_ls == ls->current && + sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && + sctx->last_num_tcs_input_cp == num_tcs_input_cp) + return; + + sctx->last_ls = ls->current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + /* Due to a hw bug, RSRC2_LS must be written twice with another * LS register written in between. */ if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) |