diff options
Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 16 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 6 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 189 |
3 files changed, 210 insertions, 1 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index f85a61be9c6..7febee97f8c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -671,6 +671,21 @@ struct si_shader { } gs; struct { + unsigned ge_max_output_per_subgroup; + unsigned ge_ngg_subgrp_cntl; + unsigned vgt_primitiveid_en; + unsigned vgt_gs_onchip_cntl; + unsigned vgt_gs_instance_cnt; + unsigned vgt_esgs_ring_itemsize; + unsigned vgt_reuse_off; + unsigned spi_vs_out_config; + unsigned spi_shader_idx_format; + unsigned spi_shader_pos_format; + unsigned pa_cl_vte_cntl; + unsigned vgt_gs_max_vert_out; /* for API GS */ + } ngg; + + struct { unsigned vgt_gs_mode; unsigned vgt_primitiveid_en; unsigned vgt_reuse_off; @@ -693,6 +708,7 @@ struct si_shader { /*For save precompute registers value */ unsigned vgt_tf_param; /* VGT_TF_PARAM */ unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ + unsigned ge_cntl; }; struct si_shader_part { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index c271effe053..23c7b3245f5 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -318,8 +318,12 @@ enum si_tracked_reg { SI_TRACKED_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_REUSE_OFF, SI_TRACKED_SPI_VS_OUT_CONFIG, - SI_TRACKED_SPI_SHADER_POS_FORMAT, SI_TRACKED_PA_CL_VTE_CNTL, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_NGG_SUBGRP_CNTL, + + SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */ + SI_TRACKED_SPI_SHADER_POS_FORMAT, SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */ SI_TRACKED_SPI_PS_INPUT_ADDR, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 35a8577f76b..2537dd90b5a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -941,6 +941,191 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } } +/* Common tail code for NGG primitive shaders. */ +static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, + struct si_shader *shader, + unsigned initial_cdw) +{ + radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + shader->ctx_reg.ngg.ge_max_output_per_subgroup); + radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, + SI_TRACKED_GE_NGG_SUBGRP_CNTL, + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, + SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.ngg.vgt_primitiveid_en); + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, + SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.ngg.vgt_gs_onchip_cntl); + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, + SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.ngg.vgt_gs_instance_cnt); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); + radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, + SI_TRACKED_VGT_REUSE_OFF, + shader->ctx_reg.ngg.vgt_reuse_off); + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, + SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.ngg.spi_vs_out_config); + radeon_opt_set_context_reg2(sctx, R_028708_SPI_SHADER_IDX_FORMAT, + SI_TRACKED_SPI_SHADER_IDX_FORMAT, + shader->ctx_reg.ngg.spi_shader_idx_format, + shader->ctx_reg.ngg.spi_shader_pos_format); + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, + SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.ngg.pa_cl_vte_cntl); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + if (shader->ge_cntl != sctx->last_multi_vgt_param) { + radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, shader->ge_cntl); + sctx->last_multi_vgt_param = shader->ge_cntl; + } +} + +static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); +} + +static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, + SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); +} + +/** + * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader + * in NGG mode. + */ +static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) +{ + const struct tgsi_shader_info *info = &shader->selector->info; + enum pipe_shader_type es_type = shader->selector->type; + unsigned num_user_sgprs; + unsigned nparams, es_vgpr_comp_cnt; + uint64_t va; + unsigned window_space = + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; + struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = es_type == PIPE_SHADER_TESS_EVAL ? gfx10_emit_shader_ngg_tess_nogs + : gfx10_emit_shader_ngg_notess_nogs; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (es_type == PIPE_SHADER_VERTEX) { + /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ + es_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0; + + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) { + num_user_sgprs = SI_SGPR_VS_BLIT_DATA + + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + } + } else { + assert(es_type == PIPE_SHADER_TESS_EVAL); + es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | + S_00B228_DX10_CLAMP(1) | + S_00B228_GS_VGPR_COMP_CNT(3)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5)); + + /* TODO: Use NO_PC_EXPORT when applicable. */ + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.ngg.spi_vs_out_config = + S_0286C4_VS_EXPORT_COUNT(nparams - 1); + + shader->ctx_reg.ngg.spi_shader_idx_format = + S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); + shader->ctx_reg.ngg.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE); + + shader->ctx_reg.ngg.vgt_primitiveid_en = + S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | + S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id); + + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, pm4); + + uint32_t es_verts_per_subgrp = 252; + uint32_t gs_prims_per_subgrp = 255; + shader->ctx_reg.ngg.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgrp) | + S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgrp) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_prims_per_subgrp); + shader->ctx_reg.ngg.ge_max_output_per_subgroup = + S_0287FC_MAX_VERTS_PER_SUBGROUP(es_verts_per_subgrp); + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = + S_028B4C_PRIM_AMP_FACTOR(1) | + S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(gs_prims_per_subgrp) | + S_03096C_VERT_GRP_SIZE(es_verts_per_subgrp); + + if (window_space) { + shader->ctx_reg.ngg.pa_cl_vte_cntl = + S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + } else { + shader->ctx_reg.ngg.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | + S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + } + + shader->ctx_reg.ngg.vgt_reuse_off = + S_028AB4_REUSE_OFF(sscreen->info.family == CHIP_NAVI10 && + sscreen->info.chip_external_rev == 0x1 && + es_type == PIPE_SHADER_TESS_EVAL); +} + static void si_emit_shader_vs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.vs->shader; @@ -1327,6 +1512,8 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, si_shader_ls(sscreen, shader); else if (shader->key.as_es) si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); else si_shader_vs(sscreen, shader, NULL); break; @@ -1336,6 +1523,8 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, case PIPE_SHADER_TESS_EVAL: if (shader->key.as_es) si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); else si_shader_vs(sscreen, shader, NULL); break; |