diff options
author | Nicolai Hähnle <[email protected]> | 2018-05-23 22:31:41 +0200 |
---|---|---|
committer | Marek Olšák <[email protected]> | 2019-07-03 15:51:12 -0400 |
commit | 4ecc39e1aa1568f19ebf54a99ffe14643bac7d15 (patch) | |
tree | db0e1b388cdf5bf395e4d2bf1b31f53ae1096a55 | |
parent | a04aa4be2bda7cfac541cd72a1a64fa23cb2e6a5 (diff) |
radeonsi/gfx10: NGG geometry shader PM4 and upload
Acked-by: Bas Nieuwenhuizen <[email protected]>
-rw-r--r-- | src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 187 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 16 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 9 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader_internal.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 132 |
5 files changed, 316 insertions, 29 deletions
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 014fe1f96c9..87ca56b1fdf 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -651,3 +651,190 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 5145); } + +static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, + unsigned min_verts_per_prim, bool use_adjacency) +{ + unsigned max_reuse = max_esverts - min_verts_per_prim; + if (use_adjacency) + max_reuse /= 2; + *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); +} + +/** + * Determine subgroup information like maximum number of vertices and prims. + * + * This happens before the shader is uploaded, since LDS relocations during + * upload depend on the subgroup size. + */ +void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) +{ + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; + const enum pipe_shader_type gs_type = gs_sel->type; + const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + /* TODO: Specialize for known primitive type without GS. */ + const unsigned input_prim = gs_type == PIPE_SHADER_GEOMETRY ? + gs_sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM] : + PIPE_PRIM_TRIANGLES; + const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && + input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); + const unsigned min_verts_per_prim = + gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. 
+ * + * Streamout can increase the ESGS buffer size later on, so be more + * conservative with streamout and use 4K dwords. This may be suboptimal. + * + * Otherwise, use the limit of 7K dwords. The reason is that we need + * to leave some headroom for the max_esverts increase at the end. + * + * TODO: We should really take the shader's internal LDS use into + * account. The linker will fail if the size is greater than + * 8K dwords. + */ + const unsigned max_lds_size = (gs_sel->so.num_outputs ? 4 : 7) * 1024 - 128; + const unsigned target_lds_size = max_lds_size; + unsigned esvert_lds_size = 0; + unsigned gsprim_lds_size = 0; + + /* All these are per subgroup: */ + bool max_vert_out_per_gs_instance = false; + unsigned max_esverts_base = 256; + unsigned max_gsprims_base = 128; /* default prim group size clamp */ + + /* Hardware has the following non-natural restrictions on the value + * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of + * the draw: + * - at most 252 for any line input primitive type + * - at most 251 for any quad input primitive type + * - at most 251 for triangle strips with adjacency (this happens to + * be the natural limit for triangle *lists* with adjacency) + */ + max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); + + if (gs_type == PIPE_SHADER_GEOMETRY) { + unsigned max_out_verts_per_gsprim = + gs_sel->gs_max_out_vertices * gs_num_invocations; + + if (max_out_verts_per_gsprim <= 256) { + if (max_out_verts_per_gsprim) { + max_gsprims_base = MIN2(max_gsprims_base, + 256 / max_out_verts_per_gsprim); + } + } else { + /* Use special multi-cycling mode in which each GS + * instance gets its own subgroup. Does not work with + * tessellation.
*/ + max_vert_out_per_gs_instance = true; + max_gsprims_base = 1; + max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; + } + + esvert_lds_size = es_sel->esgs_itemsize / 4; + gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; + } else { + /* TODO: This needs to be adjusted once LDS use for compaction + * after culling is implemented. */ + } + + unsigned max_gsprims = max_gsprims_base; + unsigned max_esverts = max_esverts_base; + + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + + if (esvert_lds_size || gsprim_lds_size) { + /* Now that we have a rough proportionality between esverts + * and gsprims based on the primitive type, scale both of them + * down simultaneously based on required LDS space. + * + * We could be smarter about this if we knew how much vertex + * reuse to expect. + */ + unsigned lds_total = max_esverts * esvert_lds_size + + max_gsprims * gsprim_lds_size; + if (lds_total > target_lds_size) { + max_esverts = max_esverts * target_lds_size / lds_total; + max_gsprims = max_gsprims * target_lds_size / lds_total; + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, + min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } + } + + /* Round up towards full wave sizes for better ALU utilization. 
*/ + if (!max_vert_out_per_gs_instance) { + const unsigned wavesize = 64; + unsigned orig_max_esverts; + unsigned orig_max_gsprims; + do { + orig_max_esverts = max_esverts; + orig_max_gsprims = max_gsprims; + + max_esverts = align(max_esverts, wavesize); + max_esverts = MIN2(max_esverts, max_esverts_base); + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, + (max_lds_size - max_gsprims * gsprim_lds_size) / + esvert_lds_size); + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + + max_gsprims = align(max_gsprims, wavesize); + max_gsprims = MIN2(max_gsprims, max_gsprims_base); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, + (max_lds_size - max_esverts * esvert_lds_size) / + gsprim_lds_size); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, + min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); + } + + /* Hardware restriction: minimum value of max_esverts */ + max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim); + + unsigned max_out_vertices = + max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices : + gs_type == PIPE_SHADER_GEOMETRY ? + max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices : + max_esverts; + assert(max_out_vertices <= 256); + + unsigned prim_amp_factor = 1; + if (gs_type == PIPE_SHADER_GEOMETRY) { + /* Number of output primitives per GS input primitive after + * GS instancing. */ + prim_amp_factor = gs_sel->gs_max_out_vertices; + } + + /* The GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. 
+ */ + shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; + shader->ngg.max_gsprims = max_gsprims; + shader->ngg.max_out_verts = max_out_vertices; + shader->ngg.prim_amp_factor = prim_amp_factor; + shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + + shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size; + shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + + assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */ +} diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cc05b33ae1b..48b27c9e2ed 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5156,7 +5156,7 @@ static bool si_shader_binary_open(struct si_screen *screen, #undef add_part - struct ac_rtld_symbol lds_symbols[1]; + struct ac_rtld_symbol lds_symbols[2]; unsigned num_lds_symbols = 0; if (sel && screen->info.chip_class >= GFX9 && @@ -5170,6 +5170,13 @@ static bool si_shader_binary_open(struct si_screen *screen, sym->align = 64 * 1024; } + if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = shader->ngg.ngg_emit_size * 4; + sym->align = 4; + } + bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ .info = &screen->info, .options = { @@ -5198,7 +5205,6 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh return rtld.rx_size; } - static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) { uint64_t *scratch_va = data; @@ -8219,8 +8225,12 @@ bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compil si_calculate_max_simd_waves(shader); } - if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) + if (shader->key.as_ngg) { + assert(!shader->key.as_es && !shader->key.as_ls); + gfx10_ngg_calculate_subgroup_info(shader); + } else if 
(sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) { gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); + } si_fix_resource_usage(sscreen, shader); si_shader_dump(sscreen, shader, debug, sel->info.processor, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 7febee97f8c..39af557bcae 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -643,6 +643,15 @@ struct si_shader { struct ac_shader_config config; struct si_shader_info info; + struct { + uint16_t ngg_emit_size; /* in dwords */ + uint16_t hw_max_esverts; + uint16_t max_gsprims; + uint16_t max_out_verts; + uint16_t prim_amp_factor; + bool max_vert_out_per_gs_instance; + } ngg; + /* Shader key + LLVM IR + disassembly + statistics. * Generated for debug contexts only. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 55f32c66117..09efc91b9f5 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -389,5 +389,6 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); +void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); #endif diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 27835811cb7..be9ab3bcdd6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1013,26 +1013,75 @@ static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } +static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if 
(!shader) + return; + + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, + SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); +} + +static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, + SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, + SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); +} + /** * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader * in NGG mode. */ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) { - const struct tgsi_shader_info *info = &shader->selector->info; - enum pipe_shader_type es_type = shader->selector->type; + const struct si_shader_selector *gs_sel = shader->selector; + const struct tgsi_shader_info *gs_info = &gs_sel->info; + enum pipe_shader_type gs_type = shader->selector->type; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? 
shader->previous_stage_sel : shader->selector; + const struct tgsi_shader_info *es_info = &es_sel->info; + enum pipe_shader_type es_type = es_sel->type; unsigned num_user_sgprs; - unsigned nparams, es_vgpr_comp_cnt; + unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; uint64_t va; unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; + gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; + unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + unsigned input_prim = + gs_type == PIPE_SHADER_GEOMETRY ? + gs_info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] : + PIPE_PRIM_TRIANGLES; /* TODO: Optimize when primtype is known */ + bool break_wave_at_eoi = false; struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); if (!pm4) return; - pm4->atom.emit = es_type == PIPE_SHADER_TESS_EVAL ? gfx10_emit_shader_ngg_tess_nogs - : gfx10_emit_shader_ngg_notess_nogs; + if (es_type == PIPE_SHADER_TESS_EVAL) { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs + : gfx10_emit_shader_ngg_tess_nogs; + } else { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs + : gfx10_emit_shader_ngg_notess_nogs; + } va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); @@ -1041,30 +1090,48 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ es_vgpr_comp_cnt = shader->info.uses_instanceid ? 
3 : 0; - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) { + if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) { num_user_sgprs = SI_SGPR_VS_BLIT_DATA + - info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; } else { - num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); } } else { assert(es_type == PIPE_SHADER_TESS_EVAL); es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; - num_user_sgprs = SI_TES_NUM_USER_SGPR; + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (es_enable_prim_id || gs_info->uses_primid) + break_wave_at_eoi = true; } + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (gs_info->uses_invocationid) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + else if (gs_info->uses_primid) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + else if (input_prim >= PIPE_PRIM_TRIANGLES) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | - S_00B228_GS_VGPR_COMP_CNT(3)); + S_00B228_MEM_ORDERED(1) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | S_00B22C_USER_SGPR(num_user_sgprs) | S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5)); + S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | + S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size)); /* TODO: Use NO_PC_EXPORT when applicable. 
*/ nparams = MAX2(shader->info.nr_param_exports, 1); @@ -1089,25 +1156,35 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id); - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + if (gs_type == PIPE_SHADER_GEOMETRY) { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; + shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; + } else { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + } - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); + if (es_type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, es_sel, pm4); - uint32_t es_verts_per_subgrp = 252; - uint32_t gs_prims_per_subgrp = 255; shader->ctx_reg.ngg.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgrp) | - S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgrp) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_prims_per_subgrp); + S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); shader->ctx_reg.ngg.ge_max_output_per_subgroup = - S_0287FC_MAX_VERTS_PER_SUBGROUP(es_verts_per_subgrp); + S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = - S_028B4C_PRIM_AMP_FACTOR(1) | + S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ + shader->ctx_reg.ngg.vgt_gs_instance_cnt = + S_028B90_CNT(gs_num_invocations) | + S_028B90_ENABLE(gs_num_invocations > 1) | + S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE( + shader->ngg.max_vert_out_per_gs_instance); + shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(gs_prims_per_subgrp) | - S_03096C_VERT_GRP_SIZE(es_verts_per_subgrp); + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + 
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); if (window_space) { shader->ctx_reg.ngg.pa_cl_vte_cntl = @@ -1529,7 +1606,10 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, si_shader_vs(sscreen, shader, NULL); break; case PIPE_SHADER_GEOMETRY: - si_shader_gs(sscreen, shader); + if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_gs(sscreen, shader); break; case PIPE_SHADER_FRAGMENT: si_shader_ps(sscreen, shader); |