summaryrefslogtreecommitdiffstats
path: root/src/amd
diff options
context:
space:
mode:
Diffstat (limited to 'src/amd')
-rw-r--r--src/amd/vulkan/radv_nir_to_llvm.c224
-rw-r--r--src/amd/vulkan/radv_pipeline.c374
-rw-r--r--src/amd/vulkan/radv_private.h2
-rw-r--r--src/amd/vulkan/radv_shader.c19
-rw-r--r--src/amd/vulkan/radv_shader.h2
-rw-r--r--src/amd/vulkan/si_cmd_buffer.c11
6 files changed, 610 insertions, 22 deletions
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 7c0275be7cf..51414be2304 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -70,6 +70,13 @@ struct radv_shader_context {
LLVMValueRef tes_u;
LLVMValueRef tes_v;
+ /* HW GS */
+ /* On gfx10:
+ * - bits 0..10: ordered_wave_id
+ * - bits 12..20: number of vertices in group
+ * - bits 22..30: number of primitives in group
+ */
+ LLVMValueRef gs_tg_info;
LLVMValueRef gs2vs_offset;
LLVMValueRef gs_wave_id;
LLVMValueRef gs_vtx_offset[6];
@@ -823,11 +830,18 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
if (ctx->options->key.vs.out.as_ls) {
add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+ add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
} else {
- add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
- add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+ if (ctx->ac.chip_class >= GFX10) {
+ add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+ add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+ add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+ } else {
+ add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+ add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+ add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+ }
}
- add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
}
}
@@ -969,6 +983,12 @@ static void set_llvm_calling_convention(LLVMValueRef func,
LLVMSetFunctionCallConv(func, calling_conv);
}
+/* Returns whether the stage is a stage that can be directly before the GS */
+static bool is_pre_gs_stage(gl_shader_stage stage)
+{
+ return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
+}
+
static void create_function(struct radv_shader_context *ctx,
gl_shader_stage stage,
bool has_previous_stage,
@@ -987,6 +1007,15 @@ static void create_function(struct radv_shader_context *ctx,
&ctx->ring_offsets);
}
+ if (ctx->ac.chip_class >= GFX10) {
+ if (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.out.as_ngg) {
+ /* On GFX10, VS is merged into GS for NGG. */
+ stage = MESA_SHADER_GEOMETRY;
+ has_previous_stage = true;
+ previous_stage = MESA_SHADER_VERTEX;
+ }
+ }
+
switch (stage) {
case MESA_SHADER_COMPUTE:
declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
@@ -1101,8 +1130,14 @@ static void create_function(struct radv_shader_context *ctx,
case MESA_SHADER_GEOMETRY:
if (has_previous_stage) {
// First 6 system regs
- add_arg(&args, ARG_SGPR, ctx->ac.i32,
- &ctx->gs2vs_offset);
+ if (ctx->options->key.vs.out.as_ngg) {
+ add_arg(&args, ARG_SGPR, ctx->ac.i32,
+ &ctx->gs_tg_info);
+ } else {
+ add_arg(&args, ARG_SGPR, ctx->ac.i32,
+ &ctx->gs2vs_offset);
+ }
+
add_arg(&args, ARG_SGPR, ctx->ac.i32,
&ctx->merged_wave_info);
add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
@@ -3194,6 +3229,168 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
}
}
+static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
+{
+ return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
+{
+ return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+ LLVMConstInt(ctx->ac.i32, 12, false),
+ LLVMConstInt(ctx->ac.i32, 9, false),
+ false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
+{
+ return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+ LLVMConstInt(ctx->ac.i32, 22, false),
+ LLVMConstInt(ctx->ac.i32, 9, false),
+ false);
+}
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ * - bits 0..10: vertices in group
+ * - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
+ LLVMValueRef vtx_cnt,
+ LLVMValueRef prim_cnt)
+{
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+ tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
+ tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+ ac_build_endif(&ctx->ac, 5020);
+}
+
+struct ngg_prim {
+ unsigned num_vertices;
+ LLVMValueRef isnull;
+ LLVMValueRef index[3];
+ LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct radv_shader_context *ctx,
+ const struct ngg_prim *prim)
+{
+ LLVMBuilderRef builder = ctx->ac.builder;
+ struct ac_export_args args;
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+ args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+ for (unsigned i = 0; i < prim->num_vertices; ++i) {
+ tmp = LLVMBuildShl(builder, prim->index[i],
+ LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+ args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+ tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+ tmp = LLVMBuildShl(builder, tmp,
+ LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+ args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+ }
+
+ args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+ args.out[1] = LLVMGetUndef(ctx->ac.f32);
+ args.out[2] = LLVMGetUndef(ctx->ac.f32);
+ args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+ args.target = V_008DFC_SQ_EXP_PRIM;
+ args.enabled_channels = 1;
+ args.done = true;
+ args.valid_mask = false;
+ args.compr = false;
+
+ ac_build_export(&ctx->ac, &args);
+}
+
+static void
+handle_ngg_outputs_post(struct radv_shader_context *ctx)
+{
+ LLVMBuilderRef builder = ctx->ac.builder;
+ struct ac_build_if_state if_state;
+ unsigned num_vertices = 3;
+ LLVMValueRef tmp;
+
+ assert(ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader);
+
+ LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+ LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
+ LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+ LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+ LLVMValueRef vtxindex[] = {
+ ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
+ ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
+ ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
+ };
+
+ /* TODO: streamout */
+
+ /* TODO: VS primitive ID */
+ if (ctx->options->key.vs.out.export_prim_id)
+ assert(0);
+
+ /* TODO: primitive culling */
+
+ build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+ /* TODO: streamout queries */
+ /* Export primitive data to the index buffer. Format is:
+ * - bits 0..8: index 0
+ * - bit 9: edge flag 0
+ * - bits 10..18: index 1
+ * - bit 19: edge flag 1
+ * - bits 20..28: index 2
+ * - bit 29: edge flag 2
+ * - bit 31: null primitive (skip)
+ *
+ * For the first version, we will always build up all three indices
+ * independent of the primitive type. The additional garbage data
+ * shouldn't hurt.
+ *
+ * TODO: culling depends on the primitive type, so can have some
+ * interaction here.
+ */
+ ac_nir_build_if(&if_state, ctx, is_gs_thread);
+ {
+ struct ngg_prim prim = {};
+
+ prim.num_vertices = num_vertices;
+ prim.isnull = ctx->ac.i1false;
+ memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+ for (unsigned i = 0; i < num_vertices; ++i) {
+ tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+ LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+ prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ }
+
+ build_export_prim(ctx, &prim);
+ }
+ ac_nir_build_endif(&if_state);
+
+ /* Export per-vertex data (positions and parameters). */
+ ac_nir_build_if(&if_state, ctx, is_es_thread);
+ {
+ handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
+ ctx->options->key.vs.out.export_layer_id,
+ ctx->options->key.vs.out.export_clip_dists,
+ &ctx->shader_info->vs.outinfo);
+ }
+ ac_nir_build_endif(&if_state);
+}
+
static void
write_tess_factors(struct radv_shader_context *ctx)
{
@@ -3452,6 +3649,8 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
handle_ls_outputs_post(ctx);
else if (ctx->options->key.vs.out.as_es)
handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
+ else if (ctx->options->key.vs.out.as_ngg)
+ handle_ngg_outputs_post(ctx);
else
handle_vs_outputs_post(ctx, ctx->options->key.vs.out.export_prim_id,
ctx->options->key.vs.out.export_layer_id,
@@ -3703,6 +3902,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
shaders[i]));
}
+ if (ctx.ac.chip_class >= GFX10) {
+ if (shaders[0]->info.stage == MESA_SHADER_VERTEX &&
+ options->key.vs.out.as_ngg) {
+ ctx.max_workgroup_size = 128;
+ }
+ }
+
create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);
@@ -3722,7 +3928,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
*/
ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
- if (shader_count >= 2)
+ bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs.out.as_ngg;
+ if (shader_count >= 2 || is_ngg)
ac_init_exec_full_mask(&ctx.ac);
if ((ctx.ac.family == CHIP_VEGA10 ||
@@ -3788,7 +3995,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
ac_setup_rings(&ctx);
LLVMBasicBlockRef merge_block;
- if (shader_count >= 2) {
+ if (shader_count >= 2 || is_ngg) {
LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -3811,7 +4018,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);
- if (shader_count >= 2) {
+ if (shader_count >= 2 || is_ngg) {
LLVMBuildBr(ctx.ac.builder, merge_block);
LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
}
@@ -3955,6 +4162,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
shader_info->vs.as_es = options->key.vs.out.as_es;
shader_info->vs.as_ls = options->key.vs.out.as_ls;
shader_info->vs.export_prim_id = options->key.vs.out.export_prim_id;
+ shader_info->is_ngg = options->key.vs.out.as_ngg;
break;
default:
break;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index ae08d57677f..cc5f339f34f 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -96,6 +96,30 @@ struct radv_gs_state {
uint32_t lds_size;
};
+struct radv_ngg_state {
+ uint16_t ngg_emit_size; /* in dwords */
+ uint32_t hw_max_esverts;
+ uint32_t max_gsprims;
+ uint32_t max_out_verts;
+ uint32_t prim_amp_factor;
+ uint32_t vgt_esgs_ring_itemsize;
+ bool max_vert_out_per_gs_instance;
+};
+
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
+{
+ struct radv_shader_variant *variant = NULL;
+ if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+ variant = pipeline->shaders[MESA_SHADER_GEOMETRY];
+ else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+ variant = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+ else if (pipeline->shaders[MESA_SHADER_VERTEX])
+ variant = pipeline->shaders[MESA_SHADER_VERTEX];
+ else
+ return false;
+ return variant->info.is_ngg;
+}
+
static void
radv_pipeline_destroy(struct radv_device *device,
struct radv_pipeline *pipeline,
@@ -1583,6 +1607,203 @@ calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
return gs;
}
+static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
+ unsigned min_verts_per_prim, bool use_adjacency)
+{
+ unsigned max_reuse = max_esverts - min_verts_per_prim;
+ if (use_adjacency)
+ max_reuse /= 2;
+ *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
+
+static struct radv_ngg_state
+calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ struct radv_pipeline *pipeline)
+{
+ struct radv_ngg_state ngg = {0};
+ struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+ struct radv_es_output_info *es_info =
+ radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info;
+ unsigned gs_type = MESA_SHADER_VERTEX;
+ unsigned max_verts_per_prim = 3; // triangles
+ unsigned min_verts_per_prim =
+ gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+ unsigned gs_num_invocations = 1;//MAX2(gs_info->gs.invocations, 1);
+ bool uses_adjacency;
+ switch(pCreateInfo->pInputAssemblyState->topology) {
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+ uses_adjacency = true;
+ break;
+ default:
+ uses_adjacency = false;
+ break;
+ }
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space.
+ *
+ * Streamout can increase the ESGS buffer size later on, so be more
+ * conservative with streamout and use 4K dwords. This may be suboptimal.
+ *
+ * Otherwise, use the limit of 7K dwords. The reason is that we need
+ * to leave some headroom for the max_esverts increase at the end.
+ *
+ * TODO: We should really take the shader's internal LDS use into
+ * account. The linker will fail if the size is greater than
+ * 8K dwords.
+ */
+ const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128;
+ const unsigned target_lds_size = max_lds_size;
+ unsigned esvert_lds_size = 0;
+ unsigned gsprim_lds_size = 0;
+
+ /* All these are per subgroup: */
+ bool max_vert_out_per_gs_instance = false;
+ unsigned max_esverts_base = 256;
+ unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+ /* Hardware has the following non-natural restrictions on the value
+ * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
+ * the draw:
+ * - at most 252 for any line input primitive type
+ * - at most 251 for any quad input primitive type
+ * - at most 251 for triangle strips with adjacency (this happens to
+ * be the natural limit for triangle *lists* with adjacency)
+ */
+ max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+ if (gs_type == MESA_SHADER_GEOMETRY) {
+ unsigned max_out_verts_per_gsprim =
+ gs_info->gs.vertices_out * gs_num_invocations;
+
+ if (max_out_verts_per_gsprim <= 256) {
+ if (max_out_verts_per_gsprim) {
+ max_gsprims_base = MIN2(max_gsprims_base,
+ 256 / max_out_verts_per_gsprim);
+ }
+ } else {
+ /* Use special multi-cycling mode in which each GS
+ * instance gets its own subgroup. Does not work with
+ * tessellation. */
+ max_vert_out_per_gs_instance = true;
+ max_gsprims_base = 1;
+ max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+ }
+
+ esvert_lds_size = es_info->esgs_itemsize / 4;
+ gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+ } else {
+ /* TODO: This needs to be adjusted once LDS use for compaction
+ * after culling is implemented. */
+ /*
+ if (es_info->info.so.num_outputs)
+ esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
+ */
+ }
+
+ unsigned max_gsprims = max_gsprims_base;
+ unsigned max_esverts = max_esverts_base;
+
+ if (esvert_lds_size)
+ max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+ if (gsprim_lds_size)
+ max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+ if (esvert_lds_size || gsprim_lds_size) {
+ /* Now that we have a rough proportionality between esverts
+ * and gsprims based on the primitive type, scale both of them
+ * down simultaneously based on required LDS space.
+ *
+ * We could be smarter about this if we knew how much vertex
+ * reuse to expect.
+ */
+ unsigned lds_total = max_esverts * esvert_lds_size +
+ max_gsprims * gsprim_lds_size;
+ if (lds_total > target_lds_size) {
+ max_esverts = max_esverts * target_lds_size / lds_total;
+ max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+ min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ }
+ }
+
+ /* Round up towards full wave sizes for better ALU utilization. */
+ if (!max_vert_out_per_gs_instance) {
+ const unsigned wavesize = 64;
+ unsigned orig_max_esverts;
+ unsigned orig_max_gsprims;
+ do {
+ orig_max_esverts = max_esverts;
+ orig_max_gsprims = max_gsprims;
+
+ max_esverts = align(max_esverts, wavesize);
+ max_esverts = MIN2(max_esverts, max_esverts_base);
+ if (esvert_lds_size)
+ max_esverts = MIN2(max_esverts,
+ (max_lds_size - max_gsprims * gsprim_lds_size) /
+ esvert_lds_size);
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+ max_gsprims = align(max_gsprims, wavesize);
+ max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+ if (gsprim_lds_size)
+ max_gsprims = MIN2(max_gsprims,
+ (max_lds_size - max_esverts * esvert_lds_size) /
+ gsprim_lds_size);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
+ min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+ }
+
+ /* Hardware restriction: minimum value of max_esverts */
+ max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+ unsigned max_out_vertices =
+ max_vert_out_per_gs_instance ? gs_info->gs.vertices_out :
+ gs_type == MESA_SHADER_GEOMETRY ?
+ max_gsprims * gs_num_invocations * gs_info->gs.vertices_out :
+ max_esverts;
+ assert(max_out_vertices <= 256);
+
+ unsigned prim_amp_factor = 1;
+ if (gs_type == MESA_SHADER_GEOMETRY) {
+ /* Number of output primitives per GS input primitive after
+ * GS instancing. */
+ prim_amp_factor = gs_info->gs.vertices_out;
+ }
+
+ /* The GE only checks against the maximum number of ES verts after
+ * allocating a full GS primitive. So we need to ensure that whenever
+ * this check passes, there is enough space for a full primitive without
+ * vertex reuse.
+ */
+ ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+ ngg.max_gsprims = max_gsprims;
+ ngg.max_out_verts = max_out_vertices;
+ ngg.prim_amp_factor = prim_amp_factor;
+ ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+ ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+ ngg.vgt_esgs_ring_itemsize = 1;
+
+ pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+
+ assert(ngg.hw_max_esverts >= 24); /* HW limitation */
+
+ return ngg;
+}
+
static void
calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs)
{
@@ -2000,7 +2221,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
}
static void
-radv_fill_shader_keys(struct radv_shader_variant_key *keys,
+radv_fill_shader_keys(struct radv_device *device,
+ struct radv_shader_variant_key *keys,
const struct radv_pipeline_key *key,
nir_shader **nir)
{
@@ -2031,6 +2253,10 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
keys[MESA_SHADER_VERTEX].vs.out.as_es = true;
}
+ if (device->physical_device->rad_info.chip_class >= GFX10) {
+ keys[MESA_SHADER_VERTEX].vs.out.as_ngg = true;
+ }
+
for(int i = 0; i < MESA_SHADER_STAGES; ++i)
keys[i].has_multiview_view_index = key->has_multiview_view_index;
@@ -2221,7 +2447,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
nir_print_shader(nir[i], stderr);
}
- radv_fill_shader_keys(keys, key, nir);
+ radv_fill_shader_keys(device, keys, key, nir);
if (nir[MESA_SHADER_FRAGMENT]) {
if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
@@ -2356,6 +2582,8 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
{
bool has_gs = radv_pipeline_has_gs(pipeline);
bool has_tess = radv_pipeline_has_tess(pipeline);
+ bool has_ngg = radv_pipeline_has_ngg(pipeline);
+
switch (stage) {
case MESA_SHADER_FRAGMENT:
return R_00B030_SPI_SHADER_USER_DATA_PS_0;
@@ -2379,6 +2607,9 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
}
}
+ if (has_ngg)
+ return R_00B230_SPI_SHADER_USER_DATA_GS_0;
+
return R_00B130_SPI_SHADER_USER_DATA_VS_0;
case MESA_SHADER_GEOMETRY:
return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
@@ -2968,8 +3199,7 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
struct radv_pipeline *pipeline)
{
const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
-
- uint32_t vgt_primitiveid_en = false;
+ unsigned vgt_primitiveid_en = 0;
uint32_t vgt_gs_mode = 0;
if (radv_pipeline_has_gs(pipeline)) {
@@ -2978,9 +3208,17 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
pipeline->device->physical_device->rad_info.chip_class);
+ } else if (radv_pipeline_has_ngg(pipeline)) {
+ const struct radv_shader_variant *vs =
+ pipeline->shaders[MESA_SHADER_VERTEX];
+ bool enable_prim_id =
+ outinfo->export_prim_id || vs->info.info.uses_prim_id;
+
+ vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(enable_prim_id) |
+ S_028A84_NGG_DISABLE_PROVOK_REUSE(enable_prim_id);
} else if (outinfo->export_prim_id) {
vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
- vgt_primitiveid_en = true;
+ vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
}
radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
@@ -3085,6 +3323,105 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
}
static void
+radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs,
+ struct radeon_cmdbuf *cs,
+ struct radv_pipeline *pipeline,
+ struct radv_shader_variant *shader,
+ const struct radv_ngg_state *ngg_state)
+{
+ uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
+
+ radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
+ radeon_emit(cs, va >> 8);
+ radeon_emit(cs, va >> 40);
+ radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+ radeon_emit(cs, shader->config.rsrc1);
+ radeon_emit(cs, shader->config.rsrc2);
+
+ const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
+ unsigned clip_dist_mask, cull_dist_mask, total_mask;
+ clip_dist_mask = outinfo->clip_dist_mask;
+ cull_dist_mask = outinfo->cull_dist_mask;
+ total_mask = clip_dist_mask | cull_dist_mask;
+ bool misc_vec_ena = outinfo->writes_pointsize ||
+ outinfo->writes_layer ||
+ outinfo->writes_viewport_index;
+ bool break_wave_at_eoi = false;
+
+ radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
+ S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
+ radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
+ S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
+ radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
+ S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+ S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
+ V_02870C_SPI_SHADER_4COMP :
+ V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ?
+ V_02870C_SPI_SHADER_4COMP :
+ V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ?
+ V_02870C_SPI_SHADER_4COMP :
+ V_02870C_SPI_SHADER_NONE));
+
+ radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
+ S_028818_VTX_W0_FMT(1) |
+ S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+ S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+ S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+ radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
+ S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
+ S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
+ S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
+ S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+ S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
+ S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
+ S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
+ cull_dist_mask << 8 |
+ clip_dist_mask);
+
+ /* TODO: Correctly set REUSE_OFF */
+ radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
+ S_028AB4_REUSE_OFF(0));
+ radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ ngg_state->vgt_esgs_ring_itemsize);
+
+ /* NGG specific registers. */
+ struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
+ uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
+
+ radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+ S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
+ radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+ S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
+ radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
+ S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
+ S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
+ radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
+ S_028B90_CNT(gs_num_invocations) |
+ S_028B90_ENABLE(gs_num_invocations > 1) |
+ S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
+
+ /* User edge flags are set by the pos exports. If user edge flags are
+ * not used, we must use hw-generated edge flags and pass them via
+ * the prim export to prevent drawing lines on internal edges of
+ * decomposed primitives (such as quads) with polygon mode = lines.
+ *
+ * TODO: We should combine hw-generated edge flags with user edge
+ * flags in the shader.
+ */
+ radeon_set_context_reg(ctx_cs, R_028838_PA_CL_NGG_CNTL,
+ S_028838_INDEX_BUF_EDGE_FLAG_ENA(1));
+
+ radeon_set_context_reg(ctx_cs, R_03096C_GE_CNTL,
+ S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) |
+ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
+}
+
+static void
radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
struct radv_pipeline *pipeline,
struct radv_shader_variant *shader,
@@ -3127,7 +3464,8 @@ static void
radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
struct radeon_cmdbuf *cs,
struct radv_pipeline *pipeline,
- const struct radv_tessellation_state *tess)
+ const struct radv_tessellation_state *tess,
+ const struct radv_ngg_state *ngg)
{
struct radv_shader_variant *vs;
@@ -3140,6 +3478,8 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
radv_pipeline_generate_hw_ls(cs, pipeline, vs, tess);
else if (vs->info.vs.as_es)
radv_pipeline_generate_hw_es(cs, pipeline, vs);
+ else if (vs->info.is_ngg)
+ radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg);
else
radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
}
@@ -3468,13 +3808,20 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
S_028B54_GS_EN(1) |
S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+ else if (radv_pipeline_has_ngg(pipeline))
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+ S_028B54_PRIMGEN_EN(1);
else
stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
- } else if (radv_pipeline_has_gs(pipeline))
+ } else if (radv_pipeline_has_gs(pipeline)) {
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
S_028B54_GS_EN(1) |
S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+ } else if (radv_pipeline_has_ngg(pipeline)) {
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+ S_028B54_PRIMGEN_EN(1);
+ }
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
@@ -3555,6 +3902,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
const struct radv_blend_state *blend,
const struct radv_tessellation_state *tess,
const struct radv_gs_state *gs,
+ const struct radv_ngg_state *ngg,
unsigned prim, unsigned gs_out)
{
struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
@@ -3570,7 +3918,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
- radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
+ radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg);
radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
@@ -3578,7 +3926,7 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
- if (pipeline->device->physical_device->rad_info.chip_class >= GFX10)
+ if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs);
radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
@@ -3848,8 +4196,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
}
}
+ struct radv_ngg_state ngg = {0};
struct radv_gs_state gs = {0};
- if (radv_pipeline_has_gs(pipeline)) {
+
+ if (radv_pipeline_has_ngg(pipeline)) {
+ ngg = calculate_ngg_info(pCreateInfo, pipeline);
+ } else if (radv_pipeline_has_gs(pipeline)) {
gs = calculate_gs_info(pCreateInfo, pipeline);
calculate_gs_ring_sizes(pipeline, &gs);
}
@@ -3885,7 +4237,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
result = radv_pipeline_scratch_init(device, pipeline);
- radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out);
+ radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out);
return result;
}
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 3fa71905adf..fd1f8972adc 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1510,6 +1510,8 @@ static inline bool radv_pipeline_has_tess(const struct radv_pipeline *pipeline)
return pipeline->shaders[MESA_SHADER_TESS_CTRL] ? true : false;
}
+bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline);
+
struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
gl_shader_stage stage,
int idx);
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 315d522b63e..d36dfbdf332 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -583,7 +583,9 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
break;
case MESA_SHADER_VERTEX:
- if (info->vs.as_ls) {
+ if (info->is_ngg) {
+ config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+ } else if (info->vs.as_ls) {
assert(pdevice->rad_info.chip_class <= GFX8);
/* We need at least 2 components for LS.
* VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
@@ -632,8 +634,19 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
break;
}
- if (pdevice->rad_info.chip_class >= GFX9 &&
- stage == MESA_SHADER_GEOMETRY) {
+ if (pdevice->rad_info.chip_class >= GFX10 &&
+ stage == MESA_SHADER_VERTEX) {
+ unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
+
+ /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
+ es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0;
+ gs_vgpr_comp_cnt = 3;
+
+ config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+ config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+ S_00B22C_LDS_SIZE(config_in->lds_size);
+ } else if (pdevice->rad_info.chip_class >= GFX9 &&
+ stage == MESA_SHADER_GEOMETRY) {
unsigned es_type = info->gs.es_type;
unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 1ee7fea5890..acd417cdb57 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -65,6 +65,7 @@ enum {
struct radv_vs_out_key {
uint32_t as_es:1;
uint32_t as_ls:1;
+ uint32_t as_ngg:1;
uint32_t export_prim_id:1;
uint32_t export_layer_id:1;
uint32_t export_clip_dists:1;
@@ -264,6 +265,7 @@ struct radv_shader_variant_info {
unsigned num_input_vgprs;
unsigned private_mem_vgprs;
bool need_indirect_descriptor_sets;
+ bool is_ngg;
struct {
struct {
struct radv_vs_output_info outinfo;
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index b3d12df4575..84e9663963b 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -317,6 +317,17 @@ si_emit_graphics(struct radv_physical_device *physical_device,
}
if (physical_device->rad_info.chip_class >= GFX10) {
+ /* Break up a pixel wave if it contains deallocs for more than
+ * half the parameter cache.
+ *
+ * To avoid a deadlock where pixel waves aren't launched
+ * because they're waiting for more pixels while the frontend
+ * is stuck waiting for PC space, the maximum allowed value is
+ * the size of the PC minus the largest possible allocation for
+ * a single primitive shader subgroup.
+ */
+ radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
+ S_028C50_MAX_DEALLOCS_IN_WAVE(512));
radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
physical_device->rad_info.pa_sc_tile_steering_override);