aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimur Kristóf <[email protected]>2020-03-26 17:45:55 +0100
committerMarge Bot <[email protected]>2020-03-30 13:09:08 +0000
commit798dd98d6e530afc5dab2f973785fbbd4e598dee (patch)
tree24193bbb1f745dfd4d91942a1283d61bcb897419
parent0a91c086b8649a65befa3fdf3ef8460761bb87aa (diff)
aco: When LS and HS invocations are the same, pass LS outputs in temps.
We know that in this case, the LS and HS invocations are working on the exact same vertex, so it's safe to skip the LDS. Totals: VGPRS: 3960744 -> 3961844 (0.03 %) Code Size: 254824300 -> 254764624 (-0.02 %) bytes Max Waves: 1053748 -> 1053574 (-0.02 %) Totals from affected shaders: VGPRS: 26152 -> 27252 (4.21 %) Code Size: 1496600 -> 1436924 (-3.99 %) bytes Max Waves: 4860 -> 4686 (-3.58 %) Signed-off-by: Timur Kristóf <[email protected]> Reviewed-by: Rhys Perry <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4165>
-rw-r--r--src/amd/compiler/aco_instruction_selection.cpp35
1 files changed, 35 insertions, 0 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index b8816f51cde..716853d23ce 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3329,6 +3329,34 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
return true;
}
+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+ /* Only TCS per-vertex inputs are supported by this function.
+ * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same.
+ */
+ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+ return false;
+
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
+ bool can_use_temps = nir_src_is_const(*off_src) &&
+ vertex_index_instr->type == nir_instr_type_intrinsic &&
+ nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+ if (!can_use_temps)
+ return false;
+
+ unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
+ Temp *src = &ctx->inputs.temps[idx];
+ Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u);
+ assert(vec.size() == dst.size());
+
+ Builder bld(ctx->program, ctx->block);
+ bld.copy(Definition(dst), vec);
+ return true;
+}
+
void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
@@ -3338,6 +3366,9 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
unsigned write_mask = nir_intrinsic_write_mask(instr);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
+ if (ctx->tcs_in_out_eq)
+ store_output_to_temps(ctx, instr);
+
if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
@@ -3974,6 +4005,10 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+ if (load_input_from_temps(ctx, instr, dst))
+ return;
+
std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
unsigned lds_align = calculate_lds_alignment(ctx, offs.second);