diff options
author | Timur Kristóf <[email protected]> | 2020-03-13 12:39:23 +0100 |
---|---|---|
committer | Marge Bot <[email protected]> | 2020-03-30 13:09:08 +0000 |
commit | e7d733fdab58b7fd08aa79ef7713e7be847377f4 (patch) | |
tree | 3a331acf97702241adcb262726caeef2086b7dd8 | |
parent | 17c779ab9e2bb9329f07299e327ac2c1c81f3cb3 (diff) |
aco: Use more optimal sequence at the beginning of merged shaders.
It can be further optimized in the future, but
the new sequence already has a few advantages:
* Uses fewer instructions
* Uses even fewer instructions in wave32 mode
* Doesn't use the VALU at all
Totals from affected shaders (GFX10):
VGPRS: 43504 -> 43496 (-0.02 %)
Code Size: 2436000 -> 2423688 (-0.51 %) bytes
Max Waves: 8704 -> 8705 (0.01 %)
Signed-off-by: Timur Kristóf <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Reviewed-by: Rhys Perry <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4165>
-rw-r--r-- | src/amd/compiler/aco_instruction_selection.cpp | 20 |
1 files changed, 17 insertions, 3 deletions
```diff
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index b6add9ae7f3..979fd357c06 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -9783,9 +9783,23 @@ void select_program(Program *program,
       bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
       if (check_merged_wave_info) {
          Builder bld(ctx.program, ctx.block);
-         Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u)));
-         Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
-         Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
+
+         /* The s_bfm only cares about s0.u[5:0] so we don't need either s_bfe nor s_and here */
+         Temp count = i == 0 ? get_arg(&ctx, args->merged_wave_info)
+                             : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
+                                        get_arg(&ctx, args->merged_wave_info), Operand(i * 8u));
+
+         Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
+         Temp cond;
+
+         if (ctx.program->wave_size == 64) {
+            /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
+            Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
+            cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
+         } else {
+            /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
+            cond = emit_extract_vector(&ctx, mask, 0, bld.lm);
+         }
          begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
       }
```