diff options
author | Timur Kristóf <[email protected]> | 2019-09-21 18:03:56 +0200 |
---|---|---|
committer | Rhys Perry <[email protected]> | 2019-10-28 23:52:50 +0000 |
commit | d59f702e268004fd43a0b781f39671be66728d46 (patch) | |
tree | d08c84c0c883262606f60fca05f7a7622bbb9238 /src/amd/compiler/aco_reduce_assign.cpp | |
parent | c2eebfe3eaa75168661e559e6786ce6d1d0ea875 (diff) |
aco: Implement subgroup shuffle in GFX10 wave64 mode.
Previously subgroup shuffle was implemented using the bpermute
instruction, which only works across half-waves, so by itself it's
not suitable for implementing subgroup shuffle when the shader is
running in wave64 mode.
This commit adds a trick using shared VGPRs that makes it possible to
still implement subgroup shuffle relatively efficiently in this mode.
Signed-off-by: Timur Kristóf <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Diffstat (limited to 'src/amd/compiler/aco_reduce_assign.cpp')
-rw-r--r-- | src/amd/compiler/aco_reduce_assign.cpp | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 66a3ec64c04..d9c762a65db 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -118,10 +118,12 @@ void setup_reduce_temp(Program* program) unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size; bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 || op == fmax64; - if (program->chip_class >= GFX10 && cluster_size == 64) + + if (program->chip_class >= GFX10 && cluster_size == 64 && op != gfx10_wave64_bpermute) need_vtmp = true; need_vtmp |= cluster_size == 32; + vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { vtmp = {program->allocateId(), vtmp.regClass()}; |