summaryrefslogtreecommitdiffstats
path: root/src/amd/compiler/aco_reduce_assign.cpp
diff options
context:
space:
mode:
authorTimur Kristóf <[email protected]>2019-09-21 18:03:56 +0200
committerRhys Perry <[email protected]>2019-10-28 23:52:50 +0000
commitd59f702e268004fd43a0b781f39671be66728d46 (patch)
treed08c84c0c883262606f60fca05f7a7622bbb9238 /src/amd/compiler/aco_reduce_assign.cpp
parentc2eebfe3eaa75168661e559e6786ce6d1d0ea875 (diff)
aco: Implement subgroup shuffle in GFX10 wave64 mode.
Previously, subgroup shuffle was implemented using the bpermute instruction, which only works within a half-wave, so by itself it is not suitable for implementing subgroup shuffle when the shader is running in wave64 mode. This commit adds a trick using shared VGPRs that makes it possible to implement subgroup shuffle relatively efficiently in this mode as well. Signed-off-by: Timur Kristóf <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]>
Diffstat (limited to 'src/amd/compiler/aco_reduce_assign.cpp')
-rw-r--r--src/amd/compiler/aco_reduce_assign.cpp4
1 files changed, 3 insertions, 1 deletions
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 66a3ec64c04..d9c762a65db 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -118,10 +118,12 @@ void setup_reduce_temp(Program* program)
unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64;
- if (program->chip_class >= GFX10 && cluster_size == 64)
+
+ if (program->chip_class >= GFX10 && cluster_size == 64 && op != gfx10_wave64_bpermute)
need_vtmp = true;
need_vtmp |= cluster_size == 32;
+
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = {program->allocateId(), vtmp.regClass()};