aboutsummaryrefslogtreecommitdiffstats
path: root/src/amd/compiler/aco_reduce_assign.cpp
diff options
context:
space:
mode:
author Rhys Perry <[email protected]> 2020-02-03 17:54:07 +0000
committer Marge Bot <[email protected]> 2020-02-06 16:43:03 +0000
commit 20eb1acb6f404ffa4e502e7de8dec8ac83e7a8a8 (patch)
tree 154b81e46696543af8bc5e1214cb945d6b10b329 /src/amd/compiler/aco_reduce_assign.cpp
parent 1c79afd94620925cb9e0903f24f91c3ab9ecfcb4 (diff)
aco: fix gfx10_wave64_bpermute
Since 9254fb4fc72, the pass replaced the SCC clobber with the scalar identity temporary. Just skip most of the temporary setup, since we don't need it for gfx10_wave64_bpermute.

Although shuffles are disabled on GFX10, Detroit: Become Human seems to use them anyway.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-By: Timur Kristóf <[email protected]>
Fixes: 9254fb4fc72ed289ffded28ef067b4582973e90c ('aco: don't use a scalar temporary for reductions on GFX10')
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3683>
Diffstat (limited to 'src/amd/compiler/aco_reduce_assign.cpp')
-rw-r--r-- src/amd/compiler/aco_reduce_assign.cpp 7
1 files changed, 6 insertions, 1 deletions
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 58c64cfb019..96846e926d4 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -114,6 +114,11 @@ void setup_reduce_temp(Program* program)
}
}
+ if (op == gfx10_wave64_bpermute) {
+ instr->operands[1] = Operand(reduceTmp);
+ continue;
+ }
+
/* same as before, except for the vector temporary instead of the reduce temporary */
unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
@@ -121,7 +126,7 @@ void setup_reduce_temp(Program* program)
op == umax64 || op == imin64 || op == imax64 ||
op == imul64;
- if (program->chip_class >= GFX10 && cluster_size == 64 && op != gfx10_wave64_bpermute)
+ if (program->chip_class >= GFX10 && cluster_size == 64)
need_vtmp = true;
if (program->chip_class >= GFX10 && op == iadd64)
need_vtmp = true;