aboutsummaryrefslogtreecommitdiffstats
path: root/src/amd/compiler/aco_reduce_assign.cpp
diff options
context:
space:
mode:
author: Rhys Perry <[email protected]> 2019-09-12 19:28:52 +0100
committer: Rhys Perry <[email protected]> 2019-10-28 23:52:50 +0000
commit: 3865448012b16d0e98e706e1b462242a754436c7 (patch)
tree: 8be960a9fe9931500e1b8f061bf2cf98a6b3abd5 /src/amd/compiler/aco_reduce_assign.cpp
parent: cd04b63c0007e5fc7a9a1deb2de58091942cb2fc (diff)
aco: Fix reductions on GFX10.
Fixes p_reduce (all cluster sizes), p_inclusive_scan and p_exclusive_scan
with all reduction operations.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Diffstat (limited to 'src/amd/compiler/aco_reduce_assign.cpp')
-rw-r--r-- src/amd/compiler/aco_reduce_assign.cpp | 19
1 file changed, 12 insertions, 7 deletions
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 663a43c539a..66a3ec64c04 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
}
/* same as before, except for the vector temporary instead of the reduce temporary */
+ unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64;
+ if (program->chip_class >= GFX10 && cluster_size == 64)
+ need_vtmp = true;
- need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32;
+ need_vtmp |= cluster_size == 32;
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = {program->allocateId(), vtmp.regClass()};
@@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
instr->definitions[1] = bld.def(s2);
/* scalar identity temporary */
- if (instr->opcode == aco_opcode::p_exclusive_scan &&
- (op == imin32 || op == imin64 ||
- op == imax32 || op == imax64 ||
- op == fmin32 || op == fmin64 ||
- op == fmax32 || op == fmax64 ||
- op == fmul64)) {
+ bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+ if (instr->opcode == aco_opcode::p_exclusive_scan) {
+ need_sitmp |=
+ (op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
+ op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
+ op == fmul64);
+ }
+ if (need_sitmp) {
instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size()));
}