summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Schürmann <[email protected]>2019-11-21 10:23:13 +0100
committerDaniel Schürmann <[email protected]>2019-12-07 11:23:11 +0100
commit746165e540c995deb421acdc9582ecd70f611535 (patch)
treef32b94c4dfa7a8e80bf88db5817743e223b609ad
parent7ae227effd245502b0d9433cf7b637aaf3b694ab (diff)
aco: implement exclusive scan for SI/CI
Reviewed-by: Rhys Perry <[email protected]>
-rw-r--r--src/amd/compiler/aco_lower_to_hw_instr.cpp37
1 files changed, 35 insertions, 2 deletions
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 3065f283549..592c3868e20 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -545,11 +545,44 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
}
std::swap(tmp, vtmp);
- } else {
+ } else if (ctx->program->chip_class >= GFX8) {
emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
+ } else {
+ // TODO: use LDS on CS with a single write and shifted read
+ /* wavefront shift_right by 1 on SI/CI */
+ emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
+ emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
+
+ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+ emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
+
+ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+ emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
+ bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u));
+ bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u));
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));
+
+ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+ for (unsigned i = 0; i < src.size(); i++) {
+ bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1));
+ bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
+ bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
+ identity[i] = Operand(0u); /* prevent further uses of identity */
+ }
+ std::swap(tmp, vtmp);
}
+
for (unsigned i = 0; i < src.size(); i++) {
- if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
+ if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */
if (ctx->program->chip_class < GFX10)
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));