aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/amd/compiler/aco_ir.h2
-rw-r--r--src/amd/compiler/aco_lower_to_hw_instr.cpp92
-rw-r--r--src/amd/compiler/aco_reduce_assign.cpp19
3 files changed, 95 insertions, 18 deletions
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 739ef869e6a..90fc3c6fe36 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -841,7 +841,7 @@ enum ReduceOp {
* Operand(2): vector temporary
* Definition(0): result
* Definition(1): scalar temporary
- * Definition(2): scalar identity temporary
+ * Definition(2): scalar identity temporary (not used to store identity on GFX10)
* Definition(3): scc clobber
* Definition(4): vcc clobber
*
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 39585111954..2cd451e48c5 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -85,6 +85,22 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, Ph
}
}
+/**
+ * Emit one non-DPP, two-source vector ALU instruction: dst = op(src0, src1).
+ *
+ * \param ctx          lowering context; the new instruction is appended to
+ *                     ctx->instructions
+ * \param dst          first register of the result, always described as a VGPR
+ * \param src0         first register of source 0; register numbers >= 256 are
+ *                     described as VGPRs, anything below as SGPRs
+ * \param src1         first register of source 1, always described as a VGPR
+ * \param op           opcode to emit
+ * \param format       Format::VOP3 builds a VOP3A_instruction, any other value
+ *                     builds a VOP2_instruction
+ * \param clobber_vcc  when set, vcc is recorded as a second definition
+ * \param size         width of both sources and of the result, in dwords
+ */
+void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
+             aco_opcode op, Format format, bool clobber_vcc, unsigned size)
+{
+   /* Derive the register classes from `size` instead of hard-coding v1/s1, so
+    * that a multi-dword reduction describes every register it reads and writes.
+    * For size == 1 this yields exactly the single-dword classes used before. */
+   const RegClass vgpr_rc = RegClass(RegType::vgpr, size);
+   const RegClass src0_rc = src0.reg >= 256 ? vgpr_rc : RegClass(RegType::sgpr, size);
+   const unsigned num_defs = clobber_vcc ? 2 : 1;
+
+   aco_ptr<Instruction> instr;
+   if (format == Format::VOP3)
+      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, num_defs));
+   else
+      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, num_defs));
+
+   instr->operands[0] = Operand(src0, src0_rc);
+   instr->operands[1] = Operand(src1, vgpr_rc);
+   instr->definitions[0] = Definition(dst, vgpr_rc);
+   if (clobber_vcc)
+      instr->definitions[1] = Definition(vcc, s2);
+   ctx->instructions.emplace_back(std::move(instr));
+}
+
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
{
switch (op) {
@@ -236,12 +252,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
Operand vcndmask_identity[2] = {identity[0], identity[1]};
/* First, copy the source to tmp and set inactive lanes to the identity */
- // note: this clobbers SCC!
bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
for (unsigned i = 0; i < src.size(); i++) {
- /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */
- if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
+ /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
+ * except on GFX10, where v_writelane_b32 can take a literal. */
+ if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
identity[i] = Operand(PhysReg{sitmp+i}, s1);
@@ -283,6 +299,16 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
exec_restored = true;
emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
dst_written = true;
+ } else if (ctx->program->chip_class >= GFX10) {
+ assert(cluster_size == 64);
+ /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
+ emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
} else {
assert(cluster_size == 64);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
@@ -292,11 +318,38 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
break;
case aco_opcode::p_exclusive_scan:
- emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
- dpp_wf_sr1, 0xf, 0xf, true, src.size());
+ if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
+ /* shift rows right */
+ for (unsigned i = 0; i < src.size(); i++) {
+ bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), dpp_row_sr(1), 0xf, 0xf, true);
+ }
+
+ /* fill in the gaps in rows 1 and 3 */
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
+ for (unsigned i = 0; i < src.size(); i++) {
+ Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+ Definition(PhysReg{vtmp+i}, v1),
+ Operand(PhysReg{tmp+i}, v1),
+ Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+ static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+ }
+ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+
+ /* fill in the gap in row 2 */
+ for (unsigned i = 0; i < src.size(); i++) {
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+ }
+ std::swap(tmp, vtmp);
+ } else {
+ emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
+ dpp_wf_sr1, 0xf, 0xf, true, src.size());
+ }
for (unsigned i = 0; i < src.size(); i++) {
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
- assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
+ if (ctx->program->chip_class < GFX10)
+ assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
identity[i], Operand(0u));
}
@@ -312,10 +365,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
- emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
- dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
- emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
- dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+ if (ctx->program->chip_class >= GFX10) {
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
+ for (unsigned i = 0; i < src.size(); i++) {
+ Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+ Definition(PhysReg{vtmp+i}, v1),
+ Operand(PhysReg{tmp+i}, v1),
+ Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+ static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+ }
+ emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
+ } else {
+ emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+ dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
+ emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+ dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+ }
break;
default:
unreachable("Invalid reduction mode");
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 663a43c539a..66a3ec64c04 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
}
/* same as before, except for the vector temporary instead of the reduce temporary */
+ unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64;
+ if (program->chip_class >= GFX10 && cluster_size == 64)
+ need_vtmp = true;
- need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32;
+ need_vtmp |= cluster_size == 32;
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = {program->allocateId(), vtmp.regClass()};
@@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
instr->definitions[1] = bld.def(s2);
/* scalar identity temporary */
- if (instr->opcode == aco_opcode::p_exclusive_scan &&
- (op == imin32 || op == imin64 ||
- op == imax32 || op == imax64 ||
- op == fmin32 || op == fmin64 ||
- op == fmax32 || op == fmax64 ||
- op == fmul64)) {
+ bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+ if (instr->opcode == aco_opcode::p_exclusive_scan) {
+ need_sitmp |=
+ (op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
+ op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
+ op == fmul64);
+ }
+ if (need_sitmp) {
instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size()));
}