summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/amd/compiler/aco_instruction_selection.cpp8
-rw-r--r--src/amd/compiler/aco_instruction_selection_setup.cpp3
-rw-r--r--src/amd/compiler/aco_lower_to_hw_instr.cpp64
3 files changed, 45 insertions, 30 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 84c88e4eaa5..0f89cb1aee5 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5232,15 +5232,15 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
//subgroupClusteredOr(val, 4) -> wqm(val & exec)
return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
- } else if (op == nir_op_iand && cluster_size == 64) {
+ } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
//subgroupAnd(val) -> (exec & ~val) == 0
Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
- } else if (op == nir_op_ior && cluster_size == 64) {
+ } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
//subgroupOr(val) -> (val & exec) != 0
Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
return bool_to_vector_condition(ctx, tmp);
- } else if (op == nir_op_ixor && cluster_size == 64) {
+ } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
//subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
@@ -5839,7 +5839,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
nir_intrinsic_cluster_size(instr) : 0;
- cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
+ cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
emit_uniform_subgroup(ctx, instr, src);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 47f5778822f..469aebbb8d9 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -390,8 +390,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
if (intrinsic->dest.ssa.bit_size == 1) {
size = lane_mask_size;
type = RegType::sgpr;
- } else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
- !ctx->divergent_vals[intrinsic->dest.ssa.index]) {
+ } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
type = RegType::sgpr;
} else {
type = RegType::vgpr;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index e9c2d66d823..19e0f598074 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -412,7 +412,8 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
- assert(cluster_size == 64 || op == aco_opcode::p_reduce);
+ assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
+ assert(cluster_size <= ctx->program->wave_size);
Builder bld(ctx->program, &ctx->instructions);
@@ -462,23 +463,34 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
dpp_row_mirror, 0xf, 0xf, false);
if (cluster_size == 16) break;
- if (cluster_size == 32) {
+
+ if (ctx->program->chip_class >= GFX10) {
+ /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
+
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
+
+ if (cluster_size == 32 && dst.regClass().type() == RegType::vgpr) {
+ bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
+ exec_restored = true;
+ emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
+ dst_written = true;
+ } else {
+ emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
+ }
+
+ if (cluster_size == 64) {
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+ }
+ } else if (cluster_size == 32) {
for (unsigned i = 0; i < src.size(); i++)
bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10));
bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
exec_restored = true;
emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size());
dst_written = true;
- } else if (ctx->program->chip_class >= GFX10) {
- assert(cluster_size == 64);
- /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
- for (unsigned i = 0; i < src.size(); i++)
- bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
- emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
-
- for (unsigned i = 0; i < src.size(); i++)
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
- emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
} else {
assert(cluster_size == 64);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
@@ -504,10 +516,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
- /* fill in the gap in row 2 */
- for (unsigned i = 0; i < src.size(); i++) {
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
- bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+ if (ctx->program->wave_size == 64) {
+ /* fill in the gap in row 2 */
+ for (unsigned i = 0; i < src.size(); i++) {
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+ }
}
std::swap(tmp, vtmp);
} else {
@@ -523,7 +537,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
/* fall through */
case aco_opcode::p_inclusive_scan:
- assert(cluster_size == 64);
+ assert(cluster_size == ctx->program->wave_size);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
dpp_row_sr(1), 0xf, 0xf, false, identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
@@ -544,11 +558,13 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
- bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
- bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
- for (unsigned i = 0; i < src.size(); i++)
- bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
- emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+ if (ctx->program->wave_size == 64) {
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
+ for (unsigned i = 0; i < src.size(); i++)
+ bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+ emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+ }
} else {
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
dpp_row_bcast15, 0xa, 0xf, false, identity);
@@ -563,10 +579,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (!exec_restored)
bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
- if (op == aco_opcode::p_reduce && cluster_size == 64) {
+ if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
for (unsigned k = 0; k < src.size(); k++) {
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
- Operand(PhysReg{tmp + k}, v1), Operand(63u));
+ Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
}
} else if (!(dst.physReg() == tmp) && !dst_written) {
for (unsigned k = 0; k < src.size(); k++) {