summaryrefslogtreecommitdiffstats
path: root/src/amd/compiler/aco_instruction_selection.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/amd/compiler/aco_instruction_selection.cpp')
-rw-r--r--src/amd/compiler/aco_instruction_selection.cpp36
1 files changed, 30 insertions, 6 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index fc1838724e1..768860a2c9b 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -146,6 +146,33 @@ Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_ne
return dst;
}
+static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
+{
+ Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
+
+ /* Currently not implemented on GFX6-7 */
+ assert(ctx->options->chip_class >= GFX8);
+
+ if (ctx->options->chip_class <= GFX9 || ctx->options->wave_size == 32) {
+ return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
+ }
+
+ /* GFX10, wave64 mode:
+ * The bpermute instruction is limited to half-wave operation, which means that it can't
+ * properly support subgroup shuffle like older generations (or wave32 mode), so we
+ * emulate it here.
+ */
+
+ Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u));
+ lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id);
+ Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
+ Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
+ Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
+
+ return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
+ bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
+}
+
Temp as_vgpr(isel_context *ctx, Temp val)
{
if (val.type() == RegType::sgpr) {
@@ -5528,15 +5555,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
assert(tid.regClass() == v1);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (src.regClass() == v1) {
- tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
- emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
+ emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
} else if (src.regClass() == v2) {
- tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
-
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
- lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
- hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
+ lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
+ hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {