diff options
author | Rhys Perry <[email protected]> | 2019-11-12 15:00:48 +0000 |
---|---|---|
committer | Rhys Perry <[email protected]> | 2019-11-12 17:21:38 +0000 |
commit | 3204e837682dc130e33d4339f5a1d81899f09bf1 (patch) | |
tree | f4c361f2702621fd31e82efdfd4504644eed31b2 | |
parent | 06347989a0e9490955bfe59c5e0011d59387c088 (diff) |
aco: use DPP instead of exec modification when lowering GFX10 shuffles
Seems we can use DPP's row_mask field to get an effect similar to
modifying exec.
Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
-rw-r--r-- | src/amd/compiler/aco_lower_to_hw_instr.cpp | 22 |
1 files changed, 7 insertions, 15 deletions
```diff
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 3d01e59fef7..1502619b9db 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -835,29 +835,21 @@ void lower_to_hw_instr(Program* program)
             Definition def_temp1(tmp1.physReg(), v1);
             Definition def_temp2(tmp2.physReg(), v1);
 
-            /* Save EXEC and clear it */
-            bld.sop1(aco_opcode::s_and_saveexec_b64, instr->definitions[1], instr->definitions[2],
-                     Definition(exec, s2), Operand(0u), Operand(exec, s2));
+            /* Save EXEC and set it for all lanes */
+            bld.sop1(aco_opcode::s_or_saveexec_b64, instr->definitions[1], instr->definitions[2],
+                     Definition(exec, s2), Operand((uint64_t)-1), Operand(exec, s2));
 
-            /* Set EXEC to enable HI lanes only */
-            bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand((uint32_t)-1));
             /* HI: Copy data from high lanes 32-63 to shared vgpr */
-            bld.vop1(aco_opcode::v_mov_b32, shared_vgpr_hi, input_data);
+            bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_hi, input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
 
-            /* Invert EXEC to enable LO lanes only */
-            bld.sop1(aco_opcode::s_not_b64, Definition(exec, s2), Operand(exec, s2));
             /* LO: Copy data from low lanes 0-31 to shared vgpr */
-            bld.vop1(aco_opcode::v_mov_b32, shared_vgpr_lo, input_data);
+            bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_lo, input_data, dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
             /* LO: Copy shared vgpr (high lanes' data) to output vgpr */
-            bld.vop1(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_hi, v1));
+            bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
 
-            /* Invert EXEC to enable HI lanes only */
-            bld.sop1(aco_opcode::s_not_b64, Definition(exec, s2), Operand(exec, s2));
             /* HI: Copy shared vgpr (low lanes' data) to output vgpr */
-            bld.vop1(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_lo, v1));
+            bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
 
-            /* Enable exec mask for all lanes */
-            bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand((uint32_t)-1));
             /* Permute the original input */
             bld.ds(aco_opcode::ds_bpermute_b32, def_temp2, index_x4, input_data);
             /* Permute the swapped input */
```