diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/amd/compiler/aco_assembler.cpp | 6 | ||||
-rw-r--r-- | src/amd/compiler/aco_insert_NOPs.cpp | 147 |
2 files changed, 117 insertions, 36 deletions
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 76361241e4b..d569d5ca25f 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -221,9 +221,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* } case Format::VOP1: { uint32_t encoding = (0b0111111 << 25); - encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; + if (!instr->definitions.empty()) + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; encoding |= opcode << 9; - encoding |= instr->operands[0].physReg().reg; + if (!instr->operands.empty()) + encoding |= instr->operands[0].physReg().reg; out.push_back(encoding); break; } diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index fea1364072e..eb325c940ba 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -28,12 +28,18 @@ namespace aco { namespace { struct NOP_ctx { + enum chip_class chip_class; + unsigned vcc_physical; + + /* pre-GFX10 */ /* just initialize these with something less than max NOPs */ int VALU_wrexec = -10; int VALU_wrvcc = -10; int VALU_wrsgpr = -10; - enum chip_class chip_class; - unsigned vcc_physical; + + /* GFX10 */ + int last_VMEM_since_scalar_write = -1; + NOP_ctx(Program* program) : chip_class(program->chip_class) { vcc_physical = program->config->num_sgprs - 2; } @@ -57,6 +63,45 @@ bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_si (b_reg - a_reg < a_size); } +unsigned handle_SMEM_clause(aco_ptr<Instruction>& instr, int new_idx, + std::vector<aco_ptr<Instruction>>& new_instructions) +{ + //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?) + const bool is_store = instr->definitions.empty(); + for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) { + aco_ptr<Instruction>& pred = new_instructions[pred_idx]; + if (pred->format != Format::SMEM) + break; + + /* Don't allow clauses with store instructions since the clause's + * instructions may use the same address. */ + if (is_store || pred->definitions.empty()) + return 1; + + Definition& instr_def = instr->definitions[0]; + Definition& pred_def = pred->definitions[0]; + + /* ISA reference doesn't say anything about this, but best to be safe */ + if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size())) + return 1; + + for (const Operand& op : pred->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size())) + return 1; + } + for (const Operand& op : instr->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + return 1; + } + } + + return 0; +} + int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& old_instructions, std::vector<aco_ptr<Instruction>>& new_instructions) @@ -68,37 +113,7 @@ int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr, /* break off from prevous SMEM clause if needed */ if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) { - const bool is_store = instr->definitions.empty(); - for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) { - aco_ptr<Instruction>& pred = new_instructions[pred_idx]; - if (pred->format != Format::SMEM) - break; - - /* Don't allow clauses with store instructions since the clause's - * instructions may use the same address. */ - if (is_store || pred->definitions.empty()) - return 1; - - Definition& instr_def = instr->definitions[0]; - Definition& pred_def = pred->definitions[0]; - - /* ISA reference doesn't say anything about this, but best to be safe */ - if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size())) - return 1; - - for (const Operand& op : pred->operands) { - if (op.isConstant() || !op.isFixed()) - continue; - if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size())) - return 1; - } - for (const Operand& op : instr->operands) { - if (op.isConstant() || !op.isFixed()) - continue; - if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) - return 1; - } - } + return handle_SMEM_clause(instr, new_idx, new_instructions); } else if (instr->isVALU() || instr->format == Format::VINTRP) { int NOPs = 0; @@ -239,6 +254,38 @@ int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr, return 0; } +std::pair<int, int> handle_instruction_gfx10(NOP_ctx& ctx, aco_ptr<Instruction>& instr, + std::vector<aco_ptr<Instruction>>& old_instructions, + std::vector<aco_ptr<Instruction>>& new_instructions) +{ + int new_idx = new_instructions.size(); + unsigned vNOPs = 0; + unsigned sNOPs = 0; + + /* break off from prevous SMEM group ("clause" seems to mean something different in RDNA) if needed */ + if (instr->format == Format::SMEM) + sNOPs = std::max(sNOPs, handle_SMEM_clause(instr, new_idx, new_instructions)); + + /* handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between */ + if (instr->isSALU() || instr->format == Format::SMEM) { + if (!instr->definitions.empty() && ctx.last_VMEM_since_scalar_write != -1) { + ctx.last_VMEM_since_scalar_write = -1; + vNOPs = 1; + } + } else if (instr->isVMEM() || instr->isFlatOrGlobal()) { + ctx.last_VMEM_since_scalar_write = new_idx; + } else if (instr->opcode == aco_opcode::s_waitcnt) { + uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm; + unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); + if (vmcnt == 0) + ctx.last_VMEM_since_scalar_write = -1; + } else if (instr->isVALU()) { + ctx.last_VMEM_since_scalar_write = -1; + } + + return std::make_pair(sNOPs, vNOPs); +} + void handle_block(NOP_ctx& ctx, Block& block) { @@ -265,17 +312,49 @@ void handle_block(NOP_ctx& ctx, Block& block) block.instructions = std::move(instructions); } +void handle_block_gfx10(NOP_ctx& ctx, Block& block) +{ + std::vector<aco_ptr<Instruction>> instructions; + instructions.reserve(block.instructions.size()); + for (unsigned i = 0; i < block.instructions.size(); i++) { + aco_ptr<Instruction>& instr = block.instructions[i]; + std::pair<int, int> NOPs = handle_instruction_gfx10(ctx, instr, block.instructions, instructions); + for (int i = 0; i < NOPs.second; i++) { + // TODO: try to move the instruction down + /* create NOP */ + aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)}; + instructions.emplace_back(std::move(nop)); + } + if (NOPs.first) { + // TODO: try to move the instruction down + /* create NOP */ + aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)}; + nop->imm = NOPs.first - 1; + nop->block = -1; + instructions.emplace_back(std::move(nop)); + } + + instructions.emplace_back(std::move(instr)); + } + + block.instructions = std::move(instructions); +} + } /* end namespace */ void insert_NOPs(Program* program) { NOP_ctx ctx(program); + for (Block& block : program->blocks) { if (block.instructions.empty()) continue; - handle_block(ctx, block); + if (ctx.chip_class >= GFX10) + handle_block_gfx10(ctx, block); + else + handle_block(ctx, block); } } |